From 7017969f6bd92b8ffecc05fa6f9ec1ed24c78f9b Mon Sep 17 00:00:00 2001
From: Micah Elizabeth Scott <micah@scanlime.org>
Date: Tue, 23 Jul 2013 15:30:44 -0700
Subject: [PATCH] Revert "Framebuffer iterator scheme, to avoid division"

This reverts commit 07106959f6340912fc47c94ee40cc072479ec44e.
---
 firmware/fadecandy.cpp | 79 ++++++++++--------------------------------
 firmware/fc_usb.h      | 40 ++-------------------
 2 files changed, 21 insertions(+), 98 deletions(-)

diff --git a/firmware/fadecandy.cpp b/firmware/fadecandy.cpp
index 7e7df52..3eaa5ec 100644
--- a/firmware/fadecandy.cpp
+++ b/firmware/fadecandy.cpp
@@ -54,7 +54,7 @@ ALWAYS_INLINE static inline uint32_t lutInterpolate(const uint16_t *lut, uint32_
     return (lut[index] * invAlpha + lut[index + 1] * alpha) >> 8;
 }
 
-static inline uint32_t updatePixel(uint32_t icPrev, uint32_t icNext, fcFramebufferIter iter)
+static inline uint32_t updatePixel(uint32_t icPrev, uint32_t icNext, unsigned n)
 {
     /*
      * Update pipeline for one pixel:
@@ -64,8 +64,8 @@ static inline uint32_t updatePixel(uint32_t icPrev, uint32_t icNext, fcFramebuff
      *    3. Dithering
      */
 
-    const uint8_t *pixelPrev = buffers.fbPrev->pixel(iter);
-    const uint8_t *pixelNext = buffers.fbNext->pixel(iter);
+    const uint8_t *pixelPrev = buffers.fbPrev->pixel(n);
+    const uint8_t *pixelNext = buffers.fbNext->pixel(n);
 
     // Per-channel linear interpolation and conversion to 16-bit color.
     int iR = (pixelPrev[0] * icPrev + pixelNext[0] * icNext) >> 16;
@@ -78,7 +78,7 @@ static inline uint32_t updatePixel(uint32_t icPrev, uint32_t icNext, fcFramebuff
     iB = lutInterpolate(&buffers.lutCurrent[2 * 256], iB);
 
     // Pointer to the residual buffer for this pixel
-    int8_t *pResidual = &residual[iter.component];
+    int8_t *pResidual = &residual[n * 3];
 
     // Incorporate the residual from last frame
     iR += pResidual[0];
@@ -133,30 +133,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
     uint32_t icPrev = 257 * (0x10000 - interpCoefficient);
     uint32_t icNext = 257 * interpCoefficient;
 
-    /*
-     * Iterators for each LED strip. (Avoids division later on).
-     *
-     * We set these to one LED prior to the one we actually want to start with,
-     * since it makes for more efficient register usage later on if we pre-increment
-     * instead of post-incrementing.
-     */
-
-    fcFramebufferIter i0, i1, i2, i3, i4, i5, i6, i7;
-
-    i0.setNegativeOne();
-    i1.set(LEDS_PER_STRIP * 1 - 1);
-    i2.set(LEDS_PER_STRIP * 2 - 1);
-    i3.set(LEDS_PER_STRIP * 3 - 1);
-    i4.set(LEDS_PER_STRIP * 4 - 1);
-    i5.set(LEDS_PER_STRIP * 5 - 1);
-    i6.set(LEDS_PER_STRIP * 6 - 1);
-    i7.set(LEDS_PER_STRIP * 7 - 1);
-
-    /*
-     * Process each LED strip in parallel, and re-pack into a planar DMA buffer.
-     */
-
-    for (int i = 0; i < LEDS_PER_STRIP; ++i, out += 6) {
+    for (int i = 0; i < LEDS_PER_STRIP; ++i) {
 
         // Six output words
         union {
@@ -174,8 +151,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
          * This generates fairly efficient code using the UBFX and BFI instructions.
          */
 
-        i0.next();
-        uint32_t p0 = updatePixel(icPrev, icNext, i0);
+        uint32_t p0 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 0);
 
         o5.p0d = p0;
         o5.p0c = p0 >> 1;
@@ -202,8 +178,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p0b = p0 >> 22;
         o0.p0a = p0 >> 23;
 
-        i1.next();
-        uint32_t p1 = updatePixel(icPrev, icNext, i1);
+        uint32_t p1 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 1);  // LED i of strip 1
 
         o5.p1d = p1;
         o5.p1c = p1 >> 1;
@@ -230,8 +205,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p1b = p1 >> 22;
         o0.p1a = p1 >> 23;
 
-        i2.next();
-        uint32_t p2 = updatePixel(icPrev, icNext, i2);
+        uint32_t p2 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 2);  // LED i of strip 2
 
         o5.p2d = p2;
         o5.p2c = p2 >> 1;
@@ -258,8 +232,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p2b = p2 >> 22;
         o0.p2a = p2 >> 23;
 
-        i3.next();
-        uint32_t p3 = updatePixel(icPrev, icNext, i3);
+        uint32_t p3 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 3);  // LED i of strip 3
 
         o5.p3d = p3;
         o5.p3c = p3 >> 1;
@@ -286,8 +259,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p3b = p3 >> 22;
         o0.p3a = p3 >> 23;
 
-        i4.next();
-        uint32_t p4 = updatePixel(icPrev, icNext, i4);
+        uint32_t p4 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 4);  // LED i of strip 4
 
         o5.p4d = p4;
         o5.p4c = p4 >> 1;
@@ -314,8 +286,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p4b = p4 >> 22;
         o0.p4a = p4 >> 23;
 
-        i5.next();
-        uint32_t p5 = updatePixel(icPrev, icNext, i5);
+        uint32_t p5 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 5);  // LED i of strip 5
 
         o5.p5d = p5;
         o5.p5c = p5 >> 1;
@@ -342,8 +313,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p5b = p5 >> 22;
         o0.p5a = p5 >> 23;
 
-        i6.next();
-        uint32_t p6 = updatePixel(icPrev, icNext, i6);
+        uint32_t p6 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 6);  // LED i of strip 6
 
         o5.p6d = p6;
         o5.p6c = p6 >> 1;
@@ -370,50 +340,39 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p6b = p6 >> 22;
         o0.p6a = p6 >> 23;
 
-        i7.next();
-        uint32_t p7 = updatePixel(icPrev, icNext, i7);
+        uint32_t p7 = updatePixel(icPrev, icNext, i + LEDS_PER_STRIP * 7);  // LED i of strip 7
 
         o5.p7d = p7;
         o5.p7c = p7 >> 1;
         o5.p7b = p7 >> 2;
         o5.p7a = p7 >> 3;
-
-        out[5] = o5.word;
-
         o4.p7d = p7 >> 4;
         o4.p7c = p7 >> 5;
         o4.p7b = p7 >> 6;
         o4.p7a = p7 >> 7;
-
-        out[4] = o4.word;
-
         o3.p7d = p7 >> 8;
         o3.p7c = p7 >> 9;
         o3.p7b = p7 >> 10;
         o3.p7a = p7 >> 11;
-
-        out[3] = o3.word;
-
         o2.p7d = p7 >> 12;
         o2.p7c = p7 >> 13;
         o2.p7b = p7 >> 14;
         o2.p7a = p7 >> 15;
-
-        out[2] = o2.word;
-
         o1.p7d = p7 >> 16;
         o1.p7c = p7 >> 17;
         o1.p7b = p7 >> 18;
         o1.p7a = p7 >> 19;
-
-        out[1] = o1.word;
-
         o0.p7d = p7 >> 20;
         o0.p7c = p7 >> 21;
         o0.p7b = p7 >> 22;
         o0.p7a = p7 >> 23;
 
-        out[0] = o0.word;
+        *(out++) = o0.word;
+        *(out++) = o1.word;
+        *(out++) = o2.word;
+        *(out++) = o3.word;
+        *(out++) = o4.word;
+        *(out++) = o5.word;
     }
 }
 
diff --git a/firmware/fc_usb.h b/firmware/fc_usb.h
index ffe47f5..ba9a9d3 100644
--- a/firmware/fc_usb.h
+++ b/firmware/fc_usb.h
@@ -62,51 +62,15 @@ struct fcPacketBuffer
 };
 
 
-/*
- * Iterator referring to one pixel in the framebuffer
- */
-
-struct fcFramebufferIter
-{
-    unsigned packet;
-    unsigned index;
-    unsigned component;
-
-    ALWAYS_INLINE void setNegativeOne()
-    {
-        // Special case: After one call to next(), this points to LED #0.
-        index = -1;
-        packet = 0;
-        component = -3;
-    }
-
-    ALWAYS_INLINE void set(unsigned led)
-    {
-        packet = led / PIXELS_PER_PACKET;
-        index = 1 + (led % PIXELS_PER_PACKET) * 3;
-        component = led * 3;
-    }
-
-    ALWAYS_INLINE void next()
-    {
-        if (++index >= (1 + PIXELS_PER_PACKET * 3)) {
-            index = 0;
-            packet++;
-        }
-        component += 3;
-    }
-};
-
-
 /*
  * Framebuffer
  */
 
 struct fcFramebuffer : public fcPacketBuffer<PACKETS_PER_FRAME>
 {
-    ALWAYS_INLINE const uint8_t* pixel(fcFramebufferIter iter) const
+    ALWAYS_INLINE const uint8_t* pixel(unsigned index) const  // read-only lookup; stays usable on const framebuffers
     {
-        return &packets[iter.packet]->buf[iter.index];
+        return &packets[index / PIXELS_PER_PACKET]->buf[1 + (index % PIXELS_PER_PACKET) * 3];
     }
 };
 
-- 
GitLab