From f9d80e0e366ac9b04e46a18efd2956069d94c36e Mon Sep 17 00:00:00 2001
From: Micah Elizabeth Scott <micah@scanlime.org>
Date: Sun, 20 Oct 2013 10:49:58 -0700
Subject: [PATCH] firmware: Use multiple variants of the inner loop

Now the firmware config flags actually select between several different inner-loops. This will be useful in the future if we want to support different configurations with runtime config options, and right now it's useful for side-by-side comparisons with the various config options.
---
 firmware/fadecandy.cpp    | 73 +++++++++++++++++++++++++++------
 firmware/fc_draw.cpp      | 18 ++++----
 firmware/fc_pixel.cpp     | 73 +++++++--------------------------
 firmware/fc_pixel_lut.cpp | 86 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 170 insertions(+), 80 deletions(-)
 create mode 100644 firmware/fc_pixel_lut.cpp

diff --git a/firmware/fadecandy.cpp b/firmware/fadecandy.cpp
index c985289..ea05fff 100644
--- a/firmware/fadecandy.cpp
+++ b/firmware/fadecandy.cpp
@@ -48,9 +48,51 @@ static residual_t residual[CHANNELS_TOTAL];
 // Reserved RAM area for signalling entry to bootloader
 extern uint32_t boot_token;
 
-// Low-level drawing code, which we want to compile in the same unit as the main loop
+/*
+ * Low-level drawing code, which we want to compile in the same unit as the main loop.
+ * We compile this multiple times, with different config flags.
+ */
+
+#define FCP_INTERPOLATION   0
+#define FCP_DITHERING       0
+#define FCP_FN(name)        name##_I0_D0
+#include "fc_pixel_lut.cpp"
+#include "fc_pixel.cpp"
+#include "fc_draw.cpp"
+#undef FCP_INTERPOLATION
+#undef FCP_DITHERING
+#undef FCP_FN
+
+#define FCP_INTERPOLATION   1
+#define FCP_DITHERING       0
+#define FCP_FN(name)        name##_I1_D0
+#include "fc_pixel_lut.cpp"
+#include "fc_pixel.cpp"
+#include "fc_draw.cpp"
+#undef FCP_INTERPOLATION
+#undef FCP_DITHERING
+#undef FCP_FN
+
+#define FCP_INTERPOLATION   0
+#define FCP_DITHERING       1
+#define FCP_FN(name)        name##_I0_D1
+#include "fc_pixel_lut.cpp"
 #include "fc_pixel.cpp"
 #include "fc_draw.cpp"
+#undef FCP_INTERPOLATION
+#undef FCP_DITHERING
+#undef FCP_FN
+
+#define FCP_INTERPOLATION   1
+#define FCP_DITHERING       1
+#define FCP_FN(name)        name##_I1_D1
+#include "fc_pixel_lut.cpp"
+#include "fc_pixel.cpp"
+#include "fc_draw.cpp"
+#undef FCP_INTERPOLATION
+#undef FCP_DITHERING
+#undef FCP_FN
+
 
 static inline uint32_t calculateInterpCoefficient()
 {
@@ -67,11 +109,6 @@ static inline uint32_t calculateInterpCoefficient()
      * how long the interpolation between those keyframes should take.
      */
 
-    if (buffers.flags & CFLAG_NO_INTERPOLATION) {
-        // Always use fbNext
-        return 0x10000;
-    }
-
     uint32_t now = millis();
     uint32_t tsPrev = buffers.fbPrev->timestamp;
     uint32_t tsNext = buffers.fbNext->timestamp;
@@ -113,15 +150,27 @@ extern "C" int main()
         watchdog_refresh();
 
         buffers.handleUSB();
-        updateDrawBuffer(calculateInterpCoefficient());
-        leds.show();
 
-        // Optionally disable dithering by clearing our residual buffer every frame.
-        if (buffers.flags & CFLAG_NO_DITHERING) {
-            for (unsigned i = 0; i < CHANNELS_TOTAL; ++i)
-                residual[i] = 0;
+        // Select a different drawing loop based on our firmware config flags
+        switch (buffers.flags & (CFLAG_NO_INTERPOLATION | CFLAG_NO_DITHERING)) {
+            case 0:
+            default:
+                updateDrawBuffer_I1_D1(calculateInterpCoefficient());
+                break;
+            case CFLAG_NO_INTERPOLATION:
+                updateDrawBuffer_I0_D1(0x10000);
+                break;
+            case CFLAG_NO_DITHERING:
+                updateDrawBuffer_I1_D0(calculateInterpCoefficient());
+                break;
+            case CFLAG_NO_INTERPOLATION | CFLAG_NO_DITHERING:
+                updateDrawBuffer_I0_D0(0x10000);
+                break;
         }
 
+        // Start sending the next frame over DMA
+        leds.show();
+
         // Performance counter, for monitoring frame rate externally
         perf_frameCounter++;
     }
diff --git a/firmware/fc_draw.cpp b/firmware/fc_draw.cpp
index 4d120ff..325149f 100644
--- a/firmware/fc_draw.cpp
+++ b/firmware/fc_draw.cpp
@@ -22,7 +22,7 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-static void updateDrawBuffer(unsigned interpCoefficient)
+static void FCP_FN(updateDrawBuffer)(unsigned interpCoefficient)
 {
     /*
      * Update the LED draw buffer. In one step, we do the interpolation,
@@ -77,7 +77,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
          * This generates compact and efficient code using the BFI instruction.
          */
 
-        uint32_t p0 = updatePixel(icPrev, icNext,
+        uint32_t p0 = FCP_FN(updatePixel)(icPrev, icNext,
             buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 0),
             buffers.fbNext->pixel(i + LEDS_PER_STRIP * 0),
             pResidual + LEDS_PER_STRIP * 3 * 0);
@@ -107,7 +107,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p0b = p0 >> 22;
         o0.p0a = p0 >> 23;
 
-        uint32_t p1 = updatePixel(icPrev, icNext,
+        uint32_t p1 = FCP_FN(updatePixel)(icPrev, icNext,
             buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 1),
             buffers.fbNext->pixel(i + LEDS_PER_STRIP * 1),
             pResidual + LEDS_PER_STRIP * 3 * 1);
@@ -137,7 +137,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p1b = p1 >> 22;
         o0.p1a = p1 >> 23;
 
-        uint32_t p2 = updatePixel(icPrev, icNext,
+        uint32_t p2 = FCP_FN(updatePixel)(icPrev, icNext,
             buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 2),
             buffers.fbNext->pixel(i + LEDS_PER_STRIP * 2),
             pResidual + LEDS_PER_STRIP * 3 * 2);
@@ -167,7 +167,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p2b = p2 >> 22;
         o0.p2a = p2 >> 23;
 
-        uint32_t p3 = updatePixel(icPrev, icNext,
+        uint32_t p3 = FCP_FN(updatePixel)(icPrev, icNext,
             buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 3),
             buffers.fbNext->pixel(i + LEDS_PER_STRIP * 3),
             pResidual + LEDS_PER_STRIP * 3 * 3);
@@ -197,7 +197,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p3b = p3 >> 22;
         o0.p3a = p3 >> 23;
 
-        uint32_t p4 = updatePixel(icPrev, icNext,
+        uint32_t p4 = FCP_FN(updatePixel)(icPrev, icNext,
             buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 4),
             buffers.fbNext->pixel(i + LEDS_PER_STRIP * 4),
             pResidual + LEDS_PER_STRIP * 3 * 4);
@@ -227,7 +227,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p4b = p4 >> 22;
         o0.p4a = p4 >> 23;
 
-        uint32_t p5 = updatePixel(icPrev, icNext,
+        uint32_t p5 = FCP_FN(updatePixel)(icPrev, icNext,
             buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 5),
             buffers.fbNext->pixel(i + LEDS_PER_STRIP * 5),
             pResidual + LEDS_PER_STRIP * 3 * 5);
@@ -257,7 +257,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p5b = p5 >> 22;
         o0.p5a = p5 >> 23;
 
-        uint32_t p6 = updatePixel(icPrev, icNext,
+        uint32_t p6 = FCP_FN(updatePixel)(icPrev, icNext,
             buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 6),
             buffers.fbNext->pixel(i + LEDS_PER_STRIP * 6),
             pResidual + LEDS_PER_STRIP * 3 * 6);
@@ -287,7 +287,7 @@ static void updateDrawBuffer(unsigned interpCoefficient)
         o0.p6b = p6 >> 22;
         o0.p6a = p6 >> 23;
 
-        uint32_t p7 = updatePixel(icPrev, icNext,
+        uint32_t p7 = FCP_FN(updatePixel)(icPrev, icNext,
             buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 7),
             buffers.fbNext->pixel(i + LEDS_PER_STRIP * 7),
             pResidual + LEDS_PER_STRIP * 3 * 7);
diff --git a/firmware/fc_pixel.cpp b/firmware/fc_pixel.cpp
index 12b2f8c..209b2f3 100644
--- a/firmware/fc_pixel.cpp
+++ b/firmware/fc_pixel.cpp
@@ -22,62 +22,7 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-
-ALWAYS_INLINE static inline uint32_t lutInterpolate(const uint16_t *lut, uint32_t arg)
-{
-    /*
-     * Using our color LUT for the indicated channel, convert the
-     * 16-bit intensity "arg" in our input colorspace to a corresponding
-     * 16-bit intensity in the device colorspace.
-     *
-     * Remember that our LUT is 257 entries long. The final entry corresponds to an
-     * input of 0x10000, which can't quite be reached.
-     *
-     * 'arg' is in the range [0, 0xFFFF]
-     *
-     * This operation is equivalent to the following:
-     *
-     *      unsigned index = arg >> 8;          // Range [0, 0xFF]
-     *      unsigned alpha = arg & 0xFF;        // Range [0, 0xFF]
-     *      unsigned invAlpha = 0x100 - alpha;  // Range [1, 0x100]
-     *
-     *      // Result in range [0, 0xFFFF]
-     *      return (lut[index] * invAlpha + lut[index + 1] * alpha) >> 8;
-     *
-     * This is easy to understand, but it turns out to be a serious bottleneck
-     * in terms of speed and memory bandwidth, as well as register pressure that
-     * affects the compilation of updatePixel().
-     *
-     * To speed this up, we try and do the lut[index] and lut[index+1] portions
-     * in parallel using the SMUAD instruction. This is a pair of 16x16 multiplies,
-     * and the results are added together. We can combine this with an unaligned load
-     * to grab two adjacent entries from the LUT. The remaining complications are:
-     *
-     *   1. We wanted unsigned, not signed
-     *   2. We still need to generate the input values efficiently.
-     *
-     * (1) is easy to solve if we're okay with 15-bit precision for the LUT instead
-     * of 16-bit, which is fine. During LUT preparation, we right-shift each entry
-     * by 1, keeping them within the positive range of a signed 16-bit int. 
-     *
-     * For (2), we need to quickly put 'alpha' in the high halfword and invAlpha in
-     * the low halfword, or vice versa. One fast way to do this is (0x01000000 + x - (x << 16).
-     */
-
-    uint32_t index = arg >> 8;          // Range [0, 0xFF]
-
-    // Load lut[index] into low halfword, lut[index+1] into high halfword.
-    uint32_t pair = *(const uint32_t*)(lut + index);
-
-    unsigned alpha = arg & 0xFF;        // Range [0, 0xFF]
-
-    // Reversed halfword order
-    uint32_t pairAlpha = (0x01000000 + alpha - (alpha << 16));
-
-    return __SMUADX(pairAlpha, pair) >> 7;
-}
-
-static uint32_t updatePixel(uint32_t icPrev, uint32_t icNext,
+static uint32_t FCP_FN(updatePixel)(uint32_t icPrev, uint32_t icNext,
     const uint8_t *pixelPrev, const uint8_t *pixelNext, residual_t *pResidual)
 {
     /*
@@ -92,22 +37,30 @@ static uint32_t updatePixel(uint32_t icPrev, uint32_t icNext,
      * icPrev + icNext = 0x1010000
      */
 
+#if FCP_INTERPOLATION
     // Per-channel linear interpolation and conversion to 16-bit color.
     // Result range: [0, 0xFFFF] 
     int iR = (pixelPrev[0] * icPrev + pixelNext[0] * icNext) >> 16;
     int iG = (pixelPrev[1] * icPrev + pixelNext[1] * icNext) >> 16;
     int iB = (pixelPrev[2] * icPrev + pixelNext[2] * icNext) >> 16;
+#else
+    int iR = pixelNext[0] * 0x101;
+    int iG = pixelNext[1] * 0x101;
+    int iB = pixelNext[2] * 0x101;
+#endif
 
     // Pass through our color LUT
     // Result range: [0, 0xFFFF] 
-    iR = lutInterpolate(buffers.lutCurrent.r, iR);
-    iG = lutInterpolate(buffers.lutCurrent.g, iG);
-    iB = lutInterpolate(buffers.lutCurrent.b, iB);
+    iR = FCP_FN(lutInterpolate)(buffers.lutCurrent.r, iR);
+    iG = FCP_FN(lutInterpolate)(buffers.lutCurrent.g, iG);
+    iB = FCP_FN(lutInterpolate)(buffers.lutCurrent.b, iB);
 
+#if FCP_DITHERING
     // Incorporate the residual from last frame
     iR += pResidual[0];
     iG += pResidual[1];
     iB += pResidual[2];
+#endif
 
     /*
      * Round to the nearest 8-bit value. Clamping is necessary!
@@ -124,10 +77,12 @@ static uint32_t updatePixel(uint32_t icPrev, uint32_t icNext,
     int g8 = __USAT(iG + 0x80, 16) >> 8;
     int b8 = __USAT(iB + 0x80, 16) >> 8;
 
+#if FCP_DITHERING
     // Compute the error, after expanding the 8-bit value back to 16-bit.
     pResidual[0] = iR - (r8 * 257);
     pResidual[1] = iG - (g8 * 257);
     pResidual[2] = iB - (b8 * 257);
+#endif
 
     // Pack the result, in GRB order.
     return (g8 << 16) | (r8 << 8) | b8;
diff --git a/firmware/fc_pixel_lut.cpp b/firmware/fc_pixel_lut.cpp
new file mode 100644
index 0000000..daa260f
--- /dev/null
+++ b/firmware/fc_pixel_lut.cpp
@@ -0,0 +1,86 @@
+/*
+ * Fadecandy Firmware: Low-level per-pixel LUT code
+ * (Included into fadecandy.cpp)
+ * 
+ * Copyright (c) 2013 Micah Elizabeth Scott
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ * the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+ALWAYS_INLINE static inline
+uint32_t FCP_FN(lutInterpolate)(const uint16_t *lut, uint32_t arg)
+{
+    /*
+     * Using our color LUT for the indicated channel, convert the
+     * 16-bit intensity "arg" in our input colorspace to a corresponding
+     * 16-bit intensity in the device colorspace.
+     *
+     * Remember that our LUT is 257 entries long. The final entry corresponds to an
+     * input of 0x10000, which can't quite be reached.
+     *
+     * 'arg' is in the range [0, 0xFFFF]
+     *
+     * This operation is equivalent to the following:
+     *
+     *      unsigned index = arg >> 8;          // Range [0, 0xFF]
+     *      unsigned alpha = arg & 0xFF;        // Range [0, 0xFF]
+     *      unsigned invAlpha = 0x100 - alpha;  // Range [1, 0x100]
+     *
+     *      // Result in range [0, 0xFFFF]
+     *      return (lut[index] * invAlpha + lut[index + 1] * alpha) >> 8;
+     *
+     * This is easy to understand, but it turns out to be a serious bottleneck
+     * in terms of speed and memory bandwidth, as well as register pressure that
+     * affects the compilation of updatePixel().
+     *
+     * To speed this up, we try and do the lut[index] and lut[index+1] portions
+     * in parallel using the SMUAD instruction. This is a pair of 16x16 multiplies,
+     * and the results are added together. We can combine this with an unaligned load
+     * to grab two adjacent entries from the LUT. The remaining complications are:
+     *
+     *   1. We wanted unsigned, not signed
+     *   2. We still need to generate the input values efficiently.
+     *
+     * (1) is easy to solve if we're okay with 15-bit precision for the LUT instead
+     * of 16-bit, which is fine. During LUT preparation, we right-shift each entry
+     * by 1, keeping them within the positive range of a signed 16-bit int. 
+     *
+     * For (2), we need to quickly put 'alpha' in the high halfword and invAlpha in
+     * the low halfword, or vice versa. One fast way to do this is (0x01000000 + x - (x << 16).
+     */
+
+#if FCP_INTERPOLATION
+
+    uint32_t index = arg >> 8;          // Range [0, 0xFF]
+
+    // Load lut[index] into low halfword, lut[index+1] into high halfword.
+    uint32_t pair = *(const uint32_t*)(lut + index);
+
+    unsigned alpha = arg & 0xFF;        // Range [0, 0xFF]
+
+    // Reversed halfword order
+    uint32_t pairAlpha = (0x01000000 + alpha - (alpha << 16));
+
+    return __SMUADX(pairAlpha, pair) >> 7;
+
+#else
+    // Simpler non-interpolated version
+    return lut[arg >> 8] << 1;
+#endif
+}
-- 
GitLab