From 5a610f4dbadd3c8f1f667c8ea1e49d2d757a1fa1 Mon Sep 17 00:00:00 2001
From: Micah Elizabeth Scott <micah@scanlime.org>
Date: Mon, 22 Jul 2013 20:57:53 -0700
Subject: [PATCH] Optimize and debug dithering and clamping code

---
 firmware/Makefile      |  2 +-
 firmware/fadecandy.cpp | 38 +++++++++++++++++++++++++++-----------
 firmware/fc_usb.h      |  4 ++--
 3 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/firmware/Makefile b/firmware/Makefile
index bfe6eb8..9c54d50 100644
--- a/firmware/Makefile
+++ b/firmware/Makefile
@@ -36,7 +36,7 @@ CPP_FILES = \
 INCLUDES = -I.
 
 # CPPFLAGS = compiler options for C and C++
-CPPFLAGS = -Wall -g -Os -mcpu=cortex-m4 -mthumb -nostdlib -MMD $(OPTIONS) $(INCLUDES)
+CPPFLAGS = -Wall -Wno-sign-compare -Wno-strict-aliasing -g -Os -mcpu=cortex-m4 -mthumb -nostdlib -MMD $(OPTIONS) $(INCLUDES)
 
 # compiler options for C++ only
 CXXFLAGS = -std=gnu++0x -felide-constructors -fno-exceptions -fno-rtti
diff --git a/firmware/fadecandy.cpp b/firmware/fadecandy.cpp
index c896fbd..7638bbe 100644
--- a/firmware/fadecandy.cpp
+++ b/firmware/fadecandy.cpp
@@ -24,6 +24,7 @@
 #include <math.h>
 #include <algorithm>
 #include "OctoWS2811z.h"
+#include "arm_math.h"
 #include "fc_usb.h"
 #include "fc_defs.h"
 
@@ -38,7 +39,7 @@ static OctoWS2811z leds(LEDS_PER_STRIP, ledBuffer, WS2811_800kHz);
 static int8_t residual[CHANNELS_TOTAL];
 
 
-static uint32_t ALWAYS_INLINE lutInterpolate(const uint16_t *lut, uint32_t arg)
+ALWAYS_INLINE static inline uint32_t lutInterpolate(const uint16_t *lut, uint32_t arg)
 {
     /*
      * Using our color LUT for the indicated channel, convert the
@@ -53,7 +54,7 @@ static uint32_t ALWAYS_INLINE lutInterpolate(const uint16_t *lut, uint32_t arg)
     return (lut[index] * invAlpha + lut[index + 1] * alpha) >> 8;
 }
 
-static uint32_t ALWAYS_INLINE updatePixel(uint32_t icPrev, uint32_t icNext, unsigned n)
+ALWAYS_INLINE static inline uint32_t updatePixel(uint32_t icPrev, uint32_t icNext, unsigned n)
 {
     /*
      * Update pipeline for one pixel:
@@ -67,9 +68,9 @@ static uint32_t ALWAYS_INLINE updatePixel(uint32_t icPrev, uint32_t icNext, unsi
     const uint8_t *pixelNext = buffers.fbNext->pixel(n);
 
     // Per-channel linear interpolation and conversion to 16-bit color.
-    uint32_t iR = (pixelPrev[0] * icPrev + pixelNext[0] * icNext) >> 16;
-    uint32_t iG = (pixelPrev[1] * icPrev + pixelNext[1] * icNext) >> 16;
-    uint32_t iB = (pixelPrev[2] * icPrev + pixelNext[2] * icNext) >> 16;
+    int iR = (pixelPrev[0] * icPrev + pixelNext[0] * icNext) >> 16;
+    int iG = (pixelPrev[1] * icPrev + pixelNext[1] * icNext) >> 16;
+    int iB = (pixelPrev[2] * icPrev + pixelNext[2] * icNext) >> 16;
 
     // Pass through our color LUT
     iR = lutInterpolate(&buffers.lutCurrent[0 * 256], iR);
@@ -84,12 +85,27 @@ static uint32_t ALWAYS_INLINE updatePixel(uint32_t icPrev, uint32_t icNext, unsi
     iG += pResidual[1];
     iB += pResidual[2];
 
-    // Round to the nearest 8-bit value
-    int r8 = std::min<int>(0xff, std::max<int>(0, (iR + 0x80) >> 8));
-    int g8 = std::min<int>(0xff, std::max<int>(0, (iG + 0x80) >> 8));
-    int b8 = std::min<int>(0xff, std::max<int>(0, (iB + 0x80) >> 8));
+    /*
+     * Round to the nearest 8-bit value. Clamping is necessary!
+     * This value might be as low as -128 prior to adding 0x80
+     * for rounding. After this addition, the result is guaranteed
+     * to be >= 0, but it may be over 0xffff.
+     *
+     * This rules out clamping using the UQADD16 instruction,
+     * since the addition itself needs to allow overflow. Instead,
+     * we clamp using a separate USAT instruction.
+     */
+
+    int r8 = __USAT(iR + 0x80, 16) >> 8;
+    int g8 = __USAT(iG + 0x80, 16) >> 8;
+    int b8 = __USAT(iB + 0x80, 16) >> 8;
+
+    /*
+     * Compute the error, after expanding the 8-bit value back to 16-bit.
+     * Clamping (e.g. via __SSAT) is not necessary, since the error will not
+     * be greater than +/- 127.
+     */
 
-    // Compute the error, after expanding the 8-bit value back to 16-bit.
     pResidual[0] = iR - (r8 * 257);
     pResidual[1] = iG - (g8 * 257);
     pResidual[2] = iB - (b8 * 257);
@@ -353,7 +369,7 @@ extern "C" int main()
     while (1) {
         buffers.handleUSB();
 
-        updateDrawBuffer((millis() << 6) & 0xFFFF);
+        updateDrawBuffer((millis() << 4) & 0xFFFF);
         leds.show();
     }
 }
diff --git a/firmware/fc_usb.h b/firmware/fc_usb.h
index 72bcfcc..ba9a9d3 100644
--- a/firmware/fc_usb.h
+++ b/firmware/fc_usb.h
@@ -68,7 +68,7 @@ struct fcPacketBuffer
 
 struct fcFramebuffer : public fcPacketBuffer<PACKETS_PER_FRAME>
 {
-    const uint8_t* ALWAYS_INLINE pixel(unsigned index)
+    ALWAYS_INLINE const uint8_t* pixel(unsigned index)
     {
         return &packets[index / PIXELS_PER_PACKET]->buf[1 + (index % PIXELS_PER_PACKET) * 3];
     }
@@ -81,7 +81,7 @@ struct fcFramebuffer : public fcPacketBuffer<PACKETS_PER_FRAME>
 
 struct fcColorLUT : public fcPacketBuffer<PACKETS_PER_LUT>
 {
-    const unsigned ALWAYS_INLINE entry(unsigned index)
+    ALWAYS_INLINE const unsigned entry(unsigned index)
     {
         const uint8_t *p = &packets[index / LUTENTRIES_PER_PACKET]->buf[2 + (index % LUTENTRIES_PER_PACKET) * 2];
         return *(uint16_t*)p;
-- 
GitLab