From f9d80e0e366ac9b04e46a18efd2956069d94c36e Mon Sep 17 00:00:00 2001 From: Micah Elizabeth Scott <micah@scanlime.org> Date: Sun, 20 Oct 2013 10:49:58 -0700 Subject: [PATCH] firmware: Use multiple variants of the inner loop Now the firmware config flags actually select between several different inner-loops. This will be useful in the future if we want to support different configurations with runtime config options, and right now it's useful for side-by-side comparisons with the various config options. --- firmware/fadecandy.cpp | 73 +++++++++++++++++++++++++++------ firmware/fc_draw.cpp | 18 ++++---- firmware/fc_pixel.cpp | 73 +++++++-------------------------- firmware/fc_pixel_lut.cpp | 86 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 170 insertions(+), 80 deletions(-) create mode 100644 firmware/fc_pixel_lut.cpp diff --git a/firmware/fadecandy.cpp b/firmware/fadecandy.cpp index c985289..ea05fff 100644 --- a/firmware/fadecandy.cpp +++ b/firmware/fadecandy.cpp @@ -48,9 +48,51 @@ static residual_t residual[CHANNELS_TOTAL]; // Reserved RAM area for signalling entry to bootloader extern uint32_t boot_token; -// Low-level drawing code, which we want to compile in the same unit as the main loop +/* + * Low-level drawing code, which we want to compile in the same unit as the main loop. + * We compile this multiple times, with different config flags. + */ + +#define FCP_INTERPOLATION 0 +#define FCP_DITHERING 0 +#define FCP_FN(name) name##_I0_D0 +#include "fc_pixel_lut.cpp" +#include "fc_pixel.cpp" +#include "fc_draw.cpp" +#undef FCP_INTERPOLATION +#undef FCP_DITHERING +#undef FCP_FN + +#define FCP_INTERPOLATION 1 +#define FCP_DITHERING 0 +#define FCP_FN(name) name##_I1_D0 +#include "fc_pixel_lut.cpp" +#include "fc_pixel.cpp" +#include "fc_draw.cpp" +#undef FCP_INTERPOLATION +#undef FCP_DITHERING +#undef FCP_FN + +#define FCP_INTERPOLATION 0 +#define FCP_DITHERING 1 +#define FCP_FN(name) name##_I0_D1 +#include "fc_pixel_lut.cpp" #include "fc_pixel.cpp" #include "fc_draw.cpp" +#undef FCP_INTERPOLATION +#undef FCP_DITHERING +#undef FCP_FN + +#define FCP_INTERPOLATION 1 +#define FCP_DITHERING 1 +#define FCP_FN(name) name##_I1_D1 +#include "fc_pixel_lut.cpp" +#include "fc_pixel.cpp" +#include "fc_draw.cpp" +#undef FCP_INTERPOLATION +#undef FCP_DITHERING +#undef FCP_FN + static inline uint32_t calculateInterpCoefficient() { @@ -67,11 +109,6 @@ static inline uint32_t calculateInterpCoefficient() * how long the interpolation between those keyframes should take. */ - if (buffers.flags & CFLAG_NO_INTERPOLATION) { - // Always use fbNext - return 0x10000; - } - uint32_t now = millis(); uint32_t tsPrev = buffers.fbPrev->timestamp; uint32_t tsNext = buffers.fbNext->timestamp; @@ -113,15 +150,27 @@ extern "C" int main() watchdog_refresh(); buffers.handleUSB(); - updateDrawBuffer(calculateInterpCoefficient()); - leds.show(); - // Optionally disable dithering by clearing our residual buffer every frame. - if (buffers.flags & CFLAG_NO_DITHERING) { - for (unsigned i = 0; i < CHANNELS_TOTAL; ++i) - residual[i] = 0; + // Select a different drawing loop based on our firmware config flags + switch (buffers.flags & (CFLAG_NO_INTERPOLATION | CFLAG_NO_DITHERING)) { + case 0: + default: + updateDrawBuffer_I1_D1(calculateInterpCoefficient()); + break; + case CFLAG_NO_INTERPOLATION: + updateDrawBuffer_I0_D1(0x10000); + break; + case CFLAG_NO_DITHERING: + updateDrawBuffer_I1_D0(calculateInterpCoefficient()); + break; + case CFLAG_NO_INTERPOLATION | CFLAG_NO_DITHERING: + updateDrawBuffer_I0_D0(0x10000); + break; } + // Start sending the next frame over DMA + leds.show(); + // Performance counter, for monitoring frame rate externally perf_frameCounter++; } diff --git a/firmware/fc_draw.cpp b/firmware/fc_draw.cpp index 4d120ff..325149f 100644 --- a/firmware/fc_draw.cpp +++ b/firmware/fc_draw.cpp @@ -22,7 +22,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -static void updateDrawBuffer(unsigned interpCoefficient) +static void FCP_FN(updateDrawBuffer)(unsigned interpCoefficient) { /* * Update the LED draw buffer. In one step, we do the interpolation, @@ -77,7 +77,7 @@ static void updateDrawBuffer(unsigned interpCoefficient) * This generates compact and efficient code using the BFI instruction. */ - uint32_t p0 = updatePixel(icPrev, icNext, + uint32_t p0 = FCP_FN(updatePixel)(icPrev, icNext, buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 0), buffers.fbNext->pixel(i + LEDS_PER_STRIP * 0), pResidual + LEDS_PER_STRIP * 3 * 0); @@ -107,7 +107,7 @@ static void updateDrawBuffer(unsigned interpCoefficient) o0.p0b = p0 >> 22; o0.p0a = p0 >> 23; - uint32_t p1 = updatePixel(icPrev, icNext, + uint32_t p1 = FCP_FN(updatePixel)(icPrev, icNext, buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 1), buffers.fbNext->pixel(i + LEDS_PER_STRIP * 1), pResidual + LEDS_PER_STRIP * 3 * 1); @@ -137,7 +137,7 @@ static void updateDrawBuffer(unsigned interpCoefficient) o0.p1b = p1 >> 22; o0.p1a = p1 >> 23; - uint32_t p2 = updatePixel(icPrev, icNext, + uint32_t p2 = FCP_FN(updatePixel)(icPrev, icNext, buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 2), buffers.fbNext->pixel(i + LEDS_PER_STRIP * 2), pResidual + LEDS_PER_STRIP * 3 * 2); @@ -167,7 +167,7 @@ static void updateDrawBuffer(unsigned interpCoefficient) o0.p2b = p2 >> 22; o0.p2a = p2 >> 23; - uint32_t p3 = updatePixel(icPrev, icNext, + uint32_t p3 = FCP_FN(updatePixel)(icPrev, icNext, buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 3), buffers.fbNext->pixel(i + LEDS_PER_STRIP * 3), pResidual + LEDS_PER_STRIP * 3 * 3); @@ -197,7 +197,7 @@ static void updateDrawBuffer(unsigned interpCoefficient) o0.p3b = p3 >> 22; o0.p3a = p3 >> 23; - uint32_t p4 = updatePixel(icPrev, icNext, + uint32_t p4 = FCP_FN(updatePixel)(icPrev, icNext, buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 4), buffers.fbNext->pixel(i + LEDS_PER_STRIP * 4), pResidual + LEDS_PER_STRIP * 3 * 4); @@ -227,7 +227,7 @@ static void updateDrawBuffer(unsigned interpCoefficient) o0.p4b = p4 >> 22; o0.p4a = p4 >> 23; - uint32_t p5 = updatePixel(icPrev, icNext, + uint32_t p5 = FCP_FN(updatePixel)(icPrev, icNext, buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 5), buffers.fbNext->pixel(i + LEDS_PER_STRIP * 5), pResidual + LEDS_PER_STRIP * 3 * 5); @@ -257,7 +257,7 @@ static void updateDrawBuffer(unsigned interpCoefficient) o0.p5b = p5 >> 22; o0.p5a = p5 >> 23; - uint32_t p6 = updatePixel(icPrev, icNext, + uint32_t p6 = FCP_FN(updatePixel)(icPrev, icNext, buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 6), buffers.fbNext->pixel(i + LEDS_PER_STRIP * 6), pResidual + LEDS_PER_STRIP * 3 * 6); @@ -287,7 +287,7 @@ static void updateDrawBuffer(unsigned interpCoefficient) o0.p6b = p6 >> 22; o0.p6a = p6 >> 23; - uint32_t p7 = updatePixel(icPrev, icNext, + uint32_t p7 = FCP_FN(updatePixel)(icPrev, icNext, buffers.fbPrev->pixel(i + LEDS_PER_STRIP * 7), buffers.fbNext->pixel(i + LEDS_PER_STRIP * 7), pResidual + LEDS_PER_STRIP * 3 * 7); diff --git a/firmware/fc_pixel.cpp b/firmware/fc_pixel.cpp index 12b2f8c..209b2f3 100644 --- a/firmware/fc_pixel.cpp +++ b/firmware/fc_pixel.cpp @@ -22,62 +22,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ - -ALWAYS_INLINE static inline uint32_t lutInterpolate(const uint16_t *lut, uint32_t arg) -{ - /* - * Using our color LUT for the indicated channel, convert the - * 16-bit intensity "arg" in our input colorspace to a corresponding - * 16-bit intensity in the device colorspace. - * - * Remember that our LUT is 257 entries long. The final entry corresponds to an - * input of 0x10000, which can't quite be reached. - * - * 'arg' is in the range [0, 0xFFFF] - * - * This operation is equivalent to the following: - * - * unsigned index = arg >> 8; // Range [0, 0xFF] - * unsigned alpha = arg & 0xFF; // Range [0, 0xFF] - * unsigned invAlpha = 0x100 - alpha; // Range [1, 0x100] - * - * // Result in range [0, 0xFFFF] - * return (lut[index] * invAlpha + lut[index + 1] * alpha) >> 8; - * - * This is easy to understand, but it turns out to be a serious bottleneck - * in terms of speed and memory bandwidth, as well as register pressure that - * affects the compilation of updatePixel(). - * - * To speed this up, we try and do the lut[index] and lut[index+1] portions - * in parallel using the SMUAD instruction. This is a pair of 16x16 multiplies, - * and the results are added together. We can combine this with an unaligned load - * to grab two adjacent entries from the LUT. The remaining complications are: - * - * 1. We wanted unsigned, not signed - * 2. We still need to generate the input values efficiently. - * - * (1) is easy to solve if we're okay with 15-bit precision for the LUT instead - * of 16-bit, which is fine. During LUT preparation, we right-shift each entry - * by 1, keeping them within the positive range of a signed 16-bit int. - * - * For (2), we need to quickly put 'alpha' in the high halfword and invAlpha in - * the low halfword, or vice versa. One fast way to do this is (0x01000000 + x - (x << 16). - */ - - uint32_t index = arg >> 8; // Range [0, 0xFF] - - // Load lut[index] into low halfword, lut[index+1] into high halfword. - uint32_t pair = *(const uint32_t*)(lut + index); - - unsigned alpha = arg & 0xFF; // Range [0, 0xFF] - - // Reversed halfword order - uint32_t pairAlpha = (0x01000000 + alpha - (alpha << 16)); - - return __SMUADX(pairAlpha, pair) >> 7; -} - -static uint32_t updatePixel(uint32_t icPrev, uint32_t icNext, +static uint32_t FCP_FN(updatePixel)(uint32_t icPrev, uint32_t icNext, const uint8_t *pixelPrev, const uint8_t *pixelNext, residual_t *pResidual) { /* @@ -92,22 +37,30 @@ static uint32_t updatePixel(uint32_t icPrev, uint32_t icNext, * icPrev + icNext = 0x1010000 */ +#if FCP_INTERPOLATION // Per-channel linear interpolation and conversion to 16-bit color. // Result range: [0, 0xFFFF] int iR = (pixelPrev[0] * icPrev + pixelNext[0] * icNext) >> 16; int iG = (pixelPrev[1] * icPrev + pixelNext[1] * icNext) >> 16; int iB = (pixelPrev[2] * icPrev + pixelNext[2] * icNext) >> 16; +#else + int iR = pixelNext[0] * 0x101; + int iG = pixelNext[1] * 0x101; + int iB = pixelNext[2] * 0x101; +#endif // Pass through our color LUT // Result range: [0, 0xFFFF] - iR = lutInterpolate(buffers.lutCurrent.r, iR); - iG = lutInterpolate(buffers.lutCurrent.g, iG); - iB = lutInterpolate(buffers.lutCurrent.b, iB); + iR = FCP_FN(lutInterpolate)(buffers.lutCurrent.r, iR); + iG = FCP_FN(lutInterpolate)(buffers.lutCurrent.g, iG); + iB = FCP_FN(lutInterpolate)(buffers.lutCurrent.b, iB); +#if FCP_DITHERING // Incorporate the residual from last frame iR += pResidual[0]; iG += pResidual[1]; iB += pResidual[2]; +#endif /* * Round to the nearest 8-bit value. Clamping is necessary! @@ -124,10 +77,12 @@ static uint32_t updatePixel(uint32_t icPrev, uint32_t icNext, int g8 = __USAT(iG + 0x80, 16) >> 8; int b8 = __USAT(iB + 0x80, 16) >> 8; +#if FCP_DITHERING // Compute the error, after expanding the 8-bit value back to 16-bit. pResidual[0] = iR - (r8 * 257); pResidual[1] = iG - (g8 * 257); pResidual[2] = iB - (b8 * 257); +#endif // Pack the result, in GRB order. return (g8 << 16) | (r8 << 8) | b8; diff --git a/firmware/fc_pixel_lut.cpp b/firmware/fc_pixel_lut.cpp new file mode 100644 index 0000000..daa260f --- /dev/null +++ b/firmware/fc_pixel_lut.cpp @@ -0,0 +1,86 @@ +/* + * Fadecandy Firmware: Low-level per-pixel LUT code + * (Included into fadecandy.cpp) + * + * Copyright (c) 2013 Micah Elizabeth Scott + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +ALWAYS_INLINE static inline +uint32_t FCP_FN(lutInterpolate)(const uint16_t *lut, uint32_t arg) +{ + /* + * Using our color LUT for the indicated channel, convert the + * 16-bit intensity "arg" in our input colorspace to a corresponding + * 16-bit intensity in the device colorspace. + * + * Remember that our LUT is 257 entries long. The final entry corresponds to an + * input of 0x10000, which can't quite be reached. + * + * 'arg' is in the range [0, 0xFFFF] + * + * This operation is equivalent to the following: + * + * unsigned index = arg >> 8; // Range [0, 0xFF] + * unsigned alpha = arg & 0xFF; // Range [0, 0xFF] + * unsigned invAlpha = 0x100 - alpha; // Range [1, 0x100] + * + * // Result in range [0, 0xFFFF] + * return (lut[index] * invAlpha + lut[index + 1] * alpha) >> 8; + * + * This is easy to understand, but it turns out to be a serious bottleneck + * in terms of speed and memory bandwidth, as well as register pressure that + * affects the compilation of updatePixel(). + * + * To speed this up, we try and do the lut[index] and lut[index+1] portions + * in parallel using the SMUAD instruction. This is a pair of 16x16 multiplies, + * and the results are added together. We can combine this with an unaligned load + * to grab two adjacent entries from the LUT. The remaining complications are: + * + * 1. We wanted unsigned, not signed + * 2. We still need to generate the input values efficiently. + * + * (1) is easy to solve if we're okay with 15-bit precision for the LUT instead + * of 16-bit, which is fine. During LUT preparation, we right-shift each entry + * by 1, keeping them within the positive range of a signed 16-bit int. + * + * For (2), we need to quickly put 'alpha' in the high halfword and invAlpha in + * the low halfword, or vice versa. One fast way to do this is (0x01000000 + x - (x << 16). + */ + +#if FCP_INTERPOLATION + + uint32_t index = arg >> 8; // Range [0, 0xFF] + + // Load lut[index] into low halfword, lut[index+1] into high halfword. + uint32_t pair = *(const uint32_t*)(lut + index); + + unsigned alpha = arg & 0xFF; // Range [0, 0xFF] + + // Reversed halfword order + uint32_t pairAlpha = (0x01000000 + alpha - (alpha << 16)); + + return __SMUADX(pairAlpha, pair) >> 7; + +#else + // Simpler non-interpolated version + return lut[arg >> 8] << 1; +#endif +} -- GitLab