/*
* Fadecandy Firmware: Low-level pixel update code
* (Included into fadecandy.cpp)
*
* Copyright (c) 2013 Micah Elizabeth Scott
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
ALWAYS_INLINE static inline uint32_t lutInterpolate(const uint16_t *lut, uint32_t arg)
{
    /*
     * Using our color LUT for the indicated channel, convert the
     * 16-bit intensity "arg" in our input colorspace to a corresponding
     * 16-bit intensity in the device colorspace.
     *
     * Parameters:
     *   lut - Lookup table for one channel. It is 257 entries long; the final
     *         entry corresponds to an input of 0x10000, which can't quite be
     *         reached. Entries must already be right-shifted by 1 (15-bit
     *         precision), see (1) below.
     *   arg - Input intensity, in the range [0, 0xFFFF].
     *
     * Returns the interpolated intensity, in the range [0, 0xFFFF].
     *
     * This operation is equivalent to the following (written for a full
     * 16-bit LUT):
     *
     *      unsigned index = arg >> 8;          // Range [0, 0xFF]
     *      unsigned alpha = arg & 0xFF;        // Range [0, 0xFF]
     *      unsigned invAlpha = 0x100 - alpha;  // Range [1, 0x100]
     *
     *      // Result in range [0, 0xFFFF]
     *      return (lut[index] * invAlpha + lut[index + 1] * alpha) >> 8;
     *
     * This is easy to understand, but it turns out to be a serious bottleneck
     * in terms of speed and memory bandwidth, as well as register pressure that
     * affects the compilation of updatePixel().
     *
     * To speed this up, we do the lut[index] and lut[index+1] portions
     * in parallel using the SMUAD instruction. This is a pair of 16x16 multiplies,
     * and the results are added together. We can combine this with an unaligned load
     * to grab two adjacent entries from the LUT. The remaining complications are:
     *
     *   1. We wanted unsigned, not signed
     *   2. We still need to generate the input values efficiently.
     *
     * (1) is easy to solve if we're okay with 15-bit precision for the LUT instead
     * of 16-bit, which is fine. During LUT preparation, we right-shift each entry
     * by 1, keeping them within the positive range of a signed 16-bit int.
     *
     * For (2), we need to quickly put 'alpha' in the high halfword and invAlpha in
     * the low halfword, or vice versa. One fast way to do this is
     * (0x01000000 + x - (x << 16)).
     */

    uint32_t index = arg >> 8;          // Range [0, 0xFF]

    // Load lut[index] into low halfword, lut[index+1] into high halfword.
    // The address is only guaranteed to be 16-bit aligned, so this is the
    // unaligned 32-bit load mentioned above.
    uint32_t pair = *(const uint32_t*)(lut + index);

    unsigned alpha = arg & 0xFF;        // Range [0, 0xFF]

    // Reversed halfword order: invAlpha (0x100 - alpha) lands in the high
    // halfword and alpha in the low halfword. alpha << 16 is always smaller
    // than 0x01000000, so the subtraction cannot wrap.
    uint32_t pairAlpha = (0x01000000 + alpha - (alpha << 16));

    // The exchanging form of the dual multiply pairs alpha with lut[index+1]
    // and invAlpha with lut[index]. Shift by 7 rather than 8 to undo the
    // 1-bit pre-shift applied to the LUT entries.
    return __SMUADX(pairAlpha, pair) >> 7;
}
static uint32_t updatePixel(uint32_t icPrev, uint32_t icNext,
    const uint8_t *pixelPrev, const uint8_t *pixelNext, residual_t *pResidual)
{
    /*
     * Produce one output pixel from the previous and next framebuffer pixels.
     *
     * Stages:
     *   1. Blend the two framebuffer pixels (temporal interpolation)
     *   2. Map each channel through the color LUT
     *   3. Dither: add last frame's residual, quantize to 8 bits, and store
     *      the new quantization error back into pResidual
     *
     * icPrev and icNext are the blend weights. Each lies in [0, 0x1010000]
     * and together they always sum to 0x1010000.
     *
     * pixelPrev / pixelNext are read as three bytes each (indices 0..2 are
     * treated as R, G, B). pResidual holds three per-channel residuals in the
     * same order and is updated in place.
     *
     * Returns the 8-bit channels packed as 0x00GGRRBB.
     */

    // Stage 1: weighted blend per channel, widened to 16-bit color.
    // Each result is in [0, 0xFFFF].
    const int blendR = (pixelPrev[0] * icPrev + pixelNext[0] * icNext) >> 16;
    const int blendG = (pixelPrev[1] * icPrev + pixelNext[1] * icNext) >> 16;
    const int blendB = (pixelPrev[2] * icPrev + pixelNext[2] * icNext) >> 16;

    // Stage 2: color LUT lookup with interpolation.
    // Each result is in [0, 0xFFFF].
    int red = lutInterpolate(buffers.lutCurrent.r, blendR);
    int green = lutInterpolate(buffers.lutCurrent.g, blendG);
    int blue = lutInterpolate(buffers.lutCurrent.b, blendB);

    // Stage 3a: carry over the quantization error left by the previous frame.
    red += pResidual[0];
    green += pResidual[1];
    blue += pResidual[2];

    /*
     * Stage 3b: round to the nearest 8-bit step by adding half a step (0x80)
     * and dropping the low byte. The residual can push a channel outside
     * [0, 0xFFFF], so the sum is saturated to 16 unsigned bits with USAT
     * before the shift. The addition itself has to be an ordinary
     * (non-saturating) add, which is why UQADD16 is not used for the clamp.
     */
    const int red8 = __USAT(red + 0x80, 16) >> 8;
    const int green8 = __USAT(green + 0x80, 16) >> 8;
    const int blue8 = __USAT(blue + 0x80, 16) >> 8;

    // Stage 3c: new residual = wanted 16-bit value minus what the 8-bit
    // output represents once expanded back to 16 bits (x * 257).
    pResidual[0] = red - (red8 * 257);
    pResidual[1] = green - (green8 * 257);
    pResidual[2] = blue - (blue8 * 257);

    // Pack in GRB order: green in bits 23..16, red in 15..8, blue in 7..0.
    return (green8 << 16) | (red8 << 8) | blue8;
}