diff --git a/README.md b/README.md
index 1a1a360a1efd84373291a24b45eb09fa96d5f18b..26b5a1a39537559decb8c6683768680a7e2b38b0 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ This firmware is based on Stoffregen's excellent [OctoWS2811](http://www.pjrc.co
 * A high performance USB protocol
 * Zero copy architecture with triple-buffering
 * Interpolation between keyframes
-* Gamma and color correction with per-channel 256-entry lookup tables
+* Gamma and color correction with per-channel lookup tables
 * Temporal dithering
 
 These features add up to give *very smooth* fades and high dynamic range. Ever notice that annoying stair-stepping effect when fading LEDs from off to dim? Fadecandy avoids that using a form of [delta-sigma modulation](http://en.wikipedia.org/wiki/Delta-sigma_modulation). It rapidly wiggles each pixel's value up or down by one 8-bit step, in order to achieve 16-bit resolution for fades.
@@ -21,7 +21,7 @@ Vitals
 * 512 pixels supported per Teensy board (8 strings, 64 pixels per string)
 * Very high hardware frame rate (395 FPS) to support temporal dithering
 * Full-speed (12 Mbps) USB
-* 768-entry 16-bit color lookup table, for gamma correction and color balance
+* 257x3-entry 16-bit color lookup table, for gamma correction and color balance
 
 Color Processing
 ----------------
@@ -39,6 +39,8 @@ Each pixel goes through the following processing steps in Fadecandy:
 * These 8-bit colors are converted to the format needed by OctoWS2811's DMA engine
 * In hardware, the converted colors are streamed out to eight LED strings in parallel
 
+The color lookup tables can be used to implement gamma correction, brightness and contrast, and white-point correction. Each channel (red, green, and blue) has its own 257-entry table, and each entry is a 16-bit output intensity. Entry 0 corresponds to the 16-bit input intensity 0x0000, entry 1 corresponds to 0x0100, and so on. The 257th entry corresponds to an input of 0x10000, which is just past the end of the 16-bit intensity space.
+
 Keyframe Interpolation
 ----------------------
 
@@ -142,7 +144,7 @@ Byte Offset   | Description
 62            | Pixel 20, Green
 63            | Pixel 20, Blue
 
-In a type 1 packet, the USB packet contains up to 31 lookup-table entries. The lookup table is structured as three arrays of 256 entries, starting with the entire red-channel LUT, then the green-channel LUT, then the blue-channel LUT. Each packet is structured as follows:
+In a type 1 packet, the USB packet contains up to 31 lookup-table entries. The lookup table is structured as three arrays of 257 entries, starting with the entire red-channel LUT, then the green-channel LUT, then the blue-channel LUT. Each packet is structured as follows:
 
 Byte Offset   | Description
 ------------- | ------------
diff --git a/firmware/fadecandy.cpp b/firmware/fadecandy.cpp
index caf48002c210f3766ff7f293e5d2f784bcff0d57..6e464b0716a322c4235abd426a93f9063c076622 100644
--- a/firmware/fadecandy.cpp
+++ b/firmware/fadecandy.cpp
@@ -75,6 +75,9 @@ ALWAYS_INLINE static inline uint32_t lutInterpolate(const uint16_t *lut, uint32_
      * Using our color LUT for the indicated channel, convert the
      * 16-bit intensity "arg" in our input colorspace to a corresponding
      * 16-bit intensity in the device colorspace.
+     *
+     * Remember that each channel's LUT is 257 entries long. The final entry
+     * corresponds to an input of 0x10000, which can't quite be reached.
      */
 
     unsigned index = arg >> 8;
@@ -102,9 +105,9 @@ static uint32_t updatePixel(uint32_t icPrev, uint32_t icNext,
     int iB = (pixelPrev[2] * icPrev + pixelNext[2] * icNext) >> 16;
 
     // Pass through our color LUT
-    iR = lutInterpolate(&lut[0 * 256], iR);
-    iG = lutInterpolate(&lut[1 * 256], iG);
-    iB = lutInterpolate(&lut[2 * 256], iB);
+    iR = lutInterpolate(&lut[0 * LUT_CH_SIZE], iR);
+    iG = lutInterpolate(&lut[1 * LUT_CH_SIZE], iG);
+    iB = lutInterpolate(&lut[2 * LUT_CH_SIZE], iB);
 
     // Incorporate the residual from last frame
     iR += pResidual[0];
diff --git a/firmware/fc_defs.h b/firmware/fc_defs.h
index ce272a22d6097f1865cb40ba73c09f1857cdf3ea..b93bb5baab57556de2cf6644f2a5116e4d3b56b1 100644
--- a/firmware/fc_defs.h
+++ b/firmware/fc_defs.h
@@ -31,7 +31,8 @@
 #define LEDS_TOTAL              (LEDS_PER_STRIP * 8)
 #define CHANNELS_TOTAL          (LEDS_TOTAL * 3)
 
-#define LUT_SIZE				(256 * 3)
+#define LUT_CH_SIZE				257
+#define LUT_TOTAL_SIZE			(LUT_CH_SIZE * 3)
 
 // USB packet layout
 #define PIXELS_PER_PACKET       21
diff --git a/firmware/fc_usb.cpp b/firmware/fc_usb.cpp
index 53b0e01f5062bae3339bf4dafbde74187ba2e7b0..7bbba7ac09388b2c268aab087c91dc09528d5382 100644
--- a/firmware/fc_usb.cpp
+++ b/firmware/fc_usb.cpp
@@ -106,10 +106,7 @@ void fcBuffers::finalizeLUT()
      * so this isn't a performance bottleneck.
      */
 
-    for (unsigned i = 0; i < LUT_SIZE; ++i) {
+    for (unsigned i = 0; i < LUT_TOTAL_SIZE; ++i) {
         lutCurrent[i] = lutNew.entry(i);
     }
-
-    // Padding, so that it's okay to read past the end during interpolation
-    lutCurrent[LUT_SIZE] = lutCurrent[LUT_SIZE - 1];
 }
diff --git a/firmware/fc_usb.h b/firmware/fc_usb.h
index d1a019efd77aee667de56e7fb427a1730bbb42bf..c38bc3717c6e1a02a1d052722236b37d9859529f 100644
--- a/firmware/fc_usb.h
+++ b/firmware/fc_usb.h
@@ -110,8 +110,8 @@ struct fcBuffers
 
     fcFramebuffer fb[3];        // Triple-buffered video frames
 
-    fcColorLUT lutNew;                   // Partial LUT, not yet finalized
-    uint16_t lutCurrent[LUT_SIZE + 1];   // Active LUT, linearized for efficiency, padded on the end.
+    fcColorLUT lutNew;                      // Partial LUT, not yet finalized
+    uint16_t lutCurrent[LUT_TOTAL_SIZE];    // Active LUT, linearized for efficiency
 
     uint8_t flags;              // Configuration flags