diff --git a/bootloader/bootloader.c b/bootloader/bootloader.c
index 9d68f488953e9d914dd65450797e7eef7cafffa7..85cd02fd346b204895062dbcfee7c95c6293df2e 100644
--- a/bootloader/bootloader.c
+++ b/bootloader/bootloader.c
@@ -84,12 +84,12 @@ int main()
     usb_init();
 
     serial_begin(BAUD2DIV(115200));
-    serial_print("Hello from DFU!\n");
+    serial_print("Hello from DFU!\r\n");
 
     while (1) {
     	serial_phex32(dfu_getstate());
     	serial_putchar(' ');
     	serial_phex32(debug);
-    	serial_putchar('\n');
+    	serial_print("\r\n");
     }
 }
diff --git a/bootloader/dfu.c b/bootloader/dfu.c
index 8e40825c03e434437c41fd5c76ecdf70d14ebcbf..766a9b28632e0eacdcc18250640d1a4afc6c05b3 100644
--- a/bootloader/dfu.c
+++ b/bootloader/dfu.c
@@ -26,12 +26,12 @@
 #include "usb_dev.h"
 #include "dfu.h"
 
+extern uint32_t debug;
+
 static dfu_state_t dfu_state = dfuIDLE;
 static dfu_status_t dfu_status = OK;
 static unsigned dfu_poll_timeout = 1;
 
-extern uint32_t debug;
-
 // Programming buffer in MK20DX128 FlexRAM, where the flash controller can quickly access it.
 static __attribute__ ((section(".flexram"))) uint8_t dfu_buffer[DFU_TRANSFER_SIZE];
 
@@ -101,6 +101,8 @@ bool dfu_download(unsigned blockNum, unsigned blockLength,
 	// Store more data...
 	memcpy(dfu_buffer + packetOffset, data, packetLength);
 
+debug++;
+
 	if (packetOffset + packetLength != blockLength) {
 		// Still waiting for more data.
 		return true;
@@ -137,8 +139,6 @@ bool dfu_download(unsigned blockNum, unsigned blockLength,
 
 bool dfu_getstatus(uint8_t *status)
 {
-	debug++;
-
 	switch (dfu_state) {
 
 		case dfuDNLOAD_SYNC:
diff --git a/bootloader/usb_dev.c b/bootloader/usb_dev.c
index ef98b6e7b582dbb412f739f21d45838cf285c1be..1bd97edf5530fff72f85a0fccf87c922eef7df27 100644
--- a/bootloader/usb_dev.c
+++ b/bootloader/usb_dev.c
@@ -58,6 +58,11 @@ static bdt_t table[4];  // EP0 only
                 | ((data) ? BDT_DATA1 : BDT_DATA0) \
                 | ((count) << 16))
 
+// No DTS (data toggle synchronization): accept either DATA0 or DATA1 on RX.
+// This greatly simplifies buffer management for EP0 OUT transfers that
+// span multiple packets per transaction.
+#define BDT_DESC_RX(count)      (BDT_OWN | ((count) << 16))
+
 #define TX   1
 #define RX   0
 #define ODD  1
@@ -112,6 +117,8 @@ static uint16_t ep0_rx_offset;
 static uint8_t ep0_tx_bdt_bank = 0;
 static uint8_t ep0_tx_data_toggle = 0;
 
+static uint8_t reply_buffer[8];
+
 volatile uint8_t usb_configuration = 0;
 
 
@@ -120,7 +127,6 @@ static void endpoint0_stall(void)
     USB0_ENDPT0 = USB_ENDPT_EPSTALL | USB_ENDPT_EPRXEN | USB_ENDPT_EPTXEN | USB_ENDPT_EPHSHK;
 }
 
-
 static void endpoint0_transmit(const void *data, uint32_t len)
 {
     table[index(0, TX, ep0_tx_bdt_bank)].addr = (void *)data;
@@ -129,14 +135,6 @@ static void endpoint0_transmit(const void *data, uint32_t len)
     ep0_tx_bdt_bank ^= 1;
 }
 
-static void endpoint0_rx_release(bdt_t *b)
-{
-    // Return an RX buffer to the SIE. We keep even/odd buffers in sync with data toggle.
-    b->desc = BDT_DESC(EP0_SIZE, ((uint32_t)b & 8) ? DATA1 : DATA0);
-}
-
-static uint8_t reply_buffer[8];
-
 static void usb_setup(void)
 {
     const uint8_t *data = NULL;
@@ -275,7 +273,7 @@ static void usb_setup(void)
         endpoint0_stall();
         return;
     }
-    send:
+ send:
 
     if (datalen > setup.wLength) datalen = setup.wLength;
     size = datalen;
@@ -314,14 +312,15 @@ static void usb_control(uint32_t stat)
         setup.word1 = *(uint32_t *)(buf);
         setup.word2 = *(uint32_t *)(buf + 4);
 
-        endpoint0_rx_release(b);
+        // Give the buffer back
+        b->desc = BDT_DESC_RX(EP0_SIZE);
 
         // clear any leftover pending IN transactions
         ep0_tx_ptr = NULL;
         table[index(0, TX, EVEN)].desc = 0;
         table[index(0, TX, ODD)].desc = 0;
 
-        // first IN after Setup is always DATA1
+        // first IN or OUT after Setup is always DATA1
         ep0_tx_data_toggle = 1;
 
         // If we're receiving, start at the beginning
@@ -335,16 +334,21 @@ static void usb_control(uint32_t stat)
         // The only control OUT request we have now, DFU_DNLOAD
         if (setup.wRequestAndType == 0x0121 && setup.wIndex == 0 &&
             ep0_rx_offset <= setup.wLength) {
+            bool success;
 
             size = setup.wLength - ep0_rx_offset;
             if (size > EP0_SIZE) size = EP0_SIZE;
 
-            if (dfu_download(setup.wValue,   // blockNum
-                             setup.wLength,  // blockLength
-                             ep0_rx_offset,  // packetOffset
-                             size,           // packetLength
-                             buf)) {         // data
+            success = dfu_download(setup.wValue,   // blockNum
+                                   setup.wLength,  // blockLength
+                                   ep0_rx_offset,  // packetOffset
+                                   size,           // packetLength
+                                   buf);           // data
+
+            // Give the buffer back
+            b->desc = BDT_DESC_RX(EP0_SIZE);
 
+            if (success) {
                 ep0_rx_offset += size;
                 if (ep0_rx_offset >= setup.wLength) {
                     // End of transaction, acknowledge with a zero-length IN                
@@ -353,12 +357,13 @@ static void usb_control(uint32_t stat)
             } else {
                 endpoint0_stall();
             }
+        } else {
+            // Give the buffer back
+            b->desc = BDT_DESC_RX(EP0_SIZE);
         }
-        endpoint0_rx_release(b);
         break;
 
     case 0x09: // IN transaction completed to host
-
         // send remaining data, if any...
         data = ep0_tx_ptr;
         if (data) {
@@ -374,7 +379,6 @@ static void usb_control(uint32_t stat)
             setup.bRequest = 0;
             USB0_ADDR = setup.wValue;
         }
-
         break;
     }
     USB0_CTL = USB_CTL_USBENSOFEN; // clear TXSUSPENDTOKENBUSY bit
@@ -412,9 +416,9 @@ restart:
         ep0_tx_bdt_bank = 0;
 
         // set up buffers to receive Setup and OUT packets
-        table[index(0, RX, EVEN)].desc = BDT_DESC(EP0_SIZE, 0);
+        table[index(0, RX, EVEN)].desc = BDT_DESC_RX(EP0_SIZE);
         table[index(0, RX, EVEN)].addr = ep0_rx0_buf;
-        table[index(0, RX, ODD)].desc = BDT_DESC(EP0_SIZE, 1);
+        table[index(0, RX, ODD)].desc = BDT_DESC_RX(EP0_SIZE);
         table[index(0, RX, ODD)].addr = ep0_rx1_buf;
         table[index(0, TX, EVEN)].desc = 0;
         table[index(0, TX, ODD)].desc = 0;