diff --git a/firmware/src/gfx_decoder.cpp b/firmware/src/gfx_decoder.cpp
index 491024f..2d7b619 100644
--- a/firmware/src/gfx_decoder.cpp
+++ b/firmware/src/gfx_decoder.cpp
@@ -44,6 +44,7 @@ int32_t gfx_decoder_loadNextFrame() {
   // Data will be held in buffers, one per pixel's depth bit (aka brightness stage),
   // with each row split into 32-bit chunks, one per module
   // (20 pixels, 24 shift register stages, 8 unused bits)
+  // Rows are inverted, because that's how they're fed to the shift registers
   // TODO: Move this to leds.cpp
   // TODO: Use a separate buffer, then copy to ledsBuffer to avoid tearing
   for (int bi = 0; bi < 8; bi++) {
@@ -67,7 +68,7 @@ int32_t gfx_decoder_loadNextFrame() {
         // insert placeholder for unused last stage (after pixel 19)
         sample >>=1;
 
-        ledBuffer[bi][y * COL_MODULES + xModule] = sample;
+        ledBuffer[bi][(ROW_COUNT - 1 - y) * COL_MODULES + xModule] = sample;
       }
     }
   }
diff --git a/firmware/src/leds.cpp b/firmware/src/leds.cpp
index f9916ba..a7a1302 100644
--- a/firmware/src/leds.cpp
+++ b/firmware/src/leds.cpp
@@ -16,8 +16,6 @@ inline void pulsePin(uint8_t pin) {
    // there are glitches without this (maybe just due to breadboard...)
   _NOP();
   _NOP();
-  _NOP();
-    // busy_wait_us_32(50);
   gpio_put(pin, LOW);
 }
 
@@ -94,6 +92,10 @@ void leds_initRenderer() {
 }
 
 void leds_render() {
+  // brightness phase
+  bool brightPhase = brightnessPhase >= 3;
+  auto buffer = ledBuffer[brightnessPhase + 3];
+
   // hide output
   outputEnable(ROW_OE, false);
 
@@ -103,65 +105,67 @@ void leds_render() {
   // start selecting rows
   gpio_put(ROW_SER, HIGH);
 
-  for (int yCount = 0; yCount < ROW_COUNT; yCount++) {
-    int y = ROW_COUNT - 1 - yCount;
-    // brigthness - pushing data takes time, so to maximize brightness (at high brightness phases)
-    // we want to keep the matrix on during update (except during latch). At low brightness phases,
-    // we want it off to actually be dim
-    bool brightPhase = brightnessPhase >= 2;
-    outputEnable(ROW_OE, brightPhase);
+  int bufferOffset = 0;
+  for (int yModule = 0; yModule < ROW_MODULES; yModule++) {
+    for (int moduleY = 0; moduleY < 20; moduleY++) {
+      // brigthness - pushing data takes time, so to maximize brightness (at high brightness phases)
+      // we want to keep the matrix on during update (except during latch). At low brightness phases,
+      // we want it off to actually be dim
+      outputEnable(ROW_OE, brightPhase);
 
-    // next row
-    pulsePin(ROW_SRCLK);
-    // only one row
-    gpio_put(ROW_SER, LOW);
-
-    // we use 7/8 stages on shift registers + 1 is unused
-    int moduleY = yCount % 20;
-    if (moduleY == 0) {
+      // next row
       pulsePin(ROW_SRCLK);
+      // only one row
+      gpio_put(ROW_SER, LOW);
+
+      // we use 7/8 stages on shift registers + 1 is unused
+      if (moduleY == 0) {
+        pulsePin(ROW_SRCLK);
+      }
+
+      if (moduleY == 7 || moduleY == 14 || (moduleY == 0 && yModule != 0)) {
+        pulsePin(ROW_SRCLK);
+      }
+
+      // set row data
+      // NOTE: values are loaded right-left
+      // Optimized implementation: use PIO, avoid division, modulo, etc...
+      // we use 7/8 stages of each shift register + 1 is unused so we need to do
+      // silly shit
+      // TODO: Some ideas for future optimization:
+      // - see if we can disable px pusher delays on improved electric interface
+      // - improve outer loop which adds 2us of processing on each loop
+      // - change busy wait into some kind of interrupt-based thing so that processing can continue
+      // - latch row and clock simultaneously, avoid disabling output
+      // - DMA?
+      for (int xModule = 0; xModule < COL_MODULES; xModule++) {
+        uint32_t pxValues = buffer[bufferOffset + xModule];
+        pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues);
+      }
+
+      // wait for all data to be shifted out
+      while (!pio_sm_is_tx_fifo_empty(pusher_pio, pusher_sm)) {
+        tight_loop_contents();
+      }
+      // TODO: Is there an API to wait for PIO to actually become idle?
+      // pio_sm_drain_tx_fifo doesn't seem to do the trick
+      // if not, we might need to use irqs or something
+      busy_wait_us(4);
+
+      // latch rows and columns
+      gpio_set_mask(1 << ROW_RCLK | 1 << COL_RCLK);
+      _NOP();
+      _NOP();
+      gpio_clr_mask(1 << ROW_RCLK | 1 << COL_RCLK);
+
+      // show for a certain period
+      outputEnable(ROW_OE, true);
+      busy_wait_us_32(brightnessPhaseDelays[brightnessPhase]);
+      outputEnable(ROW_OE, false);
+
+      // next row
+      bufferOffset += COL_MODULES;
     }
-
-    if (moduleY == 7 || moduleY == 14 || (moduleY == 0 && yCount != 0)) {
-      pulsePin(ROW_SRCLK);
-    }
-
-    // set row data
-    // NOTE: values are loaded right-left
-    // Optimized implementation: use PIO, avoid division, modulo, etc...
-    // we use 7/8 stages of each shift register + 1 is unused so we need to do
-    // silly shit
-    // TODO: Some ideas for future optimization:
-    // - see if we can disable px pusher delays on improved electric interface
-    // - improve outer loop which adds 2us of processing on each loop
-    // - change busy wait into some kind of interrupt-based thing so that processing can continue
-    // - latch row and clock simultaneously, avoid disabling output
-    // - DMA?
-    for (int xModule = 0; xModule < COL_MODULES; xModule++) {
-      uint32_t pxValues = ledBuffer[brightnessPhase + 3][y * COL_MODULES + xModule];
-      pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues);
-    }
-
-    // wait for all data to be shifted out
-    while (!pio_sm_is_tx_fifo_empty(pusher_pio, pusher_sm)) {
-      tight_loop_contents();
-    }
-    // TODO: Is there an API to wait for PIO to actually become idle?
-    // pio_sm_drain_tx_fifo doesn't seem to do the trick
-    // if not, we might need to use irqs or something
-    busy_wait_us(4);
-
-    // disable columns before latch
-    outputEnable(ROW_OE, false);
-
-    // latch rows and columns
-    pulsePin(ROW_RCLK);
-    pulsePin(COL_RCLK);
-
-    // show for a certain period
-    outputEnable(ROW_OE, true);
-    busy_wait_us_32(brightnessPhaseDelays[brightnessPhase]);
-    outputEnable(ROW_OE, false);
   }
 
   // next brightness phase
diff --git a/firmware/src/leds.h b/firmware/src/leds.h
index 8d28034..f884621 100644
--- a/firmware/src/leds.h
+++ b/firmware/src/leds.h
@@ -16,7 +16,8 @@
 #define ROW_SRCLK 11
 #define ROW_SRCLR 10
 
-#define ROW_COUNT 40
+#define ROW_MODULES 2
+#define ROW_COUNT ROW_MODULES * 20
 #define COL_MODULES 2
 #define COL_COUNT COL_MODULES * 20