Revert "optimize: use PWM interrupt based delays for precision and pipelining"

This reverts commit 9f9d089e08.
Revert "wip optimize"
2024-05-27 13:35:38 +02:00 · 2024-05-27 13:35:34 +02:00 · 2024-05-27 13:35:22 +02:00 · 2024-05-27 10:15:22 +02:00 · 2024-05-26 23:02:06 +02:00 · 2024-05-26 22:01:38 +02:00
6 changed files with 166 additions and 104 deletions
--- a/firmware/src/gfx_decoder.cpp
+++ b/firmware/src/gfx_decoder.cpp
@ -40,6 +40,46 @@ int32_t gfx_decoder_loadNextFrame() {
    return false;
  }

+  // Convert framebuffer into raw shift register data for fast PIO pixel pushing
+  // Data will be held in buffers, one per pixel's depth bit (aka brightness stage),
+  // with each row split into 32-bit chunks, one per module
+  // (20 pixels, 24 shift register stages, 8 unused bits)
+  // Rows are inverted, because that's how they're fed to the shift registers
+  // TODO: Move this to leds.cpp
+  // TODO: Use a separate buffer, then copy to ledsBuffer to avoid tearing
+  for (int bi = 0; bi < 8; bi++) {
+    uint8_t bitPosition = 1 << bi;
+    for (int y = 0; y < ROW_COUNT; y++) {
+      auto yOffset = y * COL_COUNT;
+      for (int xModule = 0; xModule < COL_MODULES; xModule++) {
+        auto bufferXOffset = yOffset + xModule * 20;
+        uint32_t sample = 0;
+
+        for (int x = 0; x < 20; x++) {
+          // insert placeholders for unused stages
+          // (before pixels 0, 6, 13)
+          if (x == 0 || x == 6 || x == 13) {
+            sample >>= 1;
+          }
+          uint8_t px = buffer[bufferXOffset + x];
+          bool bit = px & bitPosition;
+          sample = (sample >> 1) | (bit ? 0x80000000 : 0);
+        }
+        // insert placeholder for unused last stage (after pixel 19)
+        sample >>=1;
+        // shift to LSB position
+        sample >>=8;
+        // MSB=1 indicates end of row
+        if (xModule == COL_MODULES - 1) {
+          sample |= 0x80000000;
+        }
+
+        ledBuffer[bi][(ROW_COUNT - 1 - y) * COL_MODULES + xModule] = sample;
+      }
+    }
+  }
+  ledBufferReady = true;
+
  // copy to framebuffer
  // TODO: mutex? double buffer? or something...
  memcpy(framebuffer, buffer, ROW_COUNT * COL_COUNT);
--- a/firmware/src/leds.cpp
+++ b/firmware/src/leds.cpp
@ -16,8 +16,6 @@ inline void pulsePin(uint8_t pin) {
   // there are glitches without this (maybe just due to breadboard...)
  _NOP();
  _NOP();
-  _NOP();
-    // busy_wait_us_32(50);
  gpio_put(pin, LOW);
 }

@ -38,6 +36,8 @@ uint8_t brightnessPhaseDelays[COLOR_BITS] = {0, 1, 6, 20, 60};

 // NOTE: Alignment required to allow 4-byte reads
 uint8_t framebuffer[ROW_COUNT * COL_COUNT]  __attribute__((aligned(32))) = {0};
+bool ledBufferReady = false;
+uint32_t ledBuffer[8][ROW_COUNT * COL_MODULES] = {0};

 void leds_init() {
  memset(framebuffer, 0, sizeof(framebuffer));
@ -50,7 +50,8 @@ void leds_init() {
  pinMode(COL_SER, OUTPUT);
  pinMode(COL_OE, OUTPUT);
  outputEnable(ROW_OE, false);
-  pinMode(COL_RCLK, OUTPUT);
+  // pinMode(COL_RCLK, OUTPUT);
+  pinMode(RCLK, OUTPUT);
  pinMode(COL_SRCLK, OUTPUT);
  pinMode(COL_SRCLR, OUTPUT);

@ -58,18 +59,18 @@ void leds_init() {
  pinMode(ROW_SER, OUTPUT);
  pinMode(ROW_OE, OUTPUT);
  outputEnable(ROW_OE, false);
-  pinMode(ROW_RCLK, OUTPUT);
+  // pinMode(ROW_RCLK, OUTPUT);
  pinMode(ROW_SRCLK, OUTPUT);
  pinMode(ROW_SRCLR, OUTPUT);

  // clear output - cols
  clearShiftReg(COL_SRCLK, COL_SRCLR);
-  pulsePin(COL_RCLK);
-  outputEnable(COL_OE, true);
+  pulsePin(RCLK);
+  outputEnable(COL_OE, true); // this is fine, because we control OE via rows only

  // clear output - rows
  clearShiftReg(ROW_SRCLK, ROW_SRCLR);
-  pulsePin(ROW_RCLK);
+  pulsePin(RCLK);
 }

 void leds_disable() {
@ -93,6 +94,15 @@ void leds_initRenderer() {
 }

 void leds_render() {
+  if (!ledBufferReady) {
+    outputEnable(ROW_OE, false);
+    return;
+  }
+
+  // brightness phase
+  bool brightPhase = brightnessPhase >= 3;
+  auto buffer = ledBuffer[brightnessPhase + 3];
+
  // hide output
  outputEnable(ROW_OE, false);

@ -102,88 +112,54 @@ void leds_render() {
  // start selecting rows
  gpio_put(ROW_SER, HIGH);

-  for (int yCount = 0; yCount < ROW_COUNT; yCount++) {
-    int y = ROW_COUNT - 1 - yCount;
-    // brigthness - pushing data takes 40us, so to maximize brightness (at high brightness phases)
-    // we want to keep the matrix on during update (except during latch). At low brightness phases,
-    // we want it off to actually be dim
-    bool brightPhase = brightnessPhase >= 2;
-    outputEnable(ROW_OE, brightPhase);
+  int bufferOffset = 0;
+  for (int yModule = 0; yModule < ROW_MODULES; yModule++) {
+    for (int moduleY = 0; moduleY < 20; moduleY++) {
+      // brightness - pushing data takes time, so to maximize brightness (at high brightness phases)
+      // we want to keep the matrix on during update (except during latch). At low brightness phases,
+      // we want it off to actually be dim
+      outputEnable(ROW_OE, brightPhase);

-    // next row
-    pulsePin(ROW_SRCLK);
-    // only one row
-    gpio_put(ROW_SER, LOW);
-
-    // we use 7/8 stages on shift registers + 1 is unused
-    int moduleY = yCount % 20;
-    if (moduleY == 0) {
+      // next row
      pulsePin(ROW_SRCLK);
+      // only one row
+      gpio_put(ROW_SER, LOW);
+
+      // we use 7/8 stages on shift registers + 1 is unused
+      if (moduleY == 0) {
+        pulsePin(ROW_SRCLK);
+      }
+
+      if (moduleY == 7 || moduleY == 14 || (moduleY == 0 && yModule != 0)) {
+        pulsePin(ROW_SRCLK);
+      }
+
+      // set row data using PIO
+      // latch signal is also sent here
+      // TODO: Some ideas for future optimization:
+      // - see if we can disable px pusher delays on improved electric interface
+      // - improve outer loop which adds 2us of processing on each loop
+      // - change busy wait into some kind of interrupt-based thing so that processing can continue
+      // - DMA?
+      for (int xModule = 0; xModule < COL_MODULES; xModule++) {
+        uint32_t pxValues = buffer[bufferOffset + xModule];
+        pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues);
+      }
+
+      // wait until pushing and RCLK latch are done
+      while (!pio_interrupt_get(pusher_pio, 0)) {
+        tight_loop_contents();
+      }
+      pio_interrupt_clear(pusher_pio, pusher_sm);
+
+      // show for a certain period
+      outputEnable(ROW_OE, true);
+      busy_wait_us_32(brightnessPhaseDelays[brightnessPhase]);
+      outputEnable(ROW_OE, false);
+
+      // next row
+      bufferOffset += COL_MODULES;
    }
-
-    if (moduleY == 7 || moduleY == 14 || (moduleY == 0 && yCount != 0)) {
-      pulsePin(ROW_SRCLK);
-    }
-
-    // set row data
-    // NOTE: values are loaded right-left
-    // Optimized implementation: use PIO, avoid division, modulo, etc...
-    // we use 7/8 stages of each shift register + 1 is unused so we need to do
-    // silly shit
-    // TODO: Some ideas for future optimization:
-    // - see if we can disable px pusher delays on improved electric interface
-    // - use a profiler to see how the inner loop can be improved
-    // - do the shift register bullshit once per frame, so that data can be loaded into
-    //   registers with aligned access, DMA, etc.
-    // - improve outer loop which adds 2us of processing on each loop
-    // - change busy wait into some kind of interrupt-based thing so that processing can continue
-    // - latch row and clock simultaneously, avoid disabling output
-    uint8_t *buffer = framebuffer + (y * COL_COUNT);
-    for (int xModule = 0; xModule < COL_MODULES; xModule++) {
-      uint32_t pxValues;
-
-      // placeholder at 0; pixels 0, 1, 2
-      pxValues = *(reinterpret_cast<uint32_t *>(buffer));
-      pxValues = pxValues << 8;
-      pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
-
-      // pixels 3, 4, 5, placeholder at 6
-      pxValues = *(reinterpret_cast<uint32_t *>(buffer + 3));
-      pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
-
-      // pixels 6, 7, 8, 9
-      pxValues = *(reinterpret_cast<uint32_t *>(buffer + 6));
-      pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
-
-      // pixels 10, 11, 12, placeholder at 13
-      pxValues = *(reinterpret_cast<uint32_t *>(buffer + 10));
-      pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
-
-      // pixels 13, 14, 15, 16
-      pxValues = *(reinterpret_cast<uint32_t *>(buffer + 13));
-      pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
-
-      // pixels 17, 18, 19, placeholder
-      pxValues = *(reinterpret_cast<uint32_t *>(buffer + 17));
-      pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
-
-      buffer += 20;
-    }
-
-    // wait for all data to be shifted out
-    pio_sm_drain_tx_fifo(pusher_pio, pusher_sm);
-
-    // disable columns before latch
-    outputEnable(ROW_OE, false);
-
-    // latch rows and columns
-    pulsePin(ROW_RCLK);
-    pulsePin(COL_RCLK);
-
-    // show for a certain period
-    outputEnable(ROW_OE, true);
-    busy_wait_us_32(brightnessPhaseDelays[brightnessPhase]);
-    outputEnable(ROW_OE, false);
  }

  // next brightness phase
@ -201,7 +177,7 @@ void leds_initPusher() {
  uint latchPin = COL_SRCLK;

  pio_sm_config config = leds_px_pusher_program_get_default_config(offset);
-  sm_config_set_clkdiv_int_frac(&config, 2, 0);
+  sm_config_set_clkdiv_int_frac(&config, 1, 0);

  // Shift OSR to the right, autopull
  sm_config_set_out_shift(&config, true, true, 32);
@ -214,11 +190,16 @@ void leds_initPusher() {
  // data is inverted
  gpio_set_outover(dataPin, GPIO_OVERRIDE_INVERT);

-  // Set SET (latch) pin, connect to pad, set as output
+  // Set sideset (SRCLK) pin, connect to pad, set as output
  sm_config_set_sideset_pins(&config, latchPin);
  pio_gpio_init(pio, latchPin);
  pio_sm_set_consecutive_pindirs(pio, sm, latchPin, 1, true);

+  // Set SET (RCLK) pin, connect to pad, set as output
+  sm_config_set_set_pins(&config, RCLK, 1);
+  pio_gpio_init(pio, RCLK);
+  pio_sm_set_consecutive_pindirs(pio, sm, RCLK, 1, true);
+
  // Load our configuration, and jump to the start of the program
  pio_sm_init(pio, sm, offset, &config);
  pio_sm_set_enabled(pio, sm, true);
--- a/firmware/src/leds.h
+++ b/firmware/src/leds.h
@ -6,17 +6,19 @@

 #define COL_SER 20
 #define COL_OE 21
-#define COL_RCLK 22
+// #define COL_RCLK 22
+#define RCLK 22
 #define COL_SRCLK 26
 #define COL_SRCLR 27

 #define ROW_SER 14
 #define ROW_OE 13
-#define ROW_RCLK 12
+// #define ROW_RCLK 12
 #define ROW_SRCLK 11
 #define ROW_SRCLR 10

-#define ROW_COUNT 40
+#define ROW_MODULES 2
+#define ROW_COUNT ROW_MODULES * 20
 #define COL_MODULES 2
 #define COL_COUNT COL_MODULES * 20

@ -31,5 +33,7 @@ void leds_loop();
 void leds_render();

 extern uint8_t framebuffer[ROW_COUNT * COL_COUNT];
+extern bool ledBufferReady;
+extern uint32_t ledBuffer[8][ROW_COUNT * COL_MODULES];

 #endif
--- a/firmware/src/leds.pio
+++ b/firmware/src/leds.pio
@ -1,9 +1,31 @@
 .program leds_px_pusher
 .side_set 1 opt
-.wrap_target
 public entry_point:
-  out null, 3   side 0 [0]    ; ignore least significant digits
-  out pins, 1                 ; set bit (shifted for brightness phase by C code)
-  out null, 4   side 1 [1]    ; ignore remaining bits, latch data, allow time for latching
-  nop           side 0        ; return to 0 (weird glitches happen otherwise)
-.wrap
+.wrap_target
+  ; get 32 bits from fifo (not required with autopull, useful for debug)
+  ; pull
+  ; push 24 bits to the shift registers
+  ; also, return latch bit to 0
+  set x, 23      side 0
+loop:
+  ; TODO: check if delays can be lowered with a PCB
+  ; set bit; lower clock edge
+  out pins, 1    side 0 [1]
+  ; loop; latch bit (rising edge)
+  jmp x-- loop   side 1 [2]
+end:
+  ; ignore unused bits
+  ; load MSBs into x
+  ; lower clock edge
+  out x, 8    side 0
+  ; MSB=1 indicates end of row
+  jmp x-- end_of_row
+  .wrap
+end_of_row:
+  ; indicate to main processor that row was processed
+  irq set 0
+  ; clock RCLK (latch onto register output stage)
+  set pins, 1 [3]
+  set pins, 0
+  ; wait for next row
+  jmp entry_point
--- a/firmware/src/leds.pio.h
+++ b/firmware/src/leds.pio.h
@ -13,23 +13,28 @@
 // -------------- //

 #define leds_px_pusher_wrap_target 0
-#define leds_px_pusher_wrap 3
+#define leds_px_pusher_wrap 4

 #define leds_px_pusher_offset_entry_point 0u

 static const uint16_t leds_px_pusher_program_instructions[] = {
            //     .wrap_target
-    0x7063, //  0: out    null, 3         side 0     
-    0x6001, //  1: out    pins, 1                    
-    0x7964, //  2: out    null, 4         side 1 [1] 
-    0xb042, //  3: nop                    side 0     
+    0xf037, //  0: set    x, 23           side 0     
+    0x7101, //  1: out    pins, 1         side 0 [1] 
+    0x1a41, //  2: jmp    x--, 1          side 1 [2] 
+    0x7028, //  3: out    x, 8            side 0     
+    0x0045, //  4: jmp    x--, 5                     
            //     .wrap
+    0xc000, //  5: irq    nowait 0                   
+    0xe301, //  6: set    pins, 1                [3] 
+    0xe000, //  7: set    pins, 0                    
+    0x0000, //  8: jmp    0                          
 };

 #if !PICO_NO_HARDWARE
 static const struct pio_program leds_px_pusher_program = {
    .instructions = leds_px_pusher_program_instructions,
-    .length = 4,
+    .length = 9,
    .origin = -1,
 };

--- a/firmware/src/sd_pinout.txt
+++ b/firmware/src/sd_pinout.txt
@ -1,3 +1,4 @@
+## SD breadboard connector

                  17
 DAT1   GND  +3V3   CS     SHLD
@ -6,7 +7,16 @@ DAT1   GND  +3V3   CS     SHLD
 DET   MISO   CLK   MOSI   DAT2
 28     16    18     19

-
+## Big SD card

 D2  D3/CS   CMD/MOSI   VSS1 VDD CLK  VSS2  D0/MISO  D1
 x    17        19       x    x   18   x      16     x
+
+## LED drivers
+
+- VCC
+- SER
+- OE
+- RCLK
+- SRCLK
+- SRCLR
Author	SHA1	Message	Date
radex	3880a58137	Revert "optimize: use PWM interrupt based delays for precision and pipelining" This reverts commit `9f9d089e08`.	2024-05-27 13:35:38 +02:00
radex	647224897e	Revert "wip optimize" This reverts commit `fa115e88cb`.	2024-05-27 13:35:34 +02:00
radex	fa115e88cb	wip optimize	2024-05-27 13:35:22 +02:00
radex	9f9d089e08	optimize: use PWM interrupt based delays for precision and pipelining	2024-05-27 10:15:22 +02:00
radex	2a953464fd	optimize: use irq to signal readiness	2024-05-26 23:02:06 +02:00
radex	fbbcf0746c	optimize: rclk via pio	2024-05-26 22:01:38 +02:00
radex	589f8a96ae	some microoptimizations	2024-05-26 17:30:40 +02:00
radex	5e368cf5d3	documentation	2024-05-26 16:51:17 +02:00
radex	5a34b7d99a	optimize leds stage	2024-05-26 16:51:08 +02:00