diff --git a/firmware/src/gfx_decoder.cpp b/firmware/src/gfx_decoder.cpp index 2b73355..491024f 100644 --- a/firmware/src/gfx_decoder.cpp +++ b/firmware/src/gfx_decoder.cpp @@ -40,6 +40,38 @@ int32_t gfx_decoder_loadNextFrame() { return false; } + // Convert framebuffer into raw shift register data for fast PIO pixel pushing + // Data will be held in buffers, one per pixel's depth bit (aka brightness stage), + // with each row split into 32-bit chunks, one per module + // (20 pixels, 24 shift register stages, 8 unused bits) + // TODO: Move this to leds.cpp + // TODO: Use a separate buffer, then copy to ledsBuffer to avoid tearing + for (int bi = 0; bi < 8; bi++) { + uint8_t bitPosition = 1 << bi; + for (int y = 0; y < ROW_COUNT; y++) { + auto yOffset = y * COL_COUNT; + for (int xModule = 0; xModule < COL_MODULES; xModule++) { + auto bufferXOffset = yOffset + xModule * 20; + uint32_t sample = 0; + + for (int x = 0; x < 20; x++) { + // insert placeholders for unused stages + // (before pixels 0, 6, 13) + if (x == 0 || x == 6 || x == 13) { + sample >>= 1; + } + uint8_t px = buffer[bufferXOffset + x]; + bool bit = px & bitPosition; + sample = (sample >> 1) | (bit ? 0x80000000 : 0); + } + // insert placeholder for unused last stage (after pixel 19) + sample >>=1; + + ledBuffer[bi][y * COL_MODULES + xModule] = sample; + } + } + } + // copy to framebuffer // TODO: mutex? double buffer? or something... 
memcpy(framebuffer, buffer, ROW_COUNT * COL_COUNT); diff --git a/firmware/src/leds.cpp b/firmware/src/leds.cpp index 5a50ba5..f9916ba 100644 --- a/firmware/src/leds.cpp +++ b/firmware/src/leds.cpp @@ -38,6 +38,7 @@ uint8_t brightnessPhaseDelays[COLOR_BITS] = {0, 1, 6, 20, 60}; // NOTE: Alignment required to allow 4-byte reads uint8_t framebuffer[ROW_COUNT * COL_COUNT] __attribute__((aligned(32))) = {0}; +uint32_t ledBuffer[8][ROW_COUNT * COL_MODULES] = {0}; void leds_init() { memset(framebuffer, 0, sizeof(framebuffer)); @@ -104,7 +105,7 @@ void leds_render() { for (int yCount = 0; yCount < ROW_COUNT; yCount++) { int y = ROW_COUNT - 1 - yCount; - // brigthness - pushing data takes 40us, so to maximize brightness (at high brightness phases) + // brightness - pushing data takes time, so to maximize brightness (at high brightness phases) // we want to keep the matrix on during update (except during latch). At low brightness phases, // we want it off to actually be dim bool brightPhase = brightnessPhase >= 2; @@ -132,46 +133,23 @@ void leds_render() { // silly shit // TODO: Some ideas for future optimization: // - see if we can disable px pusher delays on improved electric interface - // - use a profiler to see how the inner loop can be improved - // - do the shift register bullshit once per frame, so that data can be loaded into - // registers with aligned access, DMA, etc. // - improve outer loop which adds 2us of processing on each loop // - change busy wait into some kind of interrupt-based thing so that processing can continue // - latch row and clock simultaneously, avoid disabling output - uint8_t *buffer = framebuffer + (y * COL_COUNT); + // - DMA? 
for (int xModule = 0; xModule < COL_MODULES; xModule++) { - uint32_t pxValues; - - // placeholder at 0; pixels 0, 1, 2 - pxValues = *(reinterpret_cast(buffer)); - pxValues = pxValues << 8; - pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase); - - // pixels 3, 4, 5, placeholder at 6 - pxValues = *(reinterpret_cast(buffer + 3)); - pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase); - - // pixels 6, 7, 8, 9 - pxValues = *(reinterpret_cast(buffer + 6)); - pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase); - - // pixels 10, 11, 12, placeholder at 13 - pxValues = *(reinterpret_cast(buffer + 10)); - pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase); - - // pixels 13, 14, 15, 16 - pxValues = *(reinterpret_cast(buffer + 13)); - pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase); - - // pixels 17, 18, 19, placeholder - pxValues = *(reinterpret_cast(buffer + 17)); - pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase); - - buffer += 20; + uint32_t pxValues = ledBuffer[brightnessPhase + 3][y * COL_MODULES + xModule]; + pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues); } // wait for all data to be shifted out - pio_sm_drain_tx_fifo(pusher_pio, pusher_sm); + while (!pio_sm_is_tx_fifo_empty(pusher_pio, pusher_sm)) { + tight_loop_contents(); + } + // TODO: Is there an API to wait for PIO to actually become idle? 
+ // pio_sm_drain_tx_fifo doesn't seem to do the trick + // if not, we might need to use irqs or something + busy_wait_us(4); // disable columns before latch outputEnable(ROW_OE, false); @@ -201,7 +179,7 @@ void leds_initPusher() { uint latchPin = COL_SRCLK; pio_sm_config config = leds_px_pusher_program_get_default_config(offset); - sm_config_set_clkdiv_int_frac(&config, 2, 0); + sm_config_set_clkdiv_int_frac(&config, 1, 0); // Shift OSR to the right, autopull sm_config_set_out_shift(&config, true, true, 32); diff --git a/firmware/src/leds.h b/firmware/src/leds.h index b421094..8d28034 100644 --- a/firmware/src/leds.h +++ b/firmware/src/leds.h @@ -31,5 +31,6 @@ void leds_loop(); void leds_render(); extern uint8_t framebuffer[ROW_COUNT * COL_COUNT]; +extern uint32_t ledBuffer[8][ROW_COUNT * COL_MODULES]; #endif diff --git a/firmware/src/leds.pio b/firmware/src/leds.pio index 554e96b..34c47d5 100644 --- a/firmware/src/leds.pio +++ b/firmware/src/leds.pio @@ -1,9 +1,21 @@ .program leds_px_pusher .side_set 1 opt -.wrap_target public entry_point: - out null, 3 side 0 [0] ; ignore least significant digits - out pins, 1 ; set bit (shifted for brightness phase by C code) - out null, 4 side 1 [1] ; ignore remaining bits, latch data, allow time for latching - nop side 0 ; return to 0 (weird glitches happen otherwise) +.wrap_target + ; get 32 bits from fifo (not required with autopull, useful for debug) + ; pull + ; push 24 bits to the shift registers + ; also, return latch bit to 0 + set x, 23 side 0 + ; ignore the 8 least significant bits + out null, 8 +loop: + ; lower clock edge + nop side 0 + ; set bit + out pins, 1 + ; loop; latch bit (rising edge) + ; TODO: check if this delay can be lowered with a PCB + jmp x-- loop side 1 [2] +end: .wrap diff --git a/firmware/src/leds.pio.h b/firmware/src/leds.pio.h index 12b6992..cf7fe9a 100644 --- a/firmware/src/leds.pio.h +++ b/firmware/src/leds.pio.h @@ -13,23 +13,24 @@ // -------------- // #define leds_px_pusher_wrap_target 0 
-#define leds_px_pusher_wrap 3 +#define leds_px_pusher_wrap 4 #define leds_px_pusher_offset_entry_point 0u static const uint16_t leds_px_pusher_program_instructions[] = { // .wrap_target - 0x7063, // 0: out null, 3 side 0 - 0x6001, // 1: out pins, 1 - 0x7964, // 2: out null, 4 side 1 [1] - 0xb042, // 3: nop side 0 + 0xf037, // 0: set x, 23 side 0 + 0x6068, // 1: out null, 8 + 0xb042, // 2: nop side 0 + 0x6001, // 3: out pins, 1 + 0x1a42, // 4: jmp x--, 2 side 1 [2] // .wrap }; #if !PICO_NO_HARDWARE static const struct pio_program leds_px_pusher_program = { .instructions = leds_px_pusher_program_instructions, - .length = 4, + .length = 5, .origin = -1, };