1
0
Fork 0

Compare commits

...

9 Commits

Author SHA1 Message Date
radex 3880a58137
Revert "optimize: use PWM interrupt based delays for precision and pipelining"
This reverts commit 9f9d089e08.
2024-05-27 13:35:38 +02:00
radex 647224897e
Revert "wip optimize"
This reverts commit fa115e88cb.
2024-05-27 13:35:34 +02:00
radex fa115e88cb
wip optimize 2024-05-27 13:35:22 +02:00
radex 9f9d089e08
optimize: use PWM interrupt based delays for precision and pipelining 2024-05-27 10:15:22 +02:00
radex 2a953464fd
optimize: use irq to signal readiness 2024-05-26 23:02:06 +02:00
radex fbbcf0746c
optimize: rclk via pio 2024-05-26 22:01:38 +02:00
radex 589f8a96ae
some microoptimizations 2024-05-26 17:30:40 +02:00
radex 5e368cf5d3
documentation 2024-05-26 16:51:17 +02:00
radex 5a34b7d99a
optimize leds stage 2024-05-26 16:51:08 +02:00
6 changed files with 166 additions and 104 deletions

View File

@ -40,6 +40,46 @@ int32_t gfx_decoder_loadNextFrame() {
return false;
}
// Convert framebuffer into raw shift register data for fast PIO pixel pushing
// Data will be held in buffers, one per pixel's depth bit (aka brightness stage),
// with each row split into 32-bit chunks, one per module
// (20 pixels, 24 shift register stages, 8 unused bits)
// Rows are inverted, because that's how they're fed to the shift registers
// TODO: Move this to leds.cpp
// TODO: Use a separate buffer, then copy to ledsBuffer to avoid tearing
for (int bi = 0; bi < 8; bi++) {
uint8_t bitPosition = 1 << bi;
for (int y = 0; y < ROW_COUNT; y++) {
auto yOffset = y * COL_COUNT;
for (int xModule = 0; xModule < COL_MODULES; xModule++) {
auto bufferXOffset = yOffset + xModule * 20;
uint32_t sample = 0;
for (int x = 0; x < 20; x++) {
// insert placeholders for unused stages
// (before pixels 0, 6, 13)
if (x == 0 || x == 6 || x == 13) {
sample >>= 1;
}
uint8_t px = buffer[bufferXOffset + x];
bool bit = px & bitPosition;
sample = (sample >> 1) | (bit ? 0x80000000 : 0);
}
// insert placeholder for unused last stage (after pixel 19)
sample >>=1;
// shift to LSB position
sample >>=8;
// MSB=1 indicates end of row
if (xModule == COL_MODULES - 1) {
sample |= 0x80000000;
}
ledBuffer[bi][(ROW_COUNT - 1 - y) * COL_MODULES + xModule] = sample;
}
}
}
ledBufferReady = true;
// copy to framebuffer
// TODO: mutex? double buffer? or something...
memcpy(framebuffer, buffer, ROW_COUNT * COL_COUNT);

View File

@ -16,8 +16,6 @@ inline void pulsePin(uint8_t pin) {
// there are glitches without this (maybe just due to breadboard...)
_NOP();
_NOP();
_NOP();
// busy_wait_us_32(50);
gpio_put(pin, LOW);
}
@ -38,6 +36,8 @@ uint8_t brightnessPhaseDelays[COLOR_BITS] = {0, 1, 6, 20, 60};
// NOTE: Alignment required to allow 4-byte reads
uint8_t framebuffer[ROW_COUNT * COL_COUNT] __attribute__((aligned(32))) = {0};
bool ledBufferReady = false;
uint32_t ledBuffer[8][ROW_COUNT * COL_MODULES] = {0};
void leds_init() {
memset(framebuffer, 0, sizeof(framebuffer));
@ -50,7 +50,8 @@ void leds_init() {
pinMode(COL_SER, OUTPUT);
pinMode(COL_OE, OUTPUT);
outputEnable(ROW_OE, false);
pinMode(COL_RCLK, OUTPUT);
// pinMode(COL_RCLK, OUTPUT);
pinMode(RCLK, OUTPUT);
pinMode(COL_SRCLK, OUTPUT);
pinMode(COL_SRCLR, OUTPUT);
@ -58,18 +59,18 @@ void leds_init() {
pinMode(ROW_SER, OUTPUT);
pinMode(ROW_OE, OUTPUT);
outputEnable(ROW_OE, false);
pinMode(ROW_RCLK, OUTPUT);
// pinMode(ROW_RCLK, OUTPUT);
pinMode(ROW_SRCLK, OUTPUT);
pinMode(ROW_SRCLR, OUTPUT);
// clear output - cols
clearShiftReg(COL_SRCLK, COL_SRCLR);
pulsePin(COL_RCLK);
outputEnable(COL_OE, true);
pulsePin(RCLK);
outputEnable(COL_OE, true); // this is fine, because we control OE via rows only
// clear output - rows
clearShiftReg(ROW_SRCLK, ROW_SRCLR);
pulsePin(ROW_RCLK);
pulsePin(RCLK);
}
void leds_disable() {
@ -93,6 +94,15 @@ void leds_initRenderer() {
}
void leds_render() {
if (!ledBufferReady) {
outputEnable(ROW_OE, false);
return;
}
// brightness phase
bool brightPhase = brightnessPhase >= 3;
auto buffer = ledBuffer[brightnessPhase + 3];
// hide output
outputEnable(ROW_OE, false);
@ -102,88 +112,54 @@ void leds_render() {
// start selecting rows
gpio_put(ROW_SER, HIGH);
for (int yCount = 0; yCount < ROW_COUNT; yCount++) {
int y = ROW_COUNT - 1 - yCount;
// brightness - pushing data takes 40us, so to maximize brightness (at high brightness phases)
// we want to keep the matrix on during update (except during latch). At low brightness phases,
// we want it off to actually be dim
bool brightPhase = brightnessPhase >= 2;
outputEnable(ROW_OE, brightPhase);
int bufferOffset = 0;
for (int yModule = 0; yModule < ROW_MODULES; yModule++) {
for (int moduleY = 0; moduleY < 20; moduleY++) {
// brightness - pushing data takes time, so to maximize brightness (at high brightness phases)
// we want to keep the matrix on during update (except during latch). At low brightness phases,
// we want it off to actually be dim
outputEnable(ROW_OE, brightPhase);
// next row
pulsePin(ROW_SRCLK);
// only one row
gpio_put(ROW_SER, LOW);
// we use 7/8 stages on shift registers + 1 is unused
int moduleY = yCount % 20;
if (moduleY == 0) {
// next row
pulsePin(ROW_SRCLK);
// only one row
gpio_put(ROW_SER, LOW);
// we use 7/8 stages on shift registers + 1 is unused
if (moduleY == 0) {
pulsePin(ROW_SRCLK);
}
if (moduleY == 7 || moduleY == 14 || (moduleY == 0 && yModule != 0)) {
pulsePin(ROW_SRCLK);
}
// set row data using PIO
// latch signal is also sent here
// TODO: Some ideas for future optimization:
// - see if we can disable px pusher delays on improved electric interface
// - improve outer loop which adds 2us of processing on each loop
// - change busy wait into some kind of interrupt-based thing so that processing can continue
// - DMA?
for (int xModule = 0; xModule < COL_MODULES; xModule++) {
uint32_t pxValues = buffer[bufferOffset + xModule];
pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues);
}
// wait until pushing and RCLK latch are done
while (!pio_interrupt_get(pusher_pio, 0)) {
tight_loop_contents();
}
pio_interrupt_clear(pusher_pio, pusher_sm);
// show for a certain period
outputEnable(ROW_OE, true);
busy_wait_us_32(brightnessPhaseDelays[brightnessPhase]);
outputEnable(ROW_OE, false);
// next row
bufferOffset += COL_MODULES;
}
if (moduleY == 7 || moduleY == 14 || (moduleY == 0 && yCount != 0)) {
pulsePin(ROW_SRCLK);
}
// set row data
// NOTE: values are loaded right-left
// Optimized implementation: use PIO, avoid division, modulo, etc...
// we use 7/8 stages of each shift register + 1 is unused so we need to do
// silly shit
// TODO: Some ideas for future optimization:
// - see if we can disable px pusher delays on improved electric interface
// - use a profiler to see how the inner loop can be improved
// - do the shift register bullshit once per frame, so that data can be loaded into
// registers with aligned access, DMA, etc.
// - improve outer loop which adds 2us of processing on each loop
// - change busy wait into some kind of interrupt-based thing so that processing can continue
// - latch row and clock simultaneously, avoid disabling output
uint8_t *buffer = framebuffer + (y * COL_COUNT);
for (int xModule = 0; xModule < COL_MODULES; xModule++) {
uint32_t pxValues;
// placeholder at 0; pixels 0, 1, 2
pxValues = *(reinterpret_cast<uint32_t *>(buffer));
pxValues = pxValues << 8;
pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
// pixels 3, 4, 5, placeholder at 6
pxValues = *(reinterpret_cast<uint32_t *>(buffer + 3));
pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
// pixels 6, 7, 8, 9
pxValues = *(reinterpret_cast<uint32_t *>(buffer + 6));
pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
// pixels 10, 11, 12, placeholder at 13
pxValues = *(reinterpret_cast<uint32_t *>(buffer + 10));
pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
// pixels 13, 14, 15, 16
pxValues = *(reinterpret_cast<uint32_t *>(buffer + 13));
pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
// pixels 17, 18, 19, placeholder
pxValues = *(reinterpret_cast<uint32_t *>(buffer + 17));
pio_sm_put_blocking(pusher_pio, pusher_sm, pxValues >> brightnessPhase);
buffer += 20;
}
// wait for all data to be shifted out
pio_sm_drain_tx_fifo(pusher_pio, pusher_sm);
// disable columns before latch
outputEnable(ROW_OE, false);
// latch rows and columns
pulsePin(ROW_RCLK);
pulsePin(COL_RCLK);
// show for a certain period
outputEnable(ROW_OE, true);
busy_wait_us_32(brightnessPhaseDelays[brightnessPhase]);
outputEnable(ROW_OE, false);
}
// next brightness phase
@ -201,7 +177,7 @@ void leds_initPusher() {
uint latchPin = COL_SRCLK;
pio_sm_config config = leds_px_pusher_program_get_default_config(offset);
sm_config_set_clkdiv_int_frac(&config, 2, 0);
sm_config_set_clkdiv_int_frac(&config, 1, 0);
// Shift OSR to the right, autopull
sm_config_set_out_shift(&config, true, true, 32);
@ -214,11 +190,16 @@ void leds_initPusher() {
// data is inverted
gpio_set_outover(dataPin, GPIO_OVERRIDE_INVERT);
// Set SET (latch) pin, connect to pad, set as output
// Set sideset (SRCLK) pin, connect to pad, set as output
sm_config_set_sideset_pins(&config, latchPin);
pio_gpio_init(pio, latchPin);
pio_sm_set_consecutive_pindirs(pio, sm, latchPin, 1, true);
// Set SET (RCLK) pin, connect to pad, set as output
sm_config_set_set_pins(&config, RCLK, 1);
pio_gpio_init(pio, RCLK);
pio_sm_set_consecutive_pindirs(pio, sm, RCLK, 1, true);
// Load our configuration, and jump to the start of the program
pio_sm_init(pio, sm, offset, &config);
pio_sm_set_enabled(pio, sm, true);

View File

@ -6,17 +6,19 @@
#define COL_SER 20
#define COL_OE 21
#define COL_RCLK 22
// #define COL_RCLK 22
#define RCLK 22
#define COL_SRCLK 26
#define COL_SRCLR 27
#define ROW_SER 14
#define ROW_OE 13
#define ROW_RCLK 12
// #define ROW_RCLK 12
#define ROW_SRCLK 11
#define ROW_SRCLR 10
#define ROW_COUNT 40
// Number of LED modules along the row axis (each module contributes 20 rows)
#define ROW_MODULES 2
// Total row count; parenthesized so the macro expands safely inside larger
// expressions (e.g. `x / ROW_COUNT`, `x % ROW_COUNT`)
#define ROW_COUNT (ROW_MODULES * 20)
// Number of LED modules along the column axis (each module contributes 20 columns)
#define COL_MODULES 2
// Total column count; parenthesized for the same reason as ROW_COUNT
#define COL_COUNT (COL_MODULES * 20)
@ -31,5 +33,7 @@ void leds_loop();
void leds_render();
extern uint8_t framebuffer[ROW_COUNT * COL_COUNT];
extern bool ledBufferReady;
extern uint32_t ledBuffer[8][ROW_COUNT * COL_MODULES];
#endif

View File

@ -1,9 +1,31 @@
.program leds_px_pusher
.side_set 1 opt
.wrap_target
public entry_point:
out null, 3 side 0 [0] ; ignore least significant digits
out pins, 1 ; set bit (shifted for brightness phase by C code)
out null, 4 side 1 [1] ; ignore remaining bits, latch data, allow time for latching
nop side 0 ; return to 0 (weird glitches happen otherwise)
.wrap
.wrap_target
; get 32 bits from fifo (not required with autopull, useful for debug)
; pull
; push 24 bits to the shift registers
; also, return latch bit to 0
set x, 23 side 0
loop:
; TODO: check if delays can be lowered with a PCB
; set bit; lower clock edge
out pins, 1 side 0 [1]
; loop; latch bit (rising edge)
jmp x-- loop side 1 [2]
end:
; ignore unused bits
; load MSBs into x
; lower clock edge
out x, 8 side 0
; MSB=1 indicates end of row
jmp x-- end_of_row
.wrap
end_of_row:
; indicate to main processor that row was processed
irq set 0
; clock RCLK (latch onto register output stage)
set pins, 1 [3]
set pins, 0
; wait for next row
jmp entry_point

View File

@ -13,23 +13,28 @@
// -------------- //
#define leds_px_pusher_wrap_target 0
#define leds_px_pusher_wrap 3
#define leds_px_pusher_wrap 4
#define leds_px_pusher_offset_entry_point 0u
static const uint16_t leds_px_pusher_program_instructions[] = {
// .wrap_target
0x7063, // 0: out null, 3 side 0
0x6001, // 1: out pins, 1
0x7964, // 2: out null, 4 side 1 [1]
0xb042, // 3: nop side 0
0xf037, // 0: set x, 23 side 0
0x7101, // 1: out pins, 1 side 0 [1]
0x1a41, // 2: jmp x--, 1 side 1 [2]
0x7028, // 3: out x, 8 side 0
0x0045, // 4: jmp x--, 5
// .wrap
0xc000, // 5: irq nowait 0
0xe301, // 6: set pins, 1 [3]
0xe000, // 7: set pins, 0
0x0000, // 8: jmp 0
};
#if !PICO_NO_HARDWARE
static const struct pio_program leds_px_pusher_program = {
.instructions = leds_px_pusher_program_instructions,
.length = 4,
.length = 9,
.origin = -1,
};

View File

@ -1,3 +1,4 @@
## SD breadboard connector
17
DAT1 GND +3V3 CS SHLD
@ -6,7 +7,16 @@ DAT1 GND +3V3 CS SHLD
DET MISO CLK MOSI DAT2
28 16 18 19
## Big SD card
D2 D3/CS CMD/MOSI VSS1 VDD CLK VSS2 D0/MISO D1
x 17 19 x x 18 x 16 x
## LED drivers
- VCC
- SER
- OE
- RCLK
- SRCLK
- SRCLR