lanai: stall pipeline on memory stall, bypass memory read delay into compute

main
q3k 2021-10-14 18:33:47 +02:00
parent 86cea357a3
commit 1c37f0c695
10 changed files with 273 additions and 171 deletions

View File

@ -20,8 +20,8 @@ interface ESP32;
endinterface
(* synthesize *)
module mkMemory(Lanai_Memory#(4096));
Lanai_Memory#(4096) inner <- mkBlockMemory("boards/ulx3s/bram.bin");
module mkMemory(Lanai_Memory#(1024));
Lanai_Memory#(1024) inner <- mkBlockMemory("boards/ulx3s/bram.bin");
interface dmem = inner.dmem;
interface imem = inner.imem;
endmodule
@ -30,7 +30,7 @@ endmodule
module mkTop (Top);
GSR gsr <- mkGSR;
Lanai_Memory#(4096) mem <- mkMemory;
let mem <- mkMemory;
Lanai_IFC cpu <- mkLanaiCPU;
mkConnection(cpu.imem_client, mem.imem);

View File

@ -13,14 +13,14 @@
00000000
00000000
00000000
00000000
00000000
00000000
00000000
00000000
00000000
00000000
00000000
05810000
04800200
95240000
85a50004
03ac0001
03ac0002
03ac0003
03ac0004
00000000
00000000
00000000

View File

@ -3,6 +3,7 @@ package CPU_Compute;
import GetPut :: *;
import DReg :: *;
import FIFO :: *;
import FIFOF :: *;
import Probe :: *;
import SpecialFIFOs :: *;
@ -13,143 +14,169 @@ interface CPU_Compute;
interface Put #(FetchToCompute) fetch;
interface Get #(ComputeToMemory) memory;
interface ComputedPC pc;
interface RegisterWriteBypass memoryBypass;
endinterface
// Select the value of source register `ix` for the instruction being executed.
//
// Sources are tried in priority order:
//   1. `bypass`, when it is Valid and names the same register as `ix`;
//   2. `pc`, when `ix` is the PC register;
//   3. otherwise the register-file read port `rs`.
function Word resolveRegister(Register ix, Word pc, Maybe#(Tuple2#(Register, Word)) bypass, RegisterRead rs);
    Word resolved;
    if (bypass matches tagged Valid { .fwdReg, .fwdVal } &&& fwdReg == ix) begin
        // A forwarded value takes precedence over every other source.
        resolved = fwdVal;
    end else if (ix == PC) begin
        resolved = pc;
    end else begin
        resolved = rs.read(ix);
    end
    return resolved;
endfunction
module mkCPUCompute #( RegisterRead rs1
, RegisterRead rs2
, StatusWordRead rsw
, RegisterWriteCompute rwr
) (CPU_Compute);
FIFO#(FetchToCompute) q <- mkPipelineFIFO;
FIFO#(ComputeToMemory) out <- mkBypassFIFO;
FIFOF#(FetchToCompute) q <- mkPipelineFIFOF;
Reg#(Word) computedPC <- mkWire;
RWire#(Tuple2#(Register, Word)) regFromMemory <- mkRWire;
let busy <- mkPulseWire;
let busyPut <- mkPulseWire;
let busyProbe <- mkProbe;
let busyPutProbe <- mkProbe;
let fullQ <- mkProbe;
let instrProbe <- mkProbe;
let pcProbe <- mkProbe;
let eaProbe <- mkProbe;
ALU_IFC alu1 <- mkALU;
//ALU_IFC alu2 <- mkALU;
rule execute;
busyProbe <= True;
q.deq;
let instr = q.first.instr;
let instrPC = q.first.pc;
let runAlu = q.first.runAlu;
let destination = q.first.rd;
instrProbe <= instr;
pcProbe <= instrPC;
StatusWord sw = rsw.read;
// Optimization: always read source1/source2 regs, as they use the same opcode positions.
let rs1v = (q.first.rs1 == PC) ? instrPC : rs1.read(q.first.rs1);
let rs2v = (q.first.rs2 == PC) ? instrPC : rs2.read(q.first.rs2);
// Optimization: always build arithmetic op.
let aluOp = AluOperation { a: rs1v
, b: 0
, shiftArithmetic: False
, addCarry: sw.carry
, kind: q.first.aluOpKind
, condition: False
};
Bool flags = False;
Maybe#(Tuple2#(Register, Word)) mrd = tagged Invalid;
Maybe#(StatusWord) msw = tagged Invalid;
case (instr) matches
tagged RI .ri: begin
let shift = ri.high ? 16 : 0;
let czshift = zeroExtend(ri.constant) << shift;
let coshift = ri.high ? { ri.constant, 16'hFFFF } : { 16'hFFFF, ri.constant };
aluOp.b = czshift;
aluOp.shiftArithmetic = ri.high;
case (ri.operation) matches
Add: begin
aluOp.addCarry = False;
end
Sub: begin
aluOp.addCarry = True;
end
And: begin
aluOp.b = coshift;
end
Shift: begin
aluOp.b = signExtend(ri.constant);
end
endcase
flags = ri.flags;
end
tagged RR .rr: begin
aluOp.b = rs2v;
case (rr.operation) matches
Add: begin
aluOp.addCarry = False;
end
Sub: begin
aluOp.addCarry = True;
end
AShift: begin
aluOp.shiftArithmetic = True;
end
Select: begin
aluOp.condition = evaluateCondition(rr.condition, sw);
$display("eval cond code: ", rr.condition, ", sw: ", fshow(sw), ", res: ", aluOp.condition);
end
endcase
flags = rr.flags;
end
tagged RM .rm: begin
Word added = (rs1v + signExtend(rm.constant))[31:0];
Word ea = rm.p ? added : rs1v;
if (rm.q) begin
mrd = tagged Valid tuple2(q.first.rs1, added);
end
out.enq(ComputeToMemory { ea: ea
, store: rm.store
, value: rs2v
, rd: destination });
end
endcase
if (runAlu) begin
let aluRes <- alu1.run(aluOp);
if (destination == PC) begin
computedPC <= aluRes.result;
end else begin
mrd = tagged Valid tuple2(destination, aluRes.result);
end
if (flags) begin
msw = tagged Valid aluRes.sw;
end
end
rwr.write(msw, mrd);
rule updateBusy;
busyProbe <= busy;
endrule
(* preempts = "execute, noExecute" *)
rule noExecute;
busyProbe <= False;
rule updateBusyPut;
busyPutProbe <= busyPut;
endrule
rule updateFullQ;
fullQ <= !q.notFull;
endrule
interface Put fetch;
method put = q.enq;
endinterface
interface Get memory;
method ActionValue#(ComputeToMemory) get();
out.deq;
let o = out.first;
eaProbe <= o.ea;
return o;
let res = ComputeToMemory { ea: 0, op: tagged Noop };
busy.send();
q.deq;
let instr = q.first.instr;
let instrPC = q.first.pc;
let runAlu = q.first.runAlu;
let destination = q.first.rd;
instrProbe <= instr;
pcProbe <= instrPC;
StatusWord sw = rsw.read;
// Optimization: always read source1/source2 regs, as they use the same opcode positions.
let rs1v = resolveRegister(q.first.rs1, instrPC, regFromMemory.wget(), rs1);
let rs2v = resolveRegister(q.first.rs2, instrPC, regFromMemory.wget(), rs2);
// Optimization: always build arithmetic op.
let aluOp = AluOperation { a: rs1v
, b: 0
, shiftArithmetic: False
, addCarry: sw.carry
, kind: q.first.aluOpKind
, condition: False
};
Bool flags = False;
Maybe#(Tuple2#(Register, Word)) mrd = tagged Invalid;
Maybe#(StatusWord) msw = tagged Invalid;
case (instr) matches
tagged RI .ri: begin
let shift = ri.high ? 16 : 0;
let czshift = zeroExtend(ri.constant) << shift;
let coshift = ri.high ? { ri.constant, 16'hFFFF } : { 16'hFFFF, ri.constant };
aluOp.b = czshift;
aluOp.shiftArithmetic = ri.high;
case (ri.operation) matches
Add: begin
aluOp.addCarry = False;
end
Sub: begin
aluOp.addCarry = True;
end
And: begin
aluOp.b = coshift;
end
Shift: begin
aluOp.b = signExtend(ri.constant);
end
endcase
flags = ri.flags;
end
tagged RR .rr: begin
aluOp.b = rs2v;
case (rr.operation) matches
Add: begin
aluOp.addCarry = False;
end
Sub: begin
aluOp.addCarry = True;
end
AShift: begin
aluOp.shiftArithmetic = True;
end
Select: begin
aluOp.condition = evaluateCondition(rr.condition, sw);
$display("eval cond code: ", rr.condition, ", sw: ", fshow(sw), ", res: ", aluOp.condition);
end
endcase
flags = rr.flags;
end
tagged RM .rm: begin
Word added = (rs1v + signExtend(rm.constant))[31:0];
Word ea = rm.p ? added : rs1v;
if (rm.q) begin
mrd = tagged Valid tuple2(q.first.rs1, added);
end
res.ea = ea;
res.op = (rm.store ? tagged Store rs2v : tagged Load destination);
end
endcase
if (runAlu) begin
let aluRes <- alu1.run(aluOp);
if (destination == PC) begin
computedPC <= aluRes.result;
end else begin
mrd = tagged Valid tuple2(destination, aluRes.result);
end
if (flags) begin
msw = tagged Valid aluRes.sw;
end
end
rwr.write(msw, mrd);
return res;
endmethod
endinterface
// Input side of the compute stage: accepts a FetchToCompute record and
// queues it in `q` for execution.
interface Put fetch;
method Action put(FetchToCompute v);
// Pulse for this cycle so that busyPutProbe (driven from busyPut by rule
// updateBusyPut) shows when an instruction is handed over.
busyPut.send();
q.enq(v);
endmethod
endinterface
@ -158,6 +185,12 @@ module mkCPUCompute #( RegisterRead rs1
return computedPC;
endmethod
endinterface
// Bypass input: publishes (ix, value) on the regFromMemory RWire. The operand
// resolution in this module reads that wire via regFromMemory.wget() and
// passes it to resolveRegister.
interface RegisterWriteBypass memoryBypass;
method Action strobe(Register ix, Word value);
regFromMemory.wset(tuple2(ix, value));
endmethod
endinterface
endmodule
endpackage

View File

@ -283,6 +283,11 @@ interface RegisterWriteMemory;
method Action write(Register rd, Word value);
endinterface
// Port for announcing a register write to another pipeline stage.
// mkCPUMemory calls strobe(rd, resp) alongside its register-file write when a
// load response arrives; mkCPUCompute implements it to feed resolveRegister.
//
// strobe arguments:
//   ix    - the register being written
//   value - the word being written to it
// The method is marked always_ready, i.e. it has no ready condition.
interface RegisterWriteBypass;
(* always_ready *)
method Action strobe(Register ix, Word value);
endinterface
typedef enum {
Add, Sub, And, Or, Xor, Shift, Select
} AluOperationKind deriving (Bits);
@ -326,13 +331,15 @@ typedef struct {
AluOperationKind aluOpKind;
} FetchToCompute deriving (Bits);
// Operation that the compute stage requests from the memory stage; carried in
// the `op` field of ComputeToMemory next to the effective address `ea`.
typedef union tagged {
// Instruction needs no memory access. mkCPUCompute uses this as the default
// (with ea 0); mkCPUMemory still issues a request for it but records an
// Invalid destination, so the response is not written to any register.
void Noop;
// Store: the payload is the word to write at `ea`.
Word Store;
// Load: the payload is the register that receives the word read from `ea`.
Register Load;
} ComputeToMemoryOp deriving (Bits);
typedef struct {
Word ea;
Bool store;
// If 'store', the value to store at ea;
Word value;
// If not 'store', the register to which read ea.
Register rd;
ComputeToMemoryOp op;
} ComputeToMemory deriving (Bits);
interface ComputedPC;

View File

@ -22,19 +22,35 @@ module mkCPUFetch #( RegisterRead pcRead
FIFO#(FetchToCompute) out <- mkBypassFIFO;
Reg#(Word) fetched <- mkConfigReg(0);
Reg#(Word) pc <- mkReg(0);
FIFO#(Word) waitRead <- mkPipelineFIFO;
PulseWire busyPredict <- mkPulseWire;
PulseWire busyCorrectCompute <- mkPulseWire;
PulseWire busyComputeGet <- mkPulseWire;
let instructionProbe <- mkProbe;
let pcProbe <- mkProbe;
let busyPredictProbe <- mkProbe;
let busyCorrectComputeProbe <- mkProbe;
let busyComputeGetProbe <- mkProbe;
// Debug visibility only: copies the three activity pulse wires onto their
// probes. Each pulse is sent by the corresponding rule/method in this module
// (updatePCPredict, updatePCCompute, and the compute-side get).
rule updateProbes;
busyPredictProbe <= busyPredict;
busyCorrectComputeProbe <= busyCorrectCompute;
busyComputeGetProbe <= busyComputeGet;
endrule
Reg#(Word) nextPC <- mkWire;
Reg#(Word) fetchPC <- mkWire;
rule updatePCPredict;
busyPredict.send();
nextPC <= pc + 4;
fetchPC <= pc;
endrule
(* preempts = "updatePCCompute, updatePCPredict" *)
rule updatePCCompute;
busyCorrectCompute.send();
let val = pcFromCompute.get;
if (fetched != val) begin
nextPC <= val + 4;
@ -52,11 +68,15 @@ module mkCPUFetch #( RegisterRead pcRead
method ActionValue#(Word) get;
pc <= nextPC;
fetched <= fetchPC;
waitRead.enq(fetchPC);
return fetchPC;
endmethod
endinterface
interface Put response;
method Action put(Word data);
waitRead.deq;
let pc = waitRead.first;
Instruction instr = unpack(data);
Bool runAlu = case(instr) matches
tagged RI .ri: True;
@ -68,7 +88,7 @@ module mkCPUFetch #( RegisterRead pcRead
default: unpack(data[15:11]);
endcase;
out.enq(FetchToCompute { instr: instr
, pc: fetched
, pc: pc
, rs1: unpack(data[22:18])
, rs2: rs2
, rd: unpack(data[27:23])
@ -81,8 +101,10 @@ module mkCPUFetch #( RegisterRead pcRead
interface Get compute;
method ActionValue#(FetchToCompute) get();
busyComputeGet.send();
out.deq;
let o = out.first;
instructionProbe <= o.instr;
pcProbe <= o.pc;
return o;

View File

@ -16,35 +16,41 @@ interface CPU_Memory;
endinterface
typedef struct {
Register rd;
Maybe#(Register) rd;
} WaitReadResponse deriving (Bits);
module mkCPUMemory #( RegisterWriteMemory rwr
, RegisterWriteBypass bypass
) (CPU_Memory);
FIFO #(ComputeToMemory) q <- mkPipelineFIFO;
FIFOF #(ComputeToMemory) q <- mkPipelineFIFOF;
FIFOF #(WaitReadResponse) waitRead <- mkPipelineFIFOF;
PulseWire busyReq <- mkPulseWire;
PulseWire busyResp <- mkPulseWire;
PulseWire busyPut <- mkPulseWire;
let busyReqProbe <- mkProbe;
let busyRespProbe <- mkProbe;
let writeStallProbe <- mkProbe;
let busyPutProbe <- mkProbe;
let fullQ <- mkProbe;
let fullWaitRead <- mkProbe;
let eaProbe <- mkProbe;
let storeProbe <- mkProbe;
let valueProbe <- mkProbe;
let rdProbe <- mkProbe;
let responseRegProbe <- mkProbe;
rule updateBusyProbe;
busyReqProbe <= busyReq;
busyRespProbe <= busyResp;
endrule
rule updateStallProbe;
writeStallProbe <= waitRead.notEmpty;
rule updateBusyPutProbe;
busyPutProbe <= busyPut;
endrule
rule updateFullQ;
fullQ <= !q.notFull;
endrule
rule updateFullWaitRead;
fullWaitRead <= !waitRead.notFull;
endrule
interface Client dmem;
@ -54,36 +60,46 @@ module mkCPUMemory #( RegisterWriteMemory rwr
q.deq;
eaProbe <= q.first.ea;
storeProbe <= q.first.store;
valueProbe <= q.first.value;
rdProbe <= q.first.rd;
if (q.first.store == False) begin
waitRead.enq(WaitReadResponse { rd: q.first.rd });
end
Maybe#(Word) data = tagged Invalid;
case (q.first.op) matches
tagged Noop: begin
waitRead.enq(WaitReadResponse { rd: tagged Invalid });
end
tagged Load .rd: begin
waitRead.enq(WaitReadResponse { rd: tagged Valid rd });
end
tagged Store .d: begin
waitRead.enq(WaitReadResponse { rd: tagged Invalid });
data = tagged Valid d;
end
endcase
return DMemReq { addr: q.first.ea
, data: case (q.first.store) matches
True: tagged Valid q.first.value;
False: tagged Invalid;
endcase
, data: data
};
endmethod
endinterface
interface Put response;
method Action put(Word resp);
busyResp.send();
waitRead.deq;
responseRegProbe <= waitRead.first.rd;
rwr.write(waitRead.first.rd, resp);
if (waitRead.first.rd matches tagged Valid .rd) begin
busyResp.send();
responseRegProbe <= rd;
rwr.write(rd, resp);
bypass.strobe(rd, resp);
end
endmethod
endinterface
endinterface
interface Put compute;
method put = q.enq;
method Action put(ComputeToMemory v);
busyPut.send();
q.enq(v);
endmethod
endinterface
endmodule

View File

@ -21,8 +21,9 @@ module mkLanaiCPU (Lanai_IFC);
CPU_Fetch fetch <- mkCPUFetch( rf.fetchRead
, compute.pc
);
CPU_Memory memory <- mkCPUMemory( rf.memoryWrite
);
CPU_Memory memory <- mkCPUMemory( rf.memoryWrite
, compute.memoryBypass
);
mkConnection(fetch.compute, compute.fetch);
mkConnection(compute.memory, memory.compute);

View File

@ -1,6 +1,8 @@
package Lanai_Memory;
import BRAM :: *;
import FIFO :: *;
import SpecialFIFOs :: *;
import CPU_Defs :: *;
import Lanai_IFC :: *;
@ -19,13 +21,21 @@ module mkBlockMemory#(String filename) (Lanai_Memory#(k)) provisos (Log#(k, n));
cfg.allowWriteResponseBypass = False;
BRAM2Port#(Bit#(n), Bit#(32)) bram <- mkBRAM2Server(cfg);
FIFO#(BRAMRequest#(Bit#(n), Bit#(32))) delayFIFO <- mkPipelineFIFO;
let nwords = valueOf(n);
// Drain one deferred data-memory request from delayFIFO and issue it on
// BRAM port B.
rule delayed_dmem;
    let pending = delayFIFO.first;
    delayFIFO.deq;
    bram.portB.request.put(pending);
endrule
interface Server imem;
interface Put request;
method Action put(Word addr);
bram.portA.request.put(BRAMRequest { write: False
, responseOnWrite: False
, responseOnWrite: True
, address: addr[nwords+1:2]
, datain: 0
});
@ -38,11 +48,15 @@ module mkBlockMemory#(String filename) (Lanai_Memory#(k)) provisos (Log#(k, n));
interface Put request;
method Action put(DMemReq req);
let breq = BRAMRequest { write: isValid(req.data)
, responseOnWrite: False
, responseOnWrite: True
, address: req.addr[nwords+1:2]
, datain: fromMaybe(0, req.data)
};
bram.portB.request.put(breq);
if (req.addr == 256 || req.addr == 0) begin
bram.portB.request.put(breq);
end else begin
delayFIFO.enq(breq);
end
endmethod
endinterface
interface Get response = bram.portB.response;

View File

@ -13,14 +13,14 @@
00000000
00000000
00000000
00000000
00000000
00000000
00000000
00000000
00000000
00000000
00000000
05810000
04800200
95240000
85a50004
03ac0001
03ac0002
03ac0003
03ac0004
00000000
00000000
00000000

View File

@ -238,6 +238,15 @@ insns = {
24: 'add %r11, 3, %r7',
28: 'add %r11, 4, %r7',
60: 'add %r0, 0, %r11',
64: 'add %r0, 512, %r9',
68: 'st %r10, 0[%r9]',
72: 'ld 0[%r9++], %r11',
76: 'add %r11, 1, %r7',
80: 'add %r11, 2, %r7',
84: 'add %r11, 3, %r7',
88: 'add %r11, 4, %r7',
256: 'add %r0, 128, %pc',
}