feat(ip): add AXI CDMA IP

This commit is contained in:
2026-04-12 22:25:00 +08:00
parent 14c45b9886
commit 8bad8c21e5
7 changed files with 1300 additions and 0 deletions

View File

@@ -0,0 +1,455 @@
module snix_axi_mm2mm #(parameter int ADDR_WIDTH = 32,
parameter int DATA_WIDTH = 64,
parameter int ID_WIDTH = 4,
parameter int USER_WIDTH = 1,
parameter int FIFO_DEPTH = 16)
(// Global signals
input logic clk,
input logic rst_n,
// Control interface
input logic ctrl_start,
input logic ctrl_stop,
input logic [ADDR_WIDTH-1:0] ctrl_src_addr,
input logic [ADDR_WIDTH-1:0] ctrl_dst_addr,
input logic [7:0] ctrl_len,
input logic [2:0] ctrl_size,
input logic [31:0] ctrl_transfer_len,
output logic ctrl_done,
// AW Channel
output logic [ID_WIDTH-1:0] mm2mm_awid,
output logic [ADDR_WIDTH-1:0] mm2mm_awaddr,
output logic [7:0] mm2mm_awlen,
output logic [2:0] mm2mm_awsize,
output logic [1:0] mm2mm_awburst,
output logic mm2mm_awlock,
output logic [3:0] mm2mm_awcache,
output logic [2:0] mm2mm_awprot,
output logic [3:0] mm2mm_awqos,
output logic [USER_WIDTH-1:0] mm2mm_awuser,
output logic mm2mm_awvalid,
input logic mm2mm_awready,
// W Channel
output logic [DATA_WIDTH-1:0] mm2mm_wdata,
output logic [DATA_WIDTH/8-1:0] mm2mm_wstrb,
output logic mm2mm_wlast,
output logic [USER_WIDTH-1:0] mm2mm_wuser,
output logic mm2mm_wvalid,
input logic mm2mm_wready,
// B Channel
input logic [ID_WIDTH-1:0] mm2mm_bid,
input logic [1:0] mm2mm_bresp,
input logic [USER_WIDTH-1:0] mm2mm_buser,
input logic mm2mm_bvalid,
output logic mm2mm_bready,
// AR Channel
output logic [ID_WIDTH-1:0] mm2mm_arid,
output logic [ADDR_WIDTH-1:0] mm2mm_araddr,
output logic [7:0] mm2mm_arlen,
output logic [2:0] mm2mm_arsize,
output logic [1:0] mm2mm_arburst,
output logic mm2mm_arlock,
output logic [3:0] mm2mm_arcache,
output logic [2:0] mm2mm_arprot,
output logic [3:0] mm2mm_arqos,
output logic [USER_WIDTH-1:0] mm2mm_aruser,
output logic mm2mm_arvalid,
input logic mm2mm_arready,
// R Channel
input logic [ID_WIDTH-1:0] mm2mm_rid,
input logic [DATA_WIDTH-1:0] mm2mm_rdata,
input logic [1:0] mm2mm_rresp,
input logic mm2mm_rlast,
input logic [USER_WIDTH-1:0] mm2mm_ruser,
input logic mm2mm_rvalid,
output logic mm2mm_rready
);
// -------------------------------------------------------------------------
// Local parameters
// -------------------------------------------------------------------------
localparam int STRB_WIDTH = DATA_WIDTH / 8;
localparam int STRB_IDX_WIDTH = $clog2(STRB_WIDTH) + 1; // +1 to hold full-width value
// -------------------------------------------------------------------------
// Start / stop edge detection
// -------------------------------------------------------------------------
logic ctrl_start_r, wr_start_edge;
logic ctrl_stop_r, wr_stop_edge;
always_ff @(posedge clk or negedge rst_n)
if (!rst_n) begin
ctrl_start_r <= 1'b0;
ctrl_stop_r <= 1'b0;
end else begin
ctrl_start_r <= ctrl_start;
ctrl_stop_r <= ctrl_stop;
end
assign wr_start_edge = ctrl_start & ~ctrl_start_r;
assign wr_stop_edge = ctrl_stop & ~ctrl_stop_r;
// -------------------------------------------------------------------------
// Abort latch — set on stop edge, cleared on start edge
// -------------------------------------------------------------------------
logic wr_abort;
always_ff @(posedge clk or negedge rst_n)
if (!rst_n) wr_abort <= 1'b0;
else if (wr_stop_edge) wr_abort <= 1'b1;
else if (wr_start_edge) wr_abort <= 1'b0;
// -------------------------------------------------------------------------
// FSM
// -------------------------------------------------------------------------
typedef enum logic [2:0] {IDLE, PREP1, PREP2, AR, READ, AW, WRITE, WAIT_BRESP} state_t;
state_t state, next_state;
logic transfer_done;
always_ff @(posedge clk or negedge rst_n)
if (!rst_n) state <= IDLE;
else state <= next_state;
always_comb begin
next_state = state; // default holds state; prevents latches
case (state)
IDLE: begin
next_state = (ctrl_start && !wr_abort) ? PREP1 : IDLE;
end
PREP1: begin
next_state = wr_abort ? IDLE : PREP2;
end
PREP2: begin
next_state = AR;
end
AR: begin
if (wr_abort)
next_state = IDLE;
else
next_state = (mm2mm_arvalid && mm2mm_arready) ? READ : AR;
end
READ: begin
next_state = (mm2mm_rvalid && mm2mm_rready && mm2mm_rlast) ? AW : READ;
end
AW: begin
if (wr_abort)
next_state = IDLE;
else
next_state = (mm2mm_awvalid && mm2mm_awready) ? WRITE : AW;
end
WRITE: begin
next_state = (mm2mm_wvalid && mm2mm_wready && mm2mm_wlast) ? WAIT_BRESP : WRITE;
end
WAIT_BRESP: begin
if (mm2mm_bvalid && mm2mm_bready) begin
if (wr_abort || transfer_done)
next_state = IDLE;
else
next_state = PREP1;
end
end
default: ;
endcase
end
// -------------------------------------------------------------------------
// 4K boundary & burst-length computation — pipelined across PREP1 / PREP2
//
// Identical two-stage pipeline to s2mm / mm2s. The src address is used
// for the 4K check; burst_actual_bytes is applied to both pointers so
// they stay in lock-step.
//
// PREP1 [Stage 1]: max_len, next_size, src_axi_addr (all regs)
// → next_bytes → cross_4k → bytes_to_4k
// → register bytes_to_4k_r
//
// PREP2 [Stage 2]: bytes_to_4k_r, pending_bytes, next_size (all regs)
// → num_bytes_comb → next_len_o
// → register next_arlen / next_awlen | burst_actual_bytes
// -------------------------------------------------------------------------
logic [7:0] max_len;
logic [2:0] next_size;
logic [ADDR_WIDTH-1:0] src_axi_addr;
logic [ADDR_WIDTH-1:0] dst_axi_addr;
logic [31:0] pending_bytes;
// Stage 1 wires
logic [14:0] next_bytes;
logic cross_4k;
logic [14:0] bytes_to_4k;
// Stage 1 → Stage 2 pipeline register
logic [14:0] bytes_to_4k_r;
// Stage 2 wires
logic [14:0] num_bytes_comb;
logic [7:0] next_len_o;
// Stage 1 combinatorial
assign next_bytes = compute_num_bytes(max_len, next_size);
assign cross_4k = ({1'b0, next_bytes} + {4'b0, src_axi_addr[11:0]}) >= 16'd4096;
assign bytes_to_4k = cross_4k ? (15'd4096 - {3'b0, src_axi_addr[11:0]}) : next_bytes;
// Stage 2 combinatorial
assign num_bytes_comb = ({{17{1'b0}}, bytes_to_4k_r} <= pending_bytes)
? bytes_to_4k_r
: pending_bytes[14:0];
assign next_len_o = compute_next_len(num_bytes_comb, next_size);
// -------------------------------------------------------------------------
// Transfer-state registers
// -------------------------------------------------------------------------
logic [31:0] transfer_len;
logic [31:0] copied_bytes;
logic [7:0] next_arlen;
logic [7:0] next_awlen;
logic [14:0] burst_actual_bytes;
always_ff @(posedge clk or negedge rst_n)
if (!rst_n) begin
src_axi_addr <= '0;
dst_axi_addr <= '0;
next_arlen <= '0;
next_awlen <= '0;
max_len <= '0;
next_size <= '0;
copied_bytes <= '0;
transfer_len <= '0;
bytes_to_4k_r <= '0;
burst_actual_bytes <= '0;
end else if (wr_start_edge) begin
src_axi_addr <= ctrl_src_addr;
dst_axi_addr <= ctrl_dst_addr;
next_arlen <= ctrl_len;
next_awlen <= ctrl_len;
max_len <= ctrl_len;
next_size <= ctrl_size;
copied_bytes <= '0;
transfer_len <= ctrl_transfer_len;
burst_actual_bytes <= '0;
end else if (state == PREP1) begin
bytes_to_4k_r <= bytes_to_4k;
end else if (state == PREP2) begin
next_arlen <= next_len_o;
next_awlen <= next_len_o;
burst_actual_bytes <= num_bytes_comb;
end else if (state == AR && mm2mm_arready) begin
src_axi_addr <= src_axi_addr + {{(ADDR_WIDTH-15){1'b0}}, burst_actual_bytes};
end else if (state == AW && mm2mm_awready) begin
dst_axi_addr <= dst_axi_addr + {{(ADDR_WIDTH-15){1'b0}}, burst_actual_bytes};
copied_bytes <= copied_bytes + {17'b0, burst_actual_bytes};
end
// pending_bytes — decremented in AR state after burst_actual_bytes is registered.
// TIMING FIX: reg(burst_actual_bytes) → subtractor → reg(pending_bytes)
always_ff @(posedge clk or negedge rst_n)
if (!rst_n)
pending_bytes <= '0;
else if (wr_start_edge)
pending_bytes <= ctrl_transfer_len;
else if (state == AR && mm2mm_arready)
pending_bytes <= pending_bytes - {17'b0, burst_actual_bytes};
// -------------------------------------------------------------------------
// Transfer-done flag
// -------------------------------------------------------------------------
always_ff @(posedge clk or negedge rst_n)
if (!rst_n)
transfer_done <= 1'b0;
else if (wr_start_edge || state == IDLE || state == AR ||
state == PREP1 || state == PREP2)
transfer_done <= 1'b0;
else
transfer_done <= (copied_bytes == transfer_len) && (copied_bytes != '0);
// -------------------------------------------------------------------------
// ctrl_done — single-cycle pulse when FSM transitions into IDLE
// -------------------------------------------------------------------------
always_ff @(posedge clk or negedge rst_n)
if (!rst_n) ctrl_done <= 1'b0;
else ctrl_done <= (next_state == IDLE) && (state != IDLE);
// -------------------------------------------------------------------------
// Beat counter — drives wlast
// -------------------------------------------------------------------------
logic [7:0] beat_cnt;
always_ff @(posedge clk or negedge rst_n)
if (!rst_n)
beat_cnt <= '0;
else if (state == AW && mm2mm_awready)
beat_cnt <= '0;
else if (mm2mm_wvalid && mm2mm_wready)
beat_cnt <= beat_cnt + 1'b1;
// -------------------------------------------------------------------------
// Write strobe generation for partial last beat
//
// partial_strb_mask uses a per-bit comparator loop rather than a hardcoded
// case statement, making it correct for any DATA_WIDTH (including 1024-bit).
// Synthesises as a parallel comparator tree with no barrel shifter.
// -------------------------------------------------------------------------
logic [14:0] bytes_in_burst;
logic [STRB_WIDTH-1:0] wstrb_mask;
logic [STRB_IDX_WIDTH-1:0] valid_bytes;
always_ff @(posedge clk or negedge rst_n)
if (!rst_n)
bytes_in_burst <= '0;
else if (state == AW && mm2mm_awready)
bytes_in_burst <= burst_actual_bytes;
else if (mm2mm_wvalid && mm2mm_wready)
bytes_in_burst <= (bytes_in_burst > STRB_WIDTH[14:0])
? (bytes_in_burst - STRB_WIDTH[14:0])
: '0;
assign valid_bytes = (bytes_in_burst <= STRB_WIDTH[14:0])
? bytes_in_burst[STRB_IDX_WIDTH-1:0]
: STRB_WIDTH[STRB_IDX_WIDTH-1:0];
logic [STRB_WIDTH-1:0] partial_strb_mask;
always_comb begin
for (int i = 0; i < STRB_WIDTH; i++)
partial_strb_mask[i] = (i < valid_bytes);
end
always_comb begin
if (mm2mm_wlast && (bytes_in_burst < STRB_WIDTH[14:0]) && (bytes_in_burst != '0))
wstrb_mask = partial_strb_mask;
else if (bytes_in_burst == '0)
wstrb_mask = '0;
else
wstrb_mask = {STRB_WIDTH{1'b1}};
end
// -------------------------------------------------------------------------
// FIFO — bridges R channel (read path) to W channel (write path)
//
// R-channel data is accepted only in READ state to prevent beats arriving
// before READ is entered from being silently dropped (same fix as mm2s).
// The write side drains only in WRITE state for symmetric gating.
// -------------------------------------------------------------------------
logic [DATA_WIDTH-1:0] fifo_tdata;
logic fifo_tvalid;
logic fifo_tlast;
logic fifo_tuser;
logic fifo_s_tready;
assign mm2mm_rready = fifo_s_tready && (state == READ);
snix_axis_fifo #(
.DATA_WIDTH(DATA_WIDTH),
.FIFO_DEPTH(FIFO_DEPTH)
) axis_fifo_u0 (
.clk (clk),
.rst_n (rst_n),
.s_axis_tdata (mm2mm_rdata),
.s_axis_tlast (mm2mm_rlast),
.s_axis_tuser (1'b0),
.s_axis_tvalid (mm2mm_rvalid && (state == READ)),
.s_axis_tready (fifo_s_tready),
.m_axis_tdata (fifo_tdata),
.m_axis_tlast (fifo_tlast),
.m_axis_tuser (fifo_tuser),
.m_axis_tvalid (fifo_tvalid),
.m_axis_tready (mm2mm_wready && (state == WRITE))
);
// -------------------------------------------------------------------------
// AXI output assignments — AR channel
// -------------------------------------------------------------------------
assign mm2mm_arvalid = (state == AR);
assign mm2mm_araddr = src_axi_addr;
assign mm2mm_arlen = next_arlen;
assign mm2mm_arsize = next_size;
assign mm2mm_arburst = 2'b01;
assign mm2mm_arlock = 1'b0;
assign mm2mm_arcache = 4'b0;
assign mm2mm_arprot = 3'b0;
assign mm2mm_arqos = 4'b0;
assign mm2mm_arid = '0;
assign mm2mm_aruser = '0;
// -------------------------------------------------------------------------
// AXI output assignments — AW channel
// -------------------------------------------------------------------------
assign mm2mm_awvalid = (state == AW);
assign mm2mm_awaddr = dst_axi_addr;
assign mm2mm_awlen = next_awlen;
assign mm2mm_awsize = next_size;
assign mm2mm_awburst = 2'b01;
assign mm2mm_awlock = 1'b0;
assign mm2mm_awcache = 4'b0;
assign mm2mm_awprot = 3'b0;
assign mm2mm_awqos = 4'b0;
assign mm2mm_awid = '0;
assign mm2mm_awuser = '0;
// -------------------------------------------------------------------------
// AXI output assignments — W channel
// -------------------------------------------------------------------------
generate
for (genvar i = 0; i < STRB_WIDTH; i++) begin : gen_wdata_mask
assign mm2mm_wdata[i*8 +: 8] = wstrb_mask[i] ? fifo_tdata[i*8 +: 8] : 8'h00;
end
endgenerate
assign mm2mm_wvalid = (state == WRITE) && fifo_tvalid;
assign mm2mm_wlast = (beat_cnt == next_awlen) && mm2mm_wvalid;
assign mm2mm_wstrb = wstrb_mask;
assign mm2mm_wuser = '0;
// -------------------------------------------------------------------------
// AXI output assignments — B channel
// -------------------------------------------------------------------------
assign mm2mm_bready = (state == WAIT_BRESP);
// -------------------------------------------------------------------------
// Functions (identical to s2mm / mm2s)
// -------------------------------------------------------------------------
// Returns total byte count for an AXI burst: (len+1) << size.
function automatic [14:0] compute_num_bytes(
input logic [7:0] len,
input logic [2:0] size
);
case (size)
3'b000: compute_num_bytes = {7'b0, len + 1'b1};
3'b001: compute_num_bytes = {6'b0, len + 1'b1, 1'b0};
3'b010: compute_num_bytes = {5'b0, len + 1'b1, 2'b0};
3'b011: compute_num_bytes = {4'b0, len + 1'b1, 3'b0};
3'b100: compute_num_bytes = {3'b0, len + 1'b1, 4'b0};
3'b101: compute_num_bytes = {2'b0, len + 1'b1, 5'b0};
3'b110: compute_num_bytes = {1'b0, len + 1'b1, 6'b0};
3'b111: compute_num_bytes = { len + 1'b1, 7'b0};
endcase
endfunction
// Returns arlen/awlen = ceil(bytes / beat_size) - 1.
function automatic [7:0] compute_next_len(
input logic [14:0] bytes_i,
input logic [2:0] size
);
logic [14:0] num_beats;
case (size)
3'b000: num_beats = bytes_i;
3'b001: num_beats = (bytes_i + 15'd1) >> 1;
3'b010: num_beats = (bytes_i + 15'd3) >> 2;
3'b011: num_beats = (bytes_i + 15'd7) >> 3;
3'b100: num_beats = (bytes_i + 15'd15) >> 4;
3'b101: num_beats = (bytes_i + 15'd31) >> 5;
3'b110: num_beats = (bytes_i + 15'd63) >> 6;
3'b111: num_beats = (bytes_i + 15'd127) >> 7;
endcase
if (num_beats == 15'd0)
compute_next_len = 8'd0;
else
compute_next_len = num_beats[7:0] - 8'd1;
endfunction
endmodule : snix_axi_mm2mm