// File: ciciec2026_loongson/rtl/ip/cdma/snix_axil_cdma_mux.sv
// (Repository viewer metadata: 378 lines, 16 KiB, SystemVerilog)

// ============================================================================
// snix_axil_cdma_mux.sv
// Multi-Channel AXI-Lite MUX Wrapper for snix_axi_mm2mm engine
//
// Register Map (one bank per channel; banks are spaced 0x40 bytes apart,
// i.e. channel N starts at byte offset N * 0x40):
// 0x00: READ_ADDR (Source Address)
// 0x04: WRITE_ADDR (Destination Address)
// 0x08: LENGTH (Transfer length in bytes)
// 0x0C: TAG (User tag, purely for software tracking)
// 0x10: CTRL
// [0] = Start trigger (write 1 to queue the channel)
// [5:3] = AXI AxSIZE (0=1B, 1=2B, 2=4B, 3=8B...)
// [13:6] = AXI AxLEN (0=1 beat, 15=16 beats...)
// 0x14: STATUS (Bit 0: Busy RO, Bit 1: Done W1C)
// ============================================================================
`timescale 1ns / 1ps
// ----------------------------------------------------------------------------
// snix_axil_cdma_mux
//
// Exposes PORTS independent descriptor banks (source address, destination
// address, byte length, tag, control, status) over one AXI4-Lite slave port
// and time-multiplexes a single snix_axi_mm2mm engine between them with a
// round-robin arbiter.
//
// Parameters
//   ADDR_WIDTH / DATA_WIDTH / ID_WIDTH / USER_WIDTH / FIFO_DEPTH
//                      : forwarded unchanged to snix_axi_mm2mm.
//   AXIL_ADDR_WIDTH    : width of the AXI4-Lite address buses.
//   AXIL_DATA_WIDTH    : width of the AXI4-Lite data buses.
//   PORTS              : number of descriptor banks (channels), >= 1.
//
// NOTE(review): the register file goes through the 32-bit apply_wstrb helper,
// so this wrapper appears to assume AXIL_DATA_WIDTH == 32 and ADDR_WIDTH <= 32.
// Confirm before instantiating with other widths.
//
// Reset is synchronous and active low (rst_n is only sampled on posedge clk).
// ----------------------------------------------------------------------------
module snix_axil_cdma_mux #(
    parameter int ADDR_WIDTH      = 32,
    parameter int DATA_WIDTH      = 32, // Matches engine default
    parameter int AXIL_ADDR_WIDTH = 32,
    parameter int AXIL_DATA_WIDTH = 32,
    parameter int ID_WIDTH        = 4,
    parameter int USER_WIDTH      = 1,
    parameter int PORTS           = 8,
    parameter int FIFO_DEPTH      = 16
) (
    input  logic                         clk,
    input  logic                         rst_n,
    // ==========================================
    // AXI-Lite Slave Interface (CPU CSR Access)
    // ==========================================
    input  logic [AXIL_ADDR_WIDTH-1:0]   s_axil_awaddr,
    input  logic                         s_axil_awvalid,
    output logic                         s_axil_awready,
    input  logic [AXIL_DATA_WIDTH-1:0]   s_axil_wdata,
    input  logic [AXIL_DATA_WIDTH/8-1:0] s_axil_wstrb,
    input  logic                         s_axil_wvalid,
    output logic                         s_axil_wready,
    output logic [1:0]                   s_axil_bresp,
    output logic                         s_axil_bvalid,
    input  logic                         s_axil_bready,
    input  logic [AXIL_ADDR_WIDTH-1:0]   s_axil_araddr,
    input  logic                         s_axil_arvalid,
    output logic                         s_axil_arready,
    output logic [AXIL_DATA_WIDTH-1:0]   s_axil_rdata,
    output logic [1:0]                   s_axil_rresp,
    output logic                         s_axil_rvalid,
    input  logic                         s_axil_rready,
    // ==========================================
    // AXI4 Master Interface (To Crossbar/Memory)
    // ==========================================
    output logic [ID_WIDTH-1:0]          mm2mm_awid,
    output logic [ADDR_WIDTH-1:0]        mm2mm_awaddr,
    output logic [7:0]                   mm2mm_awlen,
    output logic [2:0]                   mm2mm_awsize,
    output logic [1:0]                   mm2mm_awburst,
    output logic                         mm2mm_awlock,
    output logic [3:0]                   mm2mm_awcache,
    output logic [2:0]                   mm2mm_awprot,
    output logic [3:0]                   mm2mm_awqos,
    output logic [USER_WIDTH-1:0]        mm2mm_awuser,
    output logic                         mm2mm_awvalid,
    input  logic                         mm2mm_awready,
    output logic [DATA_WIDTH-1:0]        mm2mm_wdata,
    output logic [DATA_WIDTH/8-1:0]      mm2mm_wstrb,
    output logic                         mm2mm_wlast,
    output logic [USER_WIDTH-1:0]        mm2mm_wuser,
    output logic                         mm2mm_wvalid,
    input  logic                         mm2mm_wready,
    input  logic [ID_WIDTH-1:0]          mm2mm_bid,
    input  logic [1:0]                   mm2mm_bresp,
    input  logic [USER_WIDTH-1:0]        mm2mm_buser,
    input  logic                         mm2mm_bvalid,
    output logic                         mm2mm_bready,
    output logic [ID_WIDTH-1:0]          mm2mm_arid,
    output logic [ADDR_WIDTH-1:0]        mm2mm_araddr,
    output logic [7:0]                   mm2mm_arlen,
    output logic [2:0]                   mm2mm_arsize,
    output logic [1:0]                   mm2mm_arburst,
    output logic                         mm2mm_arlock,
    output logic [3:0]                   mm2mm_arcache,
    output logic [2:0]                   mm2mm_arprot,
    output logic [3:0]                   mm2mm_arqos,
    output logic [USER_WIDTH-1:0]        mm2mm_aruser,
    output logic                         mm2mm_arvalid,
    input  logic                         mm2mm_arready,
    input  logic [ID_WIDTH-1:0]          mm2mm_rid,
    input  logic [DATA_WIDTH-1:0]        mm2mm_rdata,
    input  logic [1:0]                   mm2mm_rresp,
    input  logic                         mm2mm_rlast,
    input  logic [USER_WIDTH-1:0]        mm2mm_ruser,
    input  logic                         mm2mm_rvalid,
    output logic                         mm2mm_rready,
    // Global Interrupt (OR'd from all channels)
    output logic                         dma_finish
);

    // ==========================================
    // Local Parameters & Utilities
    // ==========================================
    // Width of a channel index. Clamped to a minimum of 1 because $clog2(1)
    // is 0, which would otherwise produce zero-width part-selects and
    // [-1:0] vectors when PORTS == 1.
    localparam int CH_BITS = (PORTS > 1) ? $clog2(PORTS) : 1;

    // PORTS and PORTS-1 pre-sized to the widths the arbiter compares against,
    // so the wrap-around arithmetic below involves no implicit truncation.
    localparam logic [CH_BITS:0]   PORTS_EXT = (CH_BITS + 1)'(PORTS);
    localparam logic [CH_BITS-1:0] LAST_CH   = CH_BITS'(PORTS - 1);

    // Byte-lane merge for a 32-bit register: each byte of the result comes
    // from new_val when its wstrb bit is set, otherwise from old_val.
    function automatic logic [31:0] apply_wstrb(
        input logic [31:0] old_val,
        input logic [31:0] new_val,
        input logic [3:0]  wstrb
    );
        logic [31:0] res;
        res[7:0]   = wstrb[0] ? new_val[7:0]   : old_val[7:0];
        res[15:8]  = wstrb[1] ? new_val[15:8]  : old_val[15:8];
        res[23:16] = wstrb[2] ? new_val[23:16] : old_val[23:16];
        res[31:24] = wstrb[3] ? new_val[31:24] : old_val[31:24];
        return res;
    endfunction

    // ==========================================
    // Internal Registers (Per Channel)
    // ==========================================
    logic [ADDR_WIDTH-1:0] ch_src_addr [PORTS];
    logic [ADDR_WIDTH-1:0] ch_dst_addr [PORTS];
    logic [31:0]           ch_len      [PORTS];
    logic [31:0]           ch_tag      [PORTS];
    logic [31:0]           ch_ctrl     [PORTS];
    logic [PORTS-1:0]      ch_req;        // Pending requests (Busy)
    logic [PORTS-1:0]      ch_done;       // Completion flags
    logic [PORTS-1:0]      arb_set_done;  // One-cycle strobe from arbiter to CSR

    // ==========================================
    // Address Decoding (0x40 stride per channel)
    // ==========================================
    // Bits [5:0] select the register inside a bank; the CH_BITS bits above
    // them select the bank. Address bits above that are not decoded.
    wire [CH_BITS-1:0] wr_ch  = s_axil_awaddr[6 +: CH_BITS];
    wire [5:0]         wr_reg = s_axil_awaddr[5:0];
    wire [CH_BITS-1:0] rd_ch  = s_axil_araddr[6 +: CH_BITS];
    wire [5:0]         rd_reg = s_axil_araddr[5:0];

    // ==========================================
    // AXI-Lite Slave Logic
    // ==========================================
    // Every access is answered with response code 2'b00.
    assign s_axil_bresp = 2'b00;
    assign s_axil_rresp = 2'b00;

    // Write address/data handshake: AWREADY and WREADY are raised together
    // for a single cycle, and only once both AWVALID and WVALID are present
    // and the previous write response has been (or is being) accepted.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            s_axil_awready <= 1'b0;
            s_axil_wready  <= 1'b0;
        end else begin
            if (s_axil_awvalid && s_axil_wvalid && !s_axil_awready &&
                (!s_axil_bvalid || s_axil_bready)) begin
                s_axil_awready <= 1'b1;
                s_axil_wready  <= 1'b1;
            end else begin
                s_axil_awready <= 1'b0;
                s_axil_wready  <= 1'b0;
            end
        end
    end

    // High in the cycle where both write channels complete their handshake.
    wire do_write = s_axil_awready && s_axil_awvalid && s_axil_wready && s_axil_wvalid;

    // Write response: raised after an accepted write, held until BREADY.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            s_axil_bvalid <= 1'b0;
        end else begin
            if (do_write) begin
                s_axil_bvalid <= 1'b1;
            end else if (s_axil_bready && s_axil_bvalid) begin
                s_axil_bvalid <= 1'b0;
            end
        end
    end

    // Read address handshake: ARREADY pulses for one cycle when a request is
    // present and the read data channel is free (or being drained).
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            s_axil_arready <= 1'b0;
        end else begin
            if (s_axil_arvalid && !s_axil_arready &&
                (!s_axil_rvalid || s_axil_rready)) begin
                s_axil_arready <= 1'b1;
            end else begin
                s_axil_arready <= 1'b0;
            end
        end
    end

    // High in the cycle where the read address handshake completes.
    wire do_read = s_axil_arready && s_axil_arvalid;

    // Read data: captured at the address handshake, held until RREADY.
    // Unmapped register offsets and out-of-range channels read as zero.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            s_axil_rvalid <= 1'b0;
            s_axil_rdata  <= '0;
        end else begin
            if (do_read) begin
                s_axil_rvalid <= 1'b1;
                if (rd_ch < PORTS) begin
                    case (rd_reg)
                        6'h00:   s_axil_rdata <= ch_src_addr[rd_ch];
                        6'h04:   s_axil_rdata <= ch_dst_addr[rd_ch];
                        6'h08:   s_axil_rdata <= ch_len[rd_ch];
                        6'h0C:   s_axil_rdata <= ch_tag[rd_ch];
                        6'h10:   s_axil_rdata <= ch_ctrl[rd_ch];
                        6'h14:   s_axil_rdata <= {30'd0, ch_done[rd_ch], ch_req[rd_ch]};
                        default: s_axil_rdata <= 32'd0;
                    endcase
                end else begin
                    s_axil_rdata <= 32'd0; // Out of bounds
                end
            end else if (s_axil_rready && s_axil_rvalid) begin
                s_axil_rvalid <= 1'b0;
            end
        end
    end

    // ==========================================
    // Register File Write Logic
    // ==========================================
    assign dma_finish = |ch_done;

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            ch_req  <= '0;
            ch_done <= '0;
            for (int i = 0; i < PORTS; i++) begin
                ch_src_addr[i] <= '0;
                ch_dst_addr[i] <= '0;
                ch_len[i]      <= '0;
                ch_tag[i]      <= '0;
                ch_ctrl[i]     <= '0;
            end
        end else begin
            // 1. Hardware completion: drop Busy, raise Done.
            for (int i = 0; i < PORTS; i++) begin
                if (arb_set_done[i]) begin
                    ch_req[i]  <= 1'b0;
                    ch_done[i] <= 1'b1;
                end
            end

            // 2. CPU writes. Every configuration register, CTRL included, is
            //    locked while the channel is busy: the engine inputs are
            //    driven combinationally from these registers, so changing
            //    them mid-transfer would alter a transfer in flight.
            if (do_write && wr_ch < PORTS) begin
                case (wr_reg)
                    6'h00: if (!ch_req[wr_ch])
                        ch_src_addr[wr_ch] <= apply_wstrb(ch_src_addr[wr_ch], s_axil_wdata, s_axil_wstrb);
                    6'h04: if (!ch_req[wr_ch])
                        ch_dst_addr[wr_ch] <= apply_wstrb(ch_dst_addr[wr_ch], s_axil_wdata, s_axil_wstrb);
                    6'h08: if (!ch_req[wr_ch])
                        ch_len[wr_ch]      <= apply_wstrb(ch_len[wr_ch], s_axil_wdata, s_axil_wstrb);
                    6'h0C: if (!ch_req[wr_ch])
                        ch_tag[wr_ch]      <= apply_wstrb(ch_tag[wr_ch], s_axil_wdata, s_axil_wstrb);
                    6'h10: if (!ch_req[wr_ch]) begin
                        ch_ctrl[wr_ch] <= apply_wstrb(ch_ctrl[wr_ch], s_axil_wdata, s_axil_wstrb);
                        // Trigger: bit 0 written as 1 with its byte lane enabled.
                        if (s_axil_wstrb[0] && s_axil_wdata[0]) begin
                            ch_req[wr_ch] <= 1'b1;
                            // A new start clears a stale Done flag, unless
                            // hardware is setting it in this very cycle.
                            if (!arb_set_done[wr_ch]) ch_done[wr_ch] <= 1'b0;
                        end
                    end
                    6'h14: begin
                        // Software W1C for Done (bit 1). Hardware set wins
                        // when both happen in the same cycle.
                        if (s_axil_wstrb[0] && s_axil_wdata[1]) begin
                            if (!arb_set_done[wr_ch]) ch_done[wr_ch] <= 1'b0;
                        end
                    end
                    default: ; // Unmapped offsets: write ignored
                endcase
            end
        end
    end

    // ==========================================
    // Round-Robin Arbiter & Engine Driver
    // ==========================================
    typedef enum logic [1:0] {IDLE, RUN} state_t;
    state_t state;
    logic [CH_BITS-1:0] cur_ch;  // Channel currently owning the engine
    logic [CH_BITS-1:0] rr_ptr;  // Channel that has top priority next

    // Interfaces to Engine
    logic                  engine_start;
    logic [ADDR_WIDTH-1:0] engine_src;
    logic [ADDR_WIDTH-1:0] engine_dst;
    logic [31:0]           engine_bytes;
    logic [7:0]            engine_len;
    logic [2:0]            engine_size;
    logic                  engine_done;

    // The engine always sees the descriptor of the channel selected by cur_ch.
    assign engine_src   = ch_src_addr[cur_ch];
    assign engine_dst   = ch_dst_addr[cur_ch];
    assign engine_bytes = ch_len[cur_ch];
    assign engine_len   = ch_ctrl[cur_ch][13:6];
    assign engine_size  = ch_ctrl[cur_ch][5:3];

    // Combinational grant search.
    // Scans the PORTS channels starting at rr_ptr (wrapping around) and picks
    // the first one that is requesting and is not being retired this cycle.
    // The loop runs from the lowest-priority offset to the highest so that
    // the last assignment, and therefore the result, belongs to the channel
    // closest to rr_ptr.
    logic               grant_valid;
    logic [CH_BITS-1:0] grant_ch;
    logic [CH_BITS:0]   probe_ext;  // rr_ptr + offset, one bit wider for the carry
    logic [CH_BITS-1:0] probe_ch;   // probe_ext reduced modulo PORTS

    always_comb begin
        grant_valid = 1'b0;
        grant_ch    = '0;
        probe_ext   = '0;
        probe_ch    = '0;
        for (int i = PORTS - 1; i >= 0; i--) begin
            probe_ext = {1'b0, rr_ptr} + i[CH_BITS:0];
            // Both operands are below PORTS, so one conditional subtraction
            // is enough to wrap; no modulo operator is needed.
            if (probe_ext >= PORTS_EXT) begin
                probe_ext = probe_ext - PORTS_EXT;
            end
            probe_ch = probe_ext[CH_BITS-1:0];
            if (ch_req[probe_ch] && !arb_set_done[probe_ch]) begin
                grant_valid = 1'b1;
                grant_ch    = probe_ch;
            end
        end
    end

    // Arbiter FSM.
    //   IDLE: on a grant, latch the channel, advance the round-robin pointer
    //         past it and pulse engine_start for one cycle.
    //   RUN : wait for engine_done, then pulse arb_set_done for that channel.
    // NOTE(review): RUN samples engine_done from its first cycle on. If
    // ctrl_done of snix_axi_mm2mm is a level that can still be high from the
    // previous transfer, a channel would be retired immediately - confirm
    // against the engine implementation.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            state        <= IDLE;
            engine_start <= 1'b0;
            rr_ptr       <= '0;
            cur_ch       <= '0;
            arb_set_done <= '0;
        end else begin
            // Strobes default low; the branches below raise them for one cycle.
            arb_set_done <= '0;
            engine_start <= 1'b0;
            case (state)
                IDLE: begin
                    if (grant_valid) begin
                        cur_ch       <= grant_ch;
                        rr_ptr       <= (grant_ch == LAST_CH) ? '0 : (grant_ch + 1'b1);
                        engine_start <= 1'b1;
                        state        <= RUN;
                    end
                end
                RUN: begin
                    if (engine_done) begin
                        arb_set_done[cur_ch] <= 1'b1;
                        state                <= IDLE;
                    end
                    // Optional: Add a watchdog timeout counter here if dealing with untrusted PCIe/AXI endpoints
                end
                // The state register is 2 bits wide but only two encodings
                // are used; recover to IDLE from any other value.
                default: state <= IDLE;
            endcase
        end
    end

    // ==========================================
    // Instantiate The Original Core Engine
    // ==========================================
    snix_axi_mm2mm #(
        .ADDR_WIDTH(ADDR_WIDTH),
        .DATA_WIDTH(DATA_WIDTH),
        .ID_WIDTH  (ID_WIDTH),
        .USER_WIDTH(USER_WIDTH),
        .FIFO_DEPTH(FIFO_DEPTH)
    ) u_core_engine (
        .clk               (clk),
        .rst_n             (rst_n),
        .ctrl_start        (engine_start),
        .ctrl_stop         (1'b0), // Tied off; can be wired if global abort is needed
        .ctrl_src_addr     (engine_src),
        .ctrl_dst_addr     (engine_dst),
        .ctrl_len          (engine_len),
        .ctrl_size         (engine_size),
        .ctrl_transfer_len (engine_bytes),
        .ctrl_done         (engine_done),
        // AXI4 Port Connections
        .mm2mm_awid   (mm2mm_awid),    .mm2mm_awaddr (mm2mm_awaddr),
        .mm2mm_awlen  (mm2mm_awlen),   .mm2mm_awsize (mm2mm_awsize),
        .mm2mm_awburst(mm2mm_awburst), .mm2mm_awlock (mm2mm_awlock),
        .mm2mm_awcache(mm2mm_awcache), .mm2mm_awprot (mm2mm_awprot),
        .mm2mm_awqos  (mm2mm_awqos),   .mm2mm_awuser (mm2mm_awuser),
        .mm2mm_awvalid(mm2mm_awvalid), .mm2mm_awready(mm2mm_awready),
        .mm2mm_wdata  (mm2mm_wdata),   .mm2mm_wstrb  (mm2mm_wstrb),
        .mm2mm_wlast  (mm2mm_wlast),   .mm2mm_wuser  (mm2mm_wuser),
        .mm2mm_wvalid (mm2mm_wvalid),  .mm2mm_wready (mm2mm_wready),
        .mm2mm_bid    (mm2mm_bid),     .mm2mm_bresp  (mm2mm_bresp),
        .mm2mm_buser  (mm2mm_buser),   .mm2mm_bvalid (mm2mm_bvalid),
        .mm2mm_bready (mm2mm_bready),
        .mm2mm_arid   (mm2mm_arid),    .mm2mm_araddr (mm2mm_araddr),
        .mm2mm_arlen  (mm2mm_arlen),   .mm2mm_arsize (mm2mm_arsize),
        .mm2mm_arburst(mm2mm_arburst), .mm2mm_arlock (mm2mm_arlock),
        .mm2mm_arcache(mm2mm_arcache), .mm2mm_arprot (mm2mm_arprot),
        .mm2mm_arqos  (mm2mm_arqos),   .mm2mm_aruser (mm2mm_aruser),
        .mm2mm_arvalid(mm2mm_arvalid), .mm2mm_arready(mm2mm_arready),
        .mm2mm_rid    (mm2mm_rid),     .mm2mm_rdata  (mm2mm_rdata),
        .mm2mm_rresp  (mm2mm_rresp),   .mm2mm_rlast  (mm2mm_rlast),
        .mm2mm_ruser  (mm2mm_ruser),   .mm2mm_rvalid (mm2mm_rvalid),
        .mm2mm_rready (mm2mm_rready)
    );

endmodule