// Verilog source (376 lines, 13 KiB)
`timescale 1ns / 1ps
|
|
|
|
module tb_matrix_multiplication;
|
|
|
|
// ---------------------------------------------------------------------------
// Testbench parameters (passed on to the memory and MMU instances)
// ---------------------------------------------------------------------------
parameter DATA_WIDTH     = 16;
parameter MEM_ROWS       = 20;  // 20 rows    -> 5 address bits
parameter MEM_COLS       = 80;  // 80 columns -> 7 address bits
parameter PE_ROWS        = 20;
parameter PE_COLS        = 20;
parameter COMMON_ROW_COL = 4;
parameter OUTPUT_COL     = 20;
parameter OUTPUT_ROW     = 20;

// Derived widths, computed once so every declaration below stays in step.
localparam ROW_ADDR_W   = $clog2(MEM_ROWS);
localparam COL_ADDR_W   = $clog2(MEM_COLS);
localparam MAX_DIM      = (MEM_ROWS > MEM_COLS) ? MEM_ROWS : MEM_COLS;
// NOTE(review): this makes the full-row bus DATA_WIDTH*(MAX_DIM-1)+1 bits wide,
// not DATA_WIDTH*MAX_DIM. Kept exactly as written; confirm it matches the port
// widths declared inside top_module_mem / matrix_multiplication_unit_new.
localparam FULL_ROW_MSB = DATA_WIDTH * (MAX_DIM - 1);

// ---------------------------------------------------------------------------
// Clock, reset and control
// ---------------------------------------------------------------------------
reg       clk;
reg       rst;
reg       enable;
reg [1:0] mode;

// ---------------------------------------------------------------------------
// Element-wise operand inputs of the MMU and their valid flags
// ---------------------------------------------------------------------------
reg [DATA_WIDTH-1:0] data_input_A;
reg [DATA_WIDTH-1:0] data_input_B;
reg                  valid_mem_input_A;
reg                  valid_mem_input_B;

// ---------------------------------------------------------------------------
// Start-address configuration (A operand, B operand, result)
// ---------------------------------------------------------------------------
reg [ROW_ADDR_W-1:0] rows_start_add_reading_A;
reg [COL_ADDR_W-1:0] cols_start_add_reading_A;
reg [ROW_ADDR_W-1:0] rows_start_add_reading_B;
reg [COL_ADDR_W-1:0] cols_start_add_reading_B;
reg [ROW_ADDR_W-1:0] rows_start_add_writing;
reg [COL_ADDR_W-1:0] cols_start_add_writing;

// ---------------------------------------------------------------------------
// Size configuration
// ---------------------------------------------------------------------------
reg [ROW_ADDR_W-1:0] rows_size_reading_A;
reg [COL_ADDR_W-1:0] cols_size_reading_A;
reg [ROW_ADDR_W-1:0] rows_size_reading_B;
reg [COL_ADDR_W-1:0] cols_size_reading_B;

// ---------------------------------------------------------------------------
// MMU outputs
// ---------------------------------------------------------------------------
wire                  done;

wire [ROW_ADDR_W-1:0] row_addr_A;
wire [COL_ADDR_W-1:0] col_addr_A;
wire [ROW_ADDR_W-1:0] row_addr_B;
wire [COL_ADDR_W-1:0] col_addr_B;
wire [ROW_ADDR_W-1:0] row_addr_out;
wire [COL_ADDR_W-1:0] col_addr_out;

wire                  read_enable_A;
wire                  read_enable_B;
wire                  write_enable_out;

wire                  read_full_row_A;
wire                  read_full_row_B;
wire                  write_full_row_1;
wire                  write_full_row_2;
wire                  write_full_row_3;
reg                   read_full_row_3;

wire [DATA_WIDTH-1:0] data_out;

// ---------------------------------------------------------------------------
// Memory-side signals. The *_3 signals are regs because they are driven from
// procedural code that multiplexes several sources onto the memory-3 port.
// ---------------------------------------------------------------------------
reg  [ROW_ADDR_W-1:0] row_addr_1;
reg  [ROW_ADDR_W-1:0] row_addr_2;
reg  [ROW_ADDR_W-1:0] row_addr_3;
reg  [ROW_ADDR_W-1:0] no_rows_used3;

reg  [COL_ADDR_W-1:0] col_addr_1;
reg  [COL_ADDR_W-1:0] col_addr_2;
reg  [COL_ADDR_W-1:0] col_addr_3;
reg  [COL_ADDR_W-1:0] no_cols_used3;

wire [DATA_WIDTH-1:0] data_out_1;
wire [DATA_WIDTH-1:0] data_out_2;
wire [DATA_WIDTH-1:0] data_in_3;
reg  [DATA_WIDTH-1:0] data_out_3;

wire                  valid_1;
wire                  valid_2;
wire                  valid_3;

reg                   write_enable_3;
reg                   read_enable_3;

wire [FULL_ROW_MSB:0] full_row_output_1;
wire [FULL_ROW_MSB:0] full_row_output_2;
wire [FULL_ROW_MSB:0] full_row_output_3;
wire [FULL_ROW_MSB:0] full_row_input_1;
wire [FULL_ROW_MSB:0] full_row_input_2;
wire [FULL_ROW_MSB:0] full_row_input_3;

reg  [FULL_ROW_MSB:0] full_row_A;
reg  [FULL_ROW_MSB:0] full_row_B;

reg                   read_full_row_or_col_from_mem;

// ---------------------------------------------------------------------------
// Bookkeeping: cycle counter and memory-dump handshake
// ---------------------------------------------------------------------------
integer cycle_count;
reg     write_back_to_file_enable;  // asks the memory to write its contents back to file
wire    done_writing_to_file;       // memory reports that the write-back has finished
|
|
// Instantiate the memory module.
// The instance exposes three memory ports. Ports 1 and 2 are parked with their
// enables tied low; port 3 is the one the matrix multiplication unit works on.
top_module_mem #(
    .ROWS1      (MEM_ROWS),
    .COLS1      (MEM_COLS),
    .ROWS2      (MEM_ROWS),
    .COLS2      (MEM_COLS),
    .ROWS3      (MEM_ROWS),
    .COLS3      (MEM_COLS),
    .DATA_WIDTH (DATA_WIDTH),
    .COLS_USED  (COMMON_ROW_COL)
) memory_inst (
    .clk(clk),

    // Memory 1 (not used)
    .row_addr_1            (row_addr_1),
    .col_addr_1            (col_addr_1),
    .write_enable_1        (1'b0),
    .read_enable_1         (1'b0),
    .data_input_1          ({DATA_WIDTH{1'b0}}),  // sized from DATA_WIDTH, not a literal 16
    .data_output_1         (data_out_1),
    .valid_1               (valid_1),
    .read_full_row_or_col1 (1'b0),
    .read_full_row_1       (1'b0),
    .no_cols_used1         (),                    // NOTE(review): left unconnected in the original
    .no_rows_used1         (),                    // NOTE(review): left unconnected in the original
    .full_row_output_1     (full_row_output_1),
    .full_row_input_1      (full_row_input_1),    // wire has no driver; harmless while the write strobe is low
    .write_full_row_1      (1'b0),                // tied low: previously an undriven (floating) wire

    // Memory 2 (not used)
    .row_addr_2            (row_addr_2),
    .col_addr_2            (col_addr_2),
    .write_enable_2        (1'b0),
    .read_enable_2         (1'b0),
    .data_input_2          ({DATA_WIDTH{1'b0}}),  // sized from DATA_WIDTH, not a literal 16
    .data_output_2         (data_out_2),
    .valid_2               (valid_2),
    .read_full_row_or_col2 (1'b0),
    .read_full_row_2       (1'b0),
    .no_cols_used2         (),                    // NOTE(review): left unconnected in the original
    .no_rows_used2         (),                    // NOTE(review): left unconnected in the original
    .full_row_output_2     (full_row_output_2),
    .full_row_input_2      (full_row_input_2),    // wire has no driver; harmless while the write strobe is low
    .write_full_row_2      (1'b0),                // tied low: previously an undriven (floating) wire

    // Memory 3 (used for matrix multiplication)
    .row_addr_3            (row_addr_3),
    .col_addr_3            (col_addr_3),
    .write_enable_3        (write_enable_3),
    .read_enable_3         (read_enable_3),
    .data_input_3          (data_out_3),
    .data_output_3         (data_in_3),
    .valid_3               (valid_3),
    .read_full_row_or_col3 (read_full_row_or_col_from_mem),  // 0: row-wise, 1: column-wise
    .read_full_row_3       (read_full_row_3),
    .no_cols_used3         (no_cols_used3),
    .no_rows_used3         (no_rows_used3),
    .full_row_output_3     (full_row_output_3),
    .full_row_input_3      (full_row_input_3),
    .write_full_row_3      (write_full_row_3),

    // Memory dump handshake
    .write_back_to_file_enable (write_back_to_file_enable),
    .done_writing_to_file      (done_writing_to_file)
);
|
|
|
|
// Device under test: the matrix multiplication unit.
// All connections are by name, so the grouping below is purely for readability.
matrix_multiplication_unit_new #(
    .DATA_WIDTH     (DATA_WIDTH),
    .MEM_ROWS       (MEM_ROWS),
    .MEM_COLS       (MEM_COLS),
    .PE_ROWS        (PE_ROWS),
    .PE_COLS        (PE_COLS),
    .COMMON_ROW_COL (COMMON_ROW_COL),
    .OUTPUT_ROW     (OUTPUT_ROW),
    .OUTPUT_COL     (OUTPUT_COL)
) mmu_inst (
    // --- clock / reset / control -------------------------------------------
    .clk    (clk),
    .rst    (rst),
    .enable (enable),
    .mode   (mode),
    .done   (done),

    // --- operand / result window configuration -----------------------------
    .rows_start_add_reading_A (rows_start_add_reading_A),
    .cols_start_add_reading_A (cols_start_add_reading_A),
    .rows_size_reading_A      (rows_size_reading_A),
    .cols_size_reading_A      (cols_size_reading_A),

    .rows_start_add_reading_B (rows_start_add_reading_B),
    .cols_start_add_reading_B (cols_start_add_reading_B),
    .rows_size_reading_B      (rows_size_reading_B),
    .cols_size_reading_B      (cols_size_reading_B),

    .rows_start_add_writing   (rows_start_add_writing),
    .cols_start_add_writing   (cols_start_add_writing),

    // --- operand A ----------------------------------------------------------
    .row_addr_A        (row_addr_A),
    .col_addr_A        (col_addr_A),
    .read_enable_A     (read_enable_A),
    .read_full_row_A   (read_full_row_A),
    .data_input_A      (data_input_A),
    .full_row_A        (full_row_A),
    .valid_mem_input_A (valid_mem_input_A),

    // --- operand B ----------------------------------------------------------
    .row_addr_B        (row_addr_B),
    .col_addr_B        (col_addr_B),
    .read_enable_B     (read_enable_B),
    .read_full_row_B   (read_full_row_B),
    .data_input_B      (data_input_B),
    .full_row_B        (full_row_B),
    .valid_mem_input_B (valid_mem_input_B),

    // --- result -------------------------------------------------------------
    .row_addr_out       (row_addr_out),
    .col_addr_out       (col_addr_out),
    .write_enable_out   (write_enable_out),
    .data_out           (data_out),
    .write_full_row_out (write_full_row_3),
    .Full_row_out       (full_row_input_3)
);
|
|
|
|
// Clock generation: starts low and toggles every 5 ns (10 ns period, 100 MHz).
initial clk = 0;

always #5 clk = ~clk;
|
|
|
|
// Memory-3 port multiplexer.
//
// Routes one requester at a time onto the shared memory-3 port, with fixed
// priority: full-row read of A, then full-row read of B, then full-row write
// of the result. The row returned by the memory is copied to the operand bus
// of whichever read is active.
//
// NOTE(review): several outputs are deliberately NOT given a default, so they
// hold their previous value on paths that do not assign them (latch
// behaviour): row_addr_3 / col_addr_3 / no_rows_used3 / no_cols_used3 when no
// requester is active, and valid_mem_input_A/B when their requester is not
// selected. A valid flag therefore stays set for as long as valid_3 stays high
// after its requester has dropped. This is preserved from the original; the
// DUT's handshake timing is not visible here, so it must be confirmed before
// it is tightened.
always @(*) begin
    // Pass-through signals that do not depend on the selected requester.
    read_full_row_3 = read_full_row_A | read_full_row_B;
    write_enable_3  = write_enable_out;
    data_out_3      = data_out;

    if (read_full_row_A) begin
        row_addr_3        = row_addr_A;
        col_addr_3        = col_addr_A;
        no_rows_used3     = rows_size_reading_A;
        no_cols_used3     = cols_size_reading_A;
        full_row_A        = full_row_output_3;
        valid_mem_input_A = valid_3;
    end else if (read_full_row_B) begin
        row_addr_3        = row_addr_B;
        col_addr_3        = col_addr_B;
        // B is stored transposed in memory, so its row/column sizes swap here.
        no_rows_used3     = cols_size_reading_B;
        no_cols_used3     = rows_size_reading_B;
        full_row_B        = full_row_output_3;
        valid_mem_input_B = valid_3;
    end else if (write_full_row_3) begin
        row_addr_3        = row_addr_out;
        col_addr_3        = col_addr_out;
        no_rows_used3     = rows_size_reading_A;
        no_cols_used3     = cols_size_reading_B;
    end

    // Select column-wise access (1) when a result row is being written and the
    // result is at least as wide as it is tall. Blocking assignment: this is a
    // combinational block, so nonblocking (<=) must not be used here.
    if (write_full_row_3 && (cols_size_reading_B >= rows_size_reading_A)) begin
        read_full_row_or_col_from_mem = 1'b1;  // 0: row-wise, 1: column-wise (default)
    end

    // Whenever the memory reports "not valid", both operand valid flags drop,
    // regardless of which requester is selected.
    if (valid_3 == 1'b0) begin
        valid_mem_input_A = 1'b0;
        valid_mem_input_B = 1'b0;
    end
end
|
|
|
|
// Cycle counter: cleared while rst is high, then advances by one on every
// rising clock edge for which the unit is enabled and has not yet raised done.
always @(posedge clk) begin
    if (rst) begin
        cycle_count <= 0;
    end else begin
        if (!(done || !enable))
            cycle_count <= cycle_count + 1;
    end
end
|
|
|
|
|
|
|
|
|
|
// Test Sequence
//
// Timeline (all delays in ns):
//   t =  0 : assert rst and park every testbench-driven input
//   t = 20 : release rst
//   t = 40 : program the operand/result windows and raise enable
//   then   : wait 20, wait for done, report the cycle count, drop enable,
//            request the memory write-back, wait 20, wait for
//            done_writing_to_file, wait 20, stop.
initial begin
    // ---- t = 0: known values on everything this testbench drives ----------
    full_row_A = 0;
    full_row_B = 0;
    rst        = 1;
    enable     = 0;

    // The write-back request is raised only after the multiplication is done;
    // hold it low until then instead of leaving it at X.
    write_back_to_file_enable = 1'b0;

    // Nothing in this testbench drives the element-wise path (the operands are
    // moved as full rows), so park those inputs instead of leaving them at X.
    read_enable_3 = 1'b0;
    data_input_A  = {DATA_WIDTH{1'b0}};
    data_input_B  = {DATA_WIDTH{1'b0}};

    // Addresses of the two idle memories.
    row_addr_1 = 0;
    col_addr_1 = 0;
    row_addr_2 = 0;
    col_addr_2 = 0;

    //mode = 2'b01; // input-Stationary mode
    //mode = 2'b10; // weight-Stationary mode
    mode = 2'b00;   // Output-Stationary mode

    // Hold reset for 20 ns, then give the design another 20 ns.
    #20;
    rst = 0;
    #20;

    // ---- configuration -----------------------------------------------------
    // Blocking assignments throughout, so the configuration is fully applied
    // before enable rises later in this same time step.
    read_full_row_or_col_from_mem = 1'b1;  // 0: row-wise, 1: column-wise (default)

    rows_start_add_reading_A = 5'b0;
    cols_start_add_reading_A = 7'b0;
    rows_start_add_reading_B = 5'b0;
    cols_start_add_reading_B = 7'd4;       // B starts at column 4
    rows_start_add_writing   = 5'b0;
    cols_start_add_writing   = 7'd12;      // result starts at column 12

    // NOTE(review): the size fields look like "count - 1" encodings (compare
    // COMMON_ROW_COL-1 below) - confirm against the DUT.
    rows_size_reading_A = 5'd15;           // earlier runs used 5'd19 and 5'd9
    cols_size_reading_A = COMMON_ROW_COL-1;
    rows_size_reading_B = COMMON_ROW_COL-1;
    cols_size_reading_B = 7'd19;           // earlier runs used 7'd9 and 7'd4; B is stored transposed

    // ---- run ---------------------------------------------------------------
    enable = 1;
    #20;

    wait (done);

    // 1261 cycles was measured with the old output-stationary implementation.
    $display("Operation completed in %0d cycles(1/2)", cycle_count);

    enable = 0;

    // ---- dump the memory back to file --------------------------------------
    write_back_to_file_enable = 1;
    #20;

    wait (done_writing_to_file);
    #20;

    // ------------------------------------------------------------------------
    // Second scenario (disabled, kept for reference). It repeats the
    // reset/configure/run sequence above with different windows:
    //
    //   rst = 1; enable = 0; mode = 2'b00;
    //   #20; rst = 0; #20;
    //   rows_start_add_reading_A = 5'b0;
    //   cols_start_add_reading_A = 7'd20;
    //   rows_start_add_reading_B = 5'b0;
    //   cols_start_add_reading_B = 7'd7;
    //   rows_start_add_writing   = 5'b0;
    //   cols_start_add_writing   = 7'd35;
    //   rows_size_reading_A      = 5'd19;
    //   cols_size_reading_A      = COMMON_ROW_COL;
    //   rows_size_reading_B      = COMMON_ROW_COL;
    //   cols_size_reading_B      = 7'd9;
    //   enable = 1; #20;
    //   wait (done);
    //   $display("Operation completed in %0d cycles(2/2)", cycle_count);
    //   enable = 0; #20;
    //
    // The original also carried commented-out sketches for wiring the
    // element-wise path (data_input_A/B from data_in_3, read_enable_3 from
    // read_enable_A / read_enable_B); that path remains unconnected.
    // ------------------------------------------------------------------------

    // Finish the simulation
    $stop;
end
|
|
|
|
endmodule
|