// AYAKA_Transformer/rtl/hdpe_unit_tb.v
//
// 376 lines
// 13 KiB
// Verilog

`timescale 1ns / 1ps
module tb_matrix_multiplication;

    // ------------------------------------------------------------------
    // Testbench declarations. The signals below connect a top_module_mem
    // instance to a matrix_multiplication_unit_new instance.
    // ------------------------------------------------------------------

    // Configuration forwarded to the instantiated modules.
    parameter DATA_WIDTH     = 16;
    parameter MEM_ROWS       = 20;  // needs 5 address bits
    parameter MEM_COLS       = 80;  // needs 7 address bits
    parameter PE_ROWS        = 20;
    parameter PE_COLS        = 20;
    parameter COMMON_ROW_COL = 4;
    parameter OUTPUT_COL     = 20;
    parameter OUTPUT_ROW     = 20;

    // Derived widths, named once instead of repeating the expressions.
    localparam ROW_ADDR_W   = $clog2(MEM_ROWS);
    localparam COL_ADDR_W   = $clog2(MEM_COLS);
    // MSB index of the full-row buses.
    // NOTE(review): this gives DATA_WIDTH*(max(ROWS,COLS)-1)+1 bits, not
    // DATA_WIDTH*max(ROWS,COLS) bits -- confirm against the port widths of
    // the instantiated modules before changing it.
    localparam FULL_ROW_MSB = DATA_WIDTH * ((MEM_ROWS > MEM_COLS) ? MEM_ROWS - 1 : MEM_COLS - 1);

    // Clock, reset and run control.
    reg       clk;
    reg       rst;
    reg       enable;
    reg [1:0] mode;

    // Element-wise operand inputs of the multiplication unit and their valid flags.
    reg [DATA_WIDTH-1:0] data_input_A;
    reg [DATA_WIDTH-1:0] data_input_B;
    reg                  valid_mem_input_A;
    reg                  valid_mem_input_B;

    // Start addresses for reading A, reading B and writing the result.
    reg [ROW_ADDR_W-1:0] rows_start_add_reading_A;
    reg [COL_ADDR_W-1:0] cols_start_add_reading_A;
    reg [ROW_ADDR_W-1:0] rows_start_add_reading_B;
    reg [COL_ADDR_W-1:0] cols_start_add_reading_B;
    reg [ROW_ADDR_W-1:0] rows_start_add_writing;
    reg [COL_ADDR_W-1:0] cols_start_add_writing;

    // Operand size configuration.
    reg [ROW_ADDR_W-1:0] rows_size_reading_A;
    reg [COL_ADDR_W-1:0] cols_size_reading_A;
    reg [ROW_ADDR_W-1:0] rows_size_reading_B;
    reg [COL_ADDR_W-1:0] cols_size_reading_B;

    // Outputs of the multiplication unit.
    wire                  done;
    wire [ROW_ADDR_W-1:0] row_addr_A;
    wire [COL_ADDR_W-1:0] col_addr_A;
    wire [ROW_ADDR_W-1:0] row_addr_B;
    wire [COL_ADDR_W-1:0] col_addr_B;
    wire [ROW_ADDR_W-1:0] row_addr_out;
    wire [COL_ADDR_W-1:0] col_addr_out;
    wire                  read_enable_A;
    wire                  read_enable_B;
    wire                  write_enable_out;
    wire [DATA_WIDTH-1:0] data_out;

    // Full-row request strobes.
    wire read_full_row_A, read_full_row_B;
    wire write_full_row_1, write_full_row_2, write_full_row_3;
    reg  read_full_row_3;

    // Memory-side ports. Memories 1 and 2 are not exercised by this test.
    // The memory-3 signals are regs because they are assigned procedurally.
    reg  [ROW_ADDR_W-1:0] row_addr_1, row_addr_2, row_addr_3, no_rows_used3;
    reg  [COL_ADDR_W-1:0] col_addr_1, col_addr_2, col_addr_3, no_cols_used3;
    wire [DATA_WIDTH-1:0] data_out_1, data_out_2, data_in_3;
    reg  [DATA_WIDTH-1:0] data_out_3;
    wire                  valid_1, valid_2, valid_3;
    reg                   write_enable_3, read_enable_3;

    // Full-row data buses.
    wire [FULL_ROW_MSB:0] full_row_output_1, full_row_output_2, full_row_output_3;
    wire [FULL_ROW_MSB:0] full_row_input_1, full_row_input_2, full_row_input_3;
    reg  [FULL_ROW_MSB:0] full_row_A, full_row_B;

    // Row/column selector for memory-3 full reads (0 = row-wise, 1 = column-wise).
    reg read_full_row_or_col_from_mem;

    // Cycles counted while enable is high and done is low.
    integer cycle_count;

    // Handshake for dumping the memory contents back to a file.
    reg  write_back_to_file_enable;
    wire done_writing_to_file;
// Instantiate the memory module
// Memory under test. Only memory 3 is exercised; memories 1 and 2 are parked
// with their enables and write strobes tied low.
top_module_mem #(
    .ROWS1(MEM_ROWS),
    .COLS1(MEM_COLS),
    .ROWS2(MEM_ROWS),
    .COLS2(MEM_COLS),
    .ROWS3(MEM_ROWS),
    .COLS3(MEM_COLS),
    .DATA_WIDTH(DATA_WIDTH),
    .COLS_USED(COMMON_ROW_COL)
) memory_inst (
    .clk(clk),

    // Memory 1 (not used). row_addr_1/col_addr_1 are never assigned by the
    // testbench; that is harmless only because both enables are tied low.
    .row_addr_1(row_addr_1),
    .col_addr_1(col_addr_1),
    .write_enable_1(1'b0),
    .read_enable_1(1'b0),
    .data_input_1({DATA_WIDTH{1'b0}}),   // sized from DATA_WIDTH instead of a fixed 16'd0
    .data_output_1(data_out_1),
    .valid_1(valid_1),
    .read_full_row_or_col1(1'b0),
    .read_full_row_1(1'b0),
    .no_cols_used1(),
    .no_rows_used1(),
    .full_row_output_1(full_row_output_1),
    .full_row_input_1(full_row_input_1),
    // Tied low: the write_full_row_1 wire has no driver, so the strobe would float.
    // NOTE(review): assumes this port is an input like write_full_row_3 -- confirm.
    .write_full_row_1(1'b0),

    // Memory 2 (not used). Same parking scheme as memory 1.
    .row_addr_2(row_addr_2),
    .col_addr_2(col_addr_2),
    .write_enable_2(1'b0),
    .read_enable_2(1'b0),
    .data_input_2({DATA_WIDTH{1'b0}}),
    .data_output_2(data_out_2),
    .valid_2(valid_2),
    .read_full_row_or_col2(1'b0),
    .read_full_row_2(1'b0),
    .no_cols_used2(),
    .no_rows_used2(),
    .full_row_output_2(full_row_output_2),
    .full_row_input_2(full_row_input_2),
    .write_full_row_2(1'b0),

    // Memory 3 (used for matrix multiplication).
    .row_addr_3(row_addr_3),
    .col_addr_3(col_addr_3),
    .write_enable_3(write_enable_3),
    .read_enable_3(read_enable_3),
    .data_input_3(data_out_3),
    .data_output_3(data_in_3),
    .valid_3(valid_3),
    .read_full_row_or_col3(read_full_row_or_col_from_mem), // 1 selects a full-column read
    .read_full_row_3(read_full_row_3),
    .no_cols_used3(no_cols_used3),
    .no_rows_used3(no_rows_used3),
    .full_row_output_3(full_row_output_3),
    .full_row_input_3(full_row_input_3),
    .write_full_row_3(write_full_row_3),

    // Memory dump handshake.
    .write_back_to_file_enable(write_back_to_file_enable),
    .done_writing_to_file(done_writing_to_file)
);
// Instantiate the matrix multiplication unit
// Device under test: the matrix multiplication unit.
matrix_multiplication_unit_new #(
    .DATA_WIDTH     (DATA_WIDTH),
    .MEM_ROWS       (MEM_ROWS),
    .MEM_COLS       (MEM_COLS),
    .PE_ROWS        (PE_ROWS),
    .PE_COLS        (PE_COLS),
    .COMMON_ROW_COL (COMMON_ROW_COL),
    .OUTPUT_COL     (OUTPUT_COL),
    .OUTPUT_ROW     (OUTPUT_ROW)
) mmu_inst (
    // Clock, reset and run control
    .clk                      (clk),
    .rst                      (rst),
    .enable                   (enable),
    .mode                     (mode),

    // Element-wise operand data and valid flags
    .data_input_A             (data_input_A),
    .data_input_B             (data_input_B),
    .valid_mem_input_A        (valid_mem_input_A),
    .valid_mem_input_B        (valid_mem_input_B),

    // Full-row operand data
    .full_row_A               (full_row_A),
    .full_row_B               (full_row_B),

    // Start-address configuration
    .rows_start_add_reading_A (rows_start_add_reading_A),
    .cols_start_add_reading_A (cols_start_add_reading_A),
    .rows_start_add_reading_B (rows_start_add_reading_B),
    .cols_start_add_reading_B (cols_start_add_reading_B),
    .rows_start_add_writing   (rows_start_add_writing),
    .cols_start_add_writing   (cols_start_add_writing),

    // Size configuration
    .rows_size_reading_A      (rows_size_reading_A),
    .cols_size_reading_A      (cols_size_reading_A),
    .rows_size_reading_B      (rows_size_reading_B),
    .cols_size_reading_B      (cols_size_reading_B),

    // Outputs: completion flag
    .done                     (done),

    // Outputs: addresses for A, B and the result
    .row_addr_A               (row_addr_A),
    .col_addr_A               (col_addr_A),
    .row_addr_B               (row_addr_B),
    .col_addr_B               (col_addr_B),
    .row_addr_out             (row_addr_out),
    .col_addr_out             (col_addr_out),

    // Outputs: element-wise strobes and result data
    .read_enable_A            (read_enable_A),
    .read_enable_B            (read_enable_B),
    .write_enable_out         (write_enable_out),
    .data_out                 (data_out),

    // Outputs: full-row strobes and result row (wired to memory 3)
    .read_full_row_A          (read_full_row_A),
    .read_full_row_B          (read_full_row_B),
    .write_full_row_out       (write_full_row_3),
    .Full_row_out             (full_row_input_3)
);
// Clock Generation
// Free-running clock: starts low and toggles every half period
// (10 ns period, i.e. 100 MHz with the 1 ns timescale).
localparam CLK_HALF_PERIOD = 5;
initial begin
    clk = 0;
    forever begin
        #(CLK_HALF_PERIOD) clk = ~clk;
    end
end
// Memory-3 port multiplexer.
// Routes the address and size ports of memory 3 to whichever request of the
// multiplication unit is active (priority: read A, then read B, then the
// full-row write-back) and hands the returned row and valid flag to the
// matching operand.
// When no request is active the address, size and full_row_A/B regs are not
// reassigned and therefore hold their previous values. This is deliberate
// latch-like behaviour and is kept as in the original code.
// Only blocking assignments are used, as is conventional for a combinational
// block.
always @(*) begin
    if (read_full_row_A) begin
        row_addr_3        = row_addr_A;
        col_addr_3        = col_addr_A;
        no_rows_used3     = rows_size_reading_A;
        no_cols_used3     = cols_size_reading_A;
        // The row is forwarded regardless of valid_3; the former
        // "if (valid_3)" re-assignment of the same value was dead code.
        full_row_A        = full_row_output_3;
        valid_mem_input_A = valid_3;
    end else if (read_full_row_B) begin
        row_addr_3        = row_addr_B;
        col_addr_3        = col_addr_B;
        // Row/column sizes are swapped because B is stored transposed in memory.
        no_rows_used3     = cols_size_reading_B;
        no_cols_used3     = rows_size_reading_B;
        full_row_B        = full_row_output_3;
        valid_mem_input_B = valid_3;
    end else if (write_full_row_3) begin
        row_addr_3    = row_addr_out;
        col_addr_3    = col_addr_out;
        no_rows_used3 = rows_size_reading_A;
        no_cols_used3 = cols_size_reading_B;
    end

    // Select column-wise access (1) for the write-back when the result has at
    // least as many columns as rows. The selector is never cleared here; the
    // test sequence also assigns it.
    if ((write_full_row_3 == 1) && (cols_size_reading_B >= rows_size_reading_A)) begin
        read_full_row_or_col_from_mem = 1'b1;
    end

    // With no valid data from the memory, both operand valid flags are dropped.
    if (valid_3 == 0) begin
        valid_mem_input_A = valid_3;
        valid_mem_input_B = valid_3;
    end

    read_full_row_3 = read_full_row_A | read_full_row_B;
    // read_enable_3 previously had no driver, leaving X on the memory port.
    // It is combined the same way as the full-row strobe.
    // NOTE(review): confirm the memory tolerates read_enable_3 and
    // read_full_row_3 being asserted together, if the unit ever does that.
    read_enable_3   = read_enable_A | read_enable_B;
    write_enable_3  = write_enable_out;
    data_out_3      = data_out;
end
// Cycle counting
// Counts rising clock edges while the unit is running (enable high and done
// low); cleared on every rising edge where rst is high.
always @(posedge clk) begin : cycle_counter
    if (rst)
        cycle_count <= 0;
    else if (enable && !done)
        cycle_count <= cycle_count + 1;
end
// Test Sequence
// Test sequence: reset, configure one multiplication, run it until done,
// print the cycle count, then ask the memory to dump its contents and stop.
initial begin
    // Initialize control signals
    full_row_A = 0;
    full_row_B = 0;
    // The element-wise operand inputs are not driven anywhere else in this
    // testbench; give them a known value instead of leaving X on the ports.
    data_input_A = {DATA_WIDTH{1'b0}};
    data_input_B = {DATA_WIDTH{1'b0}};
    // Keep the dump request low until the multiplication has finished
    // (it used to be X up to that point).
    write_back_to_file_enable = 1'b0;
    rst = 1;
    enable = 0;
    //mode = 2'b01; // input-Stationary mode
    // mode = 2'b10; // weight-Stationary mode
    mode = 2'b00; // Output-Stationary mode

    // Hold reset for 20 ns (two rising clock edges)
    #20;
    rst = 0;

    // Settle time after reset deassertion
    #20;

    // Address and size configuration
    read_full_row_or_col_from_mem <= 1'b1; // 0 to read row wise & 1 to read col wise (default)
    rows_start_add_reading_A <= 5'b0;
    cols_start_add_reading_A <= 7'b0;
    rows_start_add_reading_B <= 5'b0;
    cols_start_add_reading_B <= 7'd4;  // 4 to 7
    rows_start_add_writing <= 5'b0;
    cols_start_add_writing <= 7'd12;   // 12 to 21
    rows_size_reading_A <= 5'd15;      // other values tried: 5'd19, 5'd9
    cols_size_reading_A <= COMMON_ROW_COL-1;
    rows_size_reading_B <= COMMON_ROW_COL-1;
    cols_size_reading_B <= 7'd19;      // other values tried: 7'd9, 7'd4

    // Enable the matrix multiplication
    enable = 1;
    // Wait 20 ns
    #20;
    // Wait for the operation to complete
    wait (done);
    // Print number of cycles taken
    $display("Operation completed in %0d cycles(1/2)", cycle_count); //1261 cycles-> old o/p stationary implementation
    // Disable the enable signal
    enable = 0;
    // Request the memory dump
    write_back_to_file_enable = 1;
    // Wait a few cycles to observe
    #20;
    wait(done_writing_to_file);
    // Explicit terminator: the delay used to bind to $stop across the
    // commented-out block below.
    #20;
///////////////////////////////////////////////////////////////////////////////////////////////////////////////
/* // Initialize control signals
rst = 1;
enable = 0;
//mode = 2'b01; // input-Stationary mode
// mode = 2'b10; // weight-Stationary mode
mode = 2'b00; // Output-Stationary mode
// Wait for a few clock cycles
#20;
rst = 0;
// Wait for reset deassertion
#20;
//assigning register properly
//inputs to hdpe
// data_input_A <= data_in_3;
// data_input_B <= data_in_3;
// valid_mem_input_A <= valid_3;
// valid_mem_input_B <= valid_3;
rows_start_add_reading_A <= 5'b0;
cols_start_add_reading_A <= 7'd20;
rows_start_add_reading_B <= 5'b0;
cols_start_add_reading_B <= 7'd7;//4 to 7
rows_start_add_writing <= 5'b0;
cols_start_add_writing <= 7'd35;//12 to 21
rows_size_reading_A <= 5'd19;//A-> 20X4
cols_size_reading_A <= COMMON_ROW_COL;
rows_size_reading_B <= COMMON_ROW_COL;
cols_size_reading_B <= 7'd9; //(B-> 10X4)^T
//outputs to hdpe
// row_addr_3 <= row_addr_A;
// col_addr_3 <= col_addr_A;
// row_addr_3 <= row_addr_B;
// col_addr_3 <= col_addr_B;
// row_addr_3 <= row_addr_out;
// col_addr_3 <= row_addr_out;
// read_enable_3 <= read_enable_A;
// read_enable_3 <= read_enable_B;
// write_enable_3 <= write_enable_out;
// data_in_3 <= data_out;
// Enable the matrix multiplication
enable = 1;
// Wait 20 ns
#20;
// Wait for the operation to complete
wait (done);
// Print number of cycles taken
$display("Operation completed in %0d cycles(2/2)", cycle_count); //1261 cycles-> old o/p sationary implemenetaion
// Disable the enable signal
enable = 0;
// Wait a few cycles to observe
#20;
*/
    // Finish the simulation
    $stop;
end
endmodule