Compare commits

...

2 commits

8 changed files with 311 additions and 218 deletions

View file

@ -13,14 +13,17 @@ module matrix_memory_flexible #(
input write_enable,
input read_enable,
input read_full_row, // 1 = read full row, 0 = read single element
input write_full_row,
input [$clog2(ROWS)-1:0] row,
input [$clog2(COLS)-1:0] col,
// Data ports
input [DATA_WIDTH-1:0] data_in,
input [DATA_WIDTH*ROWS-1:0] full_row_in, //COLS_USED
output reg [DATA_WIDTH-1:0] data_out,
output reg [DATA_WIDTH*ROWS-1:0] full_row_out, //COLS_USED
// Output valid signal
output reg valid
);
@ -48,17 +51,17 @@ module matrix_memory_flexible #(
always @(posedge clk) begin
valid <= 0; // Default invalid unless read
if (write_enable) begin
if (write_full_row) begin
for (i = 0; i < ROWS; i = i + 1) begin //COLS_USED
mem[base_row_addr + i + col] <= full_row_in[i*DATA_WIDTH +: DATA_WIDTH];
///$display("Full Row Input(%d): %0h", i, full_row_out);
end
end else if (write_enable) begin
mem[addr_single] <= data_in;
end
if (read_full_row) begin
else if (read_full_row) begin
for (i = 0; i < ROWS; i = i + 1) begin //COLS_USED
////for (i = col; i < (COLS_USED + col); i = i + 1) begin
//for (i = integer'(col); i < (COLS_USED + integer'(col)); i = i + 1) begin
full_row_out[i*DATA_WIDTH +: DATA_WIDTH] <= mem[base_row_addr + i + col];
//// full_row_out[i*DATA_WIDTH +: DATA_WIDTH] <= mem[base_row_addr + i];
///$display("Full Row Output(%d): %0h", i, full_row_out);
end
valid <= 1;

View file

@ -22,10 +22,12 @@ module top_module_mem #(
input write_enable_1, write_enable_2, write_enable_3,
input read_enable_1, read_enable_2, read_enable_3,
input read_full_row_1, read_full_row_2, read_full_row_3, // NEW signals
input write_full_row_1, write_full_row_2, write_full_row_3, // NEW signals
// Data Inputs
input [DATA_WIDTH-1:0] data_input_1, data_input_2, data_input_3,
input [DATA_WIDTH*ROWS1-1:0] full_row_input_1,
input [DATA_WIDTH*ROWS2-1:0] full_row_input_2,
input [DATA_WIDTH*ROWS3-1:0] full_row_input_3,
// Data Outputs
output [DATA_WIDTH-1:0] data_output_1, data_output_2, data_output_3,
output [DATA_WIDTH*ROWS1-1:0] full_row_output_1, // For full row reads //COLS1
@ -40,12 +42,15 @@ module top_module_mem #(
matrix_memory_flexible #(
.ROWS(ROWS1),
.COLS(COLS1),
.INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_input.hex"),
.INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_input.hex"),
// .INIT_FILE("/mnt/d/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_input.hex"),
.DATA_WIDTH(DATA_WIDTH),
.COLS_USED(COLS_USED)
) u_matrix_mem_1 (
.clk(clk),
.write_enable(write_enable_1),
.write_full_row(write_full_row_1),
.full_row_in(full_row_input_1),
.read_enable(read_enable_1),
.read_full_row(read_full_row_1),
.row(row_addr_1),
@ -61,12 +66,15 @@ module top_module_mem #(
.ROWS(ROWS2),
.COLS(COLS2),
.INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_weight.hex"),
// .INIT_FILE("/mnt/d/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_weight.hex"),
.DATA_WIDTH(DATA_WIDTH),
.COLS_USED(COLS_USED)
) u_matrix_mem_2 (
.clk(clk),
.write_enable(write_enable_2),
.read_enable(read_enable_2),
.write_full_row(write_full_row_2),
.full_row_in(full_row_input_2),
.read_full_row(read_full_row_2),
.row(row_addr_2),
.col(col_addr_2),
@ -80,14 +88,16 @@ module top_module_mem #(
matrix_memory_flexible #(
.ROWS(ROWS3),
.COLS(COLS3),
// .INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_temp_20_X_80.hex"),
.INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_temp_20_X_80_ones.hex"),
.INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_temp_20_X_80.hex"),
// .INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_temp_20_X_80_ones.hex"),
.DATA_WIDTH(DATA_WIDTH),
.COLS_USED(COLS_USED)
) u_matrix_mem_3 (
.clk(clk),
.write_enable(write_enable_3),
.read_enable(read_enable_3),
.write_full_row(write_full_row_3),
.full_row_in(full_row_input_3),
.read_full_row(read_full_row_3),
.row(row_addr_3),
.col(col_addr_3),

View file

@ -25,17 +25,20 @@ module tb_top_module_mem;
reg write_enable_1, write_enable_2, write_enable_3;
reg read_enable_1, read_enable_2, read_enable_3;
reg read_full_row_1, read_full_row_2, read_full_row_3;
reg write_full_row_1, write_full_row_2, write_full_row_3;
reg [DATA_WIDTH-1:0] data_input_1, data_input_2, data_input_3;
wire [DATA_WIDTH-1:0] data_output_1, data_output_2, data_output_3;
// wire [DATA_WIDTH*COLS1-1:0] full_row_output_1; //COLS1
// wire [DATA_WIDTH*COLS2-1:0] full_row_output_2; //COLS2
// wire [DATA_WIDTH*COLS3-1:0] full_row_output_3; //COLS3
wire [DATA_WIDTH*ROWS1-1:0] full_row_output_1; //COLS1
wire [DATA_WIDTH*ROWS1-1:0] full_row_output_1; //COLS1
wire [DATA_WIDTH*ROWS2-1:0] full_row_output_2; //COLS2
wire [DATA_WIDTH*ROWS3-1:0] full_row_output_3; //COLS3
reg [DATA_WIDTH*ROWS1-1:0] full_row_input_1; //COLS1
reg [DATA_WIDTH*ROWS2-1:0] full_row_input_2; //COLS2
reg [DATA_WIDTH*ROWS3-1:0] full_row_input_3; //COLS3
wire valid_1, valid_2, valid_3;
integer i;
// Instantiate DUT
top_module_mem #(
.ROWS1(ROWS1),
@ -53,9 +56,11 @@ module tb_top_module_mem;
.row_addr_3(row_addr_3), .col_addr_3(col_addr_3),
.write_enable_1(write_enable_1), .write_enable_2(write_enable_2), .write_enable_3(write_enable_3),
.read_enable_1(read_enable_1), .read_enable_2(read_enable_2), .read_enable_3(read_enable_3),
.write_full_row_1(write_full_row_1), .write_full_row_2(write_full_row_2), .write_full_row_3(write_full_row_3),
.read_full_row_1(read_full_row_1), .read_full_row_2(read_full_row_2), .read_full_row_3(read_full_row_3),
.data_input_1(data_input_1), .data_input_2(data_input_2), .data_input_3(data_input_3),
.data_output_1(data_output_1), .data_output_2(data_output_2), .data_output_3(data_output_3),
.full_row_input_1(full_row_input_1), .full_row_input_2(full_row_input_2), .full_row_input_3(full_row_input_3),
.full_row_output_1(full_row_output_1), .full_row_output_2(full_row_output_2), .full_row_output_3(full_row_output_3),
.valid_1(valid_1), .valid_2(valid_2), .valid_3(valid_3)
);
@ -119,13 +124,34 @@ module tb_top_module_mem;
read_enable_1 = 0;
read_full_row_1 = 0;
// if (full_row_output_1[ (COLS1-1-3)*DATA_WIDTH +: DATA_WIDTH ] == 16'h1234)
// if (full_row_output_1[ (3)*DATA_WIDTH +: DATA_WIDTH ] == 16'h1234)
if (full_row_output_1[ 0*DATA_WIDTH +: DATA_WIDTH ] == 16'h1234)
$display("PASS: Full row read from Matrix 1, element (2,3) is correct: %h", full_row_output_1[ 0*DATA_WIDTH +: DATA_WIDTH ]);
else
$display("FAIL: Full row read wrong value at (2,3)");
// === Write full row into Matrix 1 ===
// Drives a known pattern into row 5 through the full-row write port, reads
// the row back through the full-row read port and checks every element.
//
// The row vectors full_row_input_1 / full_row_output_1 are declared
// DATA_WIDTH*ROWS1 bits wide, i.e. they hold ROWS1 elements. The loops
// therefore iterate over ROWS1 elements; iterating to COLS1 would index
// outside the vectors whenever COLS1 > ROWS1 (out-of-range part-select
// writes are dropped and out-of-range reads return X).
// NOTE(review): only write_full_row_1 is asserted here; confirm that the
// memory does not additionally require write_enable_1 for a full-row write.
write_full_row_1 = 1;
row_addr_1 = 5;
// Writing row 5 with pattern: 0x0100, 0x0200, 0x0300, ..., one per element
for (i = 0; i < ROWS1; i = i + 1) begin
    full_row_input_1[i*DATA_WIDTH +: DATA_WIDTH] = (i + 1) << 8;
end
#10;
write_full_row_1 = 0;
// === Read full row back from Matrix 1 to verify ===
read_enable_1 = 1;
read_full_row_1 = 1;
row_addr_1 = 5;
#10;
read_enable_1 = 0;
read_full_row_1 = 0;
// === Verification ===
// Case inequality (!==) is used on purpose: with plain != an X/Z read-back
// makes the comparison unknown, the if-condition is treated as false and the
// element would be reported as PASS.
for (i = 0; i < ROWS1; i = i + 1) begin
    if (full_row_output_1[i*DATA_WIDTH +: DATA_WIDTH] !== ((i + 1) << 8)) begin
        $display("FAIL: Full row write/read mismatch at column %0d: expected %h, got %h", i, (i + 1) << 8, full_row_output_1[i*DATA_WIDTH +: DATA_WIDTH]);
    end else begin
        $display("PASS: Full row element (%0d, %0d) correct: %h", 5, i, full_row_output_1[i*DATA_WIDTH +: DATA_WIDTH]);
    end
end
// === Finish Test ===
#20;

View file

@ -12,9 +12,10 @@ module pe_array #(
input wire [PE_COLS*DATA_WIDTH-1:0] north_inputs, // Flattened inputs PE_ROWS
input wire [PE_ROWS*DATA_WIDTH-1:0] west_inputs, // Flattened inputs PE_COLS
input wire enable,
input wire output_enable,
input wire initialization,
output wire [OUTPUT_COL*OUTPUT_ROW-1:0] valid,
output reg [OUTPUT_COL*OUTPUT_ROW*2*DATA_WIDTH-1:0] acc_outputs // 2*DATA_WIDTH because of accumulation
output reg [OUTPUT_ROW*DATA_WIDTH-1:0] acc_outputs // OUTPUT_ROW slots of DATA_WIDTH bits each (one slot per element, not 2*DATA_WIDTH)
);
// Internal wires for each PE
@ -29,10 +30,12 @@ module pe_array #(
// Delayed north and west inputs
reg [DATA_WIDTH-1:0] north_pipe [0:PE_COLS-1][0:PE_COLS-1]; // [which column][delay stages]
reg [DATA_WIDTH-1:0] west_pipe [0:PE_ROWS-1][0:PE_ROWS-1]; // [which row][delay stages]
wire enable_var [0:PE_ROWS][0:PE_COLS];
wire enable_var [0:PE_ROWS-1][0:PE_COLS-1];
wire output_enable_var [0:PE_ROWS-1][0:PE_COLS-1];
reg [DATA_WIDTH-1:0] delayed_south[0:OUTPUT_COL-1][0:OUTPUT_COL-1]; // [row index][delay stages]
reg [DATA_WIDTH-1:0] delayed_east[0:OUTPUT_ROW-1][0:OUTPUT_ROW-1]; // [row index][delay stages]
integer r, d;
integer m, n;
integer x, y, cycle ;
@ -50,11 +53,10 @@ module pe_array #(
for (r = 0; r < PE_ROWS; r = r + 1)
for (d = 0; d < PE_ROWS; d = d + 1)
delayed_east[r][d] <= 0;
// for (r = 0; r < (OUTPUT_COL*OUTPUT_ROW); r = r + 1)
// valid[r] = 0;
acc_outputs = 0;
cycle = 0;
end else begin
end else begin/// use the clearing logic
// Update north pipeline
for (m = 0; m < PE_COLS; m = m + 1) begin
north_pipe[m][0] <= north_inputs[(m+1)*DATA_WIDTH-1 -: DATA_WIDTH];
@ -67,7 +69,7 @@ module pe_array #(
for (n = 1; n <= m; n = n + 1)
west_pipe[m][n] <= west_pipe[m][n-1];
end
if(valid[0] == 1 ) begin //if condition
if((valid[0] == 1) && ((mode == 2'b01)||(mode == 2'b10))) begin //if condition
for (r = 0; r < PE_COLS; r = r + 1) begin
delayed_south[r][0] <= south_array[PE_ROWS-1][r];//[r][0]; // south_array from column 0
@ -79,20 +81,31 @@ module pe_array #(
for (d = 1; d <= PE_ROWS - r; d = d + 1)// r
delayed_east[r][d] <= delayed_east[r][d-1];
end
end
end else if((valid[((OUTPUT_COL>OUTPUT_ROW)?((OUTPUT_ROW/2)-1):((OUTPUT_COL/2)-1))] == 1) && (mode == 2'b00)) begin //if condition
for (r = 0; r < PE_COLS; r = r + 1) begin
delayed_south[r][0] <= south_array[PE_ROWS-1][r];//[r][0]; // south_array from column 0
for (d = 1; d <= PE_COLS; d = d + 1)// d <= PE_COLS - r
delayed_south[r][d] <= delayed_south[r][d-1];
end
for (r = 0; r < PE_ROWS; r = r + 1) begin
delayed_east[r][0] <= east_array[r][PE_COLS-1];//east_array[PE_ROWS-1][r];//[r][0]; // south_array from column 0
for (d = 1; d <= PE_ROWS; d = d + 1)// d <=PE_ROWS - r
delayed_east[r][d] <= delayed_east[r][d-1];
end
end
//weight-stationary
if (mode == 2'b10 && valid[(OUTPUT_COL*2)-1] == 1 && (cycle < OUTPUT_ROW)) begin // no need for all valid just last element in PE array should be high //(OUTPUT_COL*OUTPUT_ROW)
// for (cycle = 0; cycle < OUTPUT_ROW; cycle = cycle + 1) begin
for (x = 0; x < PE_COLS; x = x + 1) begin
// Now filling for a particular cycle and column
acc_outputs[((x + cycle*OUTPUT_COL) + 1)*2*DATA_WIDTH -1 -: 2*DATA_WIDTH] =
acc_outputs[((x + cycle*OUTPUT_COL) + 1)*DATA_WIDTH -1 -: DATA_WIDTH] =
{ {(DATA_WIDTH){1'b0}}, delayed_south[x][PE_COLS-x-1] };//cycle, PE_COLS-x-1
// Display what's being assigned
$display("Cycle: %0d, Col: %0d, Index: %0d to %0d, delayed_south(%0d,%0d):= %0d",
cycle,
x,
((x + cycle * OUTPUT_COL) + 1) * 2 * DATA_WIDTH - 1,
((x + cycle * OUTPUT_COL) + 1) * 2 * DATA_WIDTH - 2 * DATA_WIDTH,
((x + cycle * OUTPUT_COL) + 1) * DATA_WIDTH - 1,
((x + cycle * OUTPUT_COL) + 1) * DATA_WIDTH - DATA_WIDTH,
x, // First argument for the first %0d
(PE_COLS - x - 1), // Second argument for the second %0d
//x,
@ -103,13 +116,12 @@ module pe_array #(
end
cycle = cycle + 1;
end else if (mode == 2'b01 && valid[(OUTPUT_COL*2)] == 1 && (cycle < OUTPUT_COL)) begin // input-stationary (OUTPUT_COL*2)
// for (cycle = 0; cycle < OUTPUT_ROW; cycle = cycle + 1) begin
for (x = 0; x < PE_ROWS; x = x + 1) begin
// Now filling for a particular cycle and column
acc_outputs[((x*OUTPUT_COL + cycle) + 1)*2*DATA_WIDTH -1 -: 2*DATA_WIDTH] =
acc_outputs[((x*OUTPUT_COL + cycle) + 1)*DATA_WIDTH -1 -: DATA_WIDTH] =
{ {(DATA_WIDTH){1'b0}}, delayed_east[x][PE_ROWS-x-1] };//cycle, PE_COLS-x-1
// Display what's being assigned
$display("Cycle: %0d, Col: %0d, Index: %0d to %0d, delayed_east(%0d,%0d):= %0d",
/* $display("Cycle: %0d, Col: %0d, Index: %0d to %0d, delayed_east(%0d,%0d):= %0d",
cycle,
x,
((x*OUTPUT_COL + cycle) + 1) * 2 * DATA_WIDTH - 1,
@ -119,16 +131,56 @@ module pe_array #(
//x,
delayed_east[x][PE_ROWS - x - 1] // Third argument for the third %0d
);
*/
// end
end
end
cycle = cycle + 1;
end
end else if (mode == 2'b00 && (valid[OUTPUT_COL*(OUTPUT_ROW)-(((OUTPUT_COL>OUTPUT_ROW)?((OUTPUT_ROW/2)):((OUTPUT_COL/2))))] == 1) && (cycle < ((OUTPUT_COL > OUTPUT_ROW)?OUTPUT_ROW:OUTPUT_COL))) begin // output-stationary for HDPE array module
// Body of the mode 2'b00 capture branch: copies one tapped value per
// column (south delay line) or per row (east delay line) into acc_outputs,
// prints what was captured, then advances `cycle` with a blocking assignment.
// The orientation is chosen by comparing the OUTPUT_COL and OUTPUT_ROW
// parameters.
// NOTE(review): the acc_outputs slot index depends only on x (not on
// `cycle`), so every pass through this branch overwrites the same slots.
if (OUTPUT_COL > OUTPUT_ROW) begin
// More output columns than rows: drain through the south delay line.
// NOTE(review): x runs to PE_COLS-1 while acc_outputs holds OUTPUT_ROW
// slots - confirm PE_COLS <= OUTPUT_ROW for the configurations in use.
for (x = 0; x < PE_COLS; x = x + 1) begin
// Slot x of acc_outputs receives tap (PE_COLS-x-1) of column x's south delay line.
// The RHS is zero-extended to 2*DATA_WIDTH bits; the DATA_WIDTH-wide
// part-select on the LHS keeps only its low DATA_WIDTH bits.
acc_outputs[((x ) + 1)*DATA_WIDTH -1 -: DATA_WIDTH] =
{ {(DATA_WIDTH){1'b0}}, delayed_south[x][PE_COLS-x-1] };//cycle, PE_COLS-x-1
// Debug trace of the value just captured
///*
$display("o/p_Cycle: %0d, Col: %0d, Index: %0d to %0d, delayed_south(%0d,%0d):= %0h",
cycle,
x,
((x ) + 1) * DATA_WIDTH - 1,
((x ) + 1) * DATA_WIDTH - DATA_WIDTH,
x, // delay-line column index shown in delayed_south(%0d,...)
(PE_COLS - x - 1), // delay-stage index shown in delayed_south(...,%0d)
//x,
delayed_south[x][PE_COLS - x - 1] // captured value, printed with %0h
);
//*/
end
cycle = cycle + 1;
end else begin
// Otherwise (OUTPUT_COL <= OUTPUT_ROW): drain through the east delay line.
for (x = 0; x < PE_ROWS; x = x + 1) begin
// Slot x of acc_outputs receives tap (PE_ROWS-x-1) of row x's east delay line.
// Same zero-extend-then-truncate assignment as in the south branch.
acc_outputs[((x ) + 1)*DATA_WIDTH -1 -: DATA_WIDTH] =
{ {(DATA_WIDTH){1'b0}}, delayed_east[x][PE_ROWS-x-1] };//cycle, PE_ROWS-x-1
// Debug trace of the value just captured
// /*
$display("o/p_Cycle: %0d, Row: %0d, Index: %0d to %0d, delayed_east(%0d,%0d):= %0h",
cycle,
x,
((x) + 1) * DATA_WIDTH - 1,
((x ) + 1) * DATA_WIDTH - DATA_WIDTH,
x, // delay-line row index shown in delayed_east(%0d,...)
(PE_ROWS - x - 1), // delay-stage index shown in delayed_east(...,%0d)
//x,
delayed_east[x][PE_ROWS - x - 1] // captured value, printed with %0h
);
// */
end
cycle = cycle + 1;
end
end
end
end
genvar i, j;
generate
for (i = 0; i < PE_ROWS; i = i + 1) begin : row_gen //ROW
@ -143,31 +195,33 @@ module pe_array #(
.rst(rst),
.initialization(initialization),
.enable((j==0) && (i==0) ? enable : enable_var[i][j]),
.output_enable((j==0) && (i==0) ? output_enable : output_enable_var[i][j]),
.clear_acc(1'b0), // No accumulator clearing
.data_in_north((i == 0) ? (initialization ? north_inputs[(j+1)*DATA_WIDTH-1 -: DATA_WIDTH] : north_pipe[j][j])
: south_array[i-1][j]),
.data_in_west((j == 0) ? (initialization ? west_inputs[(i+1)*DATA_WIDTH-1 -: DATA_WIDTH] : west_pipe[i][i])
: east_array[i][j-1]),
// .data_in_north((i == 0) ? north_pipe[j][j] : south_array[i-1][j]),
// .data_in_west((j == 0) ? west_pipe[i][i] : east_array[i][j-1]),
.data_out_south(south_array[i][j]),
.data_out_east(east_array[i][j]),
.pe_row_postion(i[$clog2(OUTPUT_ROW+1)-1:0]),
.pe_col_postion(j[$clog2(OUTPUT_COL+1)-1:0]),
.output_enable_south(output_enable_var[i+1][j]),
.output_enable_east(output_enable_var[i][j+1]),
.enable_south(enable_var[i+1][j]),
.enable_east(enable_var[i][j+1]),
.data_out_south(south_array[i][j]),
.data_out_east(east_array[i][j]),
.mode(mode),
.acc_out(output_array[i][j]),
.valid(valid[(i*PE_COLS) + j])
);
/////// assign acc_outputs[((i*PE_COLS+j)+1)*2*DATA_WIDTH-1 -: 2*DATA_WIDTH] = output_array[i][j];
end
end
endgenerate
/*
///*
// redundant logic, kept only to simplify debugging
genvar k;
generate
@ -178,19 +232,7 @@ module pe_array #(
assign east_array_end[k] = east_array[k][PE_COLS-1]; // Rightmost column
end
endgenerate
*/
// === Accumulator Output Assignment with Conditional Capture ===
always @(*) begin //at rising vedge of clock!!!
if (mode == 2'b00) begin
for (x = 0; x < PE_ROWS; x = x + 1)
for (y = 0; y < PE_COLS; y = y + 1)
acc_outputs[((x*OUTPUT_COL + y)+1)*2*DATA_WIDTH-1 -: 2*DATA_WIDTH] = output_array[x][y];
end
end
//*/
endmodule

View file

@ -11,15 +11,12 @@ module pe_array_tb;
reg rst;
reg [PE_COLS*DATA_WIDTH-1:0] north_inputs; //PE_COLS
// reg [PE_COLS*DATA_WIDTH-1:0] north_inputs; //PE_COLS
reg [PE_ROWS*DATA_WIDTH-1:0] west_inputs; //PE_ROWS
reg enable; // Enable signal for PE 0,0
// wire [PE_ROWS*PE_COLS*2*DATA_WIDTH-1:0] acc_outputs; // Accumulated outputs from all PEs
wire [OUTPUT_COL*OUTPUT_ROW*2*DATA_WIDTH-1:0] acc_outputs; // Accumulated outputs from all PEs
// wire [PE_ROWS*PE_COLS-1:0] valid; // Valid signal for each PE
wire [OUTPUT_COL*OUTPUT_ROW-1:0] valid; // Valid signal for each PE
reg initialization;
reg initialization, output_enable;
integer i, j;
integer cycle_count;
@ -29,7 +26,7 @@ module pe_array_tb;
// Define your input matrices
reg [DATA_WIDTH-1:0] matrix_A [0:OUTPUT_ROW-1][0:COMMON_ROW_COL-1]; // 4x4 matrix
reg [DATA_WIDTH-1:0] matrix_B [0:COMMON_ROW_COL-1][0:OUTPUT_COL-1]; // 4x5 matrix
reg [DATA_WIDTH-1:0] expected_C [0:PE_ROWS-1][0:PE_COLS-1]; // 4x5 output matrix (for checking)
reg [DATA_WIDTH-1:0] expected_C [0:OUTPUT_ROW-1][0:OUTPUT_COL-1]; // 4x5 output matrix (for checking)
pe_array #(
.DATA_WIDTH(DATA_WIDTH),
@ -46,6 +43,7 @@ module pe_array_tb;
.mode(mode),
.initialization(initialization),
.enable(enable),
.output_enable(output_enable),
.valid(valid),
.acc_outputs(acc_outputs)
);
@ -62,6 +60,7 @@ module pe_array_tb;
west_inputs = 0;
enable = 0; // Initially, disable all PEs
initialization = 0;
output_enable = 0;
cycle_count = 0;
#20;
rst = 0;
@ -92,10 +91,10 @@ module pe_array_tb;
// Clear accumulators (after reset)
#10;
// Compute expected output C = A * B
for (i = 0; i < PE_ROWS; i = i + 1) begin
for (j = 0; j < PE_COLS; j = j + 1) begin
for (i = 0; i < OUTPUT_ROW; i = i + 1) begin
for (j = 0; j < OUTPUT_COL; j = j + 1) begin
expected_C[i][j] = 0;
for (p = 0; p < PE_ROWS; p = p + 1) begin // This should be based on PE_ROWS
for (p = 0; p < COMMON_ROW_COL; p = p + 1) begin // reduce over the shared dimension: matrix_A is [OUTPUT_ROW][COMMON_ROW_COL], matrix_B is [COMMON_ROW_COL][OUTPUT_COL]; a larger bound reads out-of-range (X) elements
expected_C[i][j] = expected_C[i][j] + matrix_A[i][p] * matrix_B[p][j];
#20; // or #40 based on PE array timing
@ -156,6 +155,7 @@ module pe_array_tb;
*/
/*
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// enable = 1;/////////////////////////
@ -205,9 +205,10 @@ module pe_array_tb;
#10; // adjust based on PE array latency
end
*/
///*
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
for (cycle_count = 0; cycle_count < (PE_ROWS + PE_COLS +2); cycle_count = cycle_count + 1) begin//-1
//Output stationary
for (cycle_count = 0; cycle_count < (PE_ROWS + PE_COLS +4); cycle_count = cycle_count + 1) begin//-1
enable = 1;
north_inputs = 0;
west_inputs = 0;
@ -237,6 +238,10 @@ module pe_array_tb;
$display(" NORTH: PE(0,%0d): B[%0d][%0d] = %0h", j, cycle_count, j, matrix_B[cycle_count][j]);
end
end
// Raise output_enable once cycle_count has reached COMMON_ROW_COL.
if (cycle_count >= COMMON_ROW_COL) begin
    output_enable <= 1;
    // $strobe prints at the end of the time step, after the non-blocking
    // update above has taken effect; $display here would report the value
    // output_enable had before this assignment.
    $strobe("output_enable = %h", output_enable);
end
$display("west_inputs = %h", west_inputs);
$display("north_inputs = %h", north_inputs);
@ -245,44 +250,10 @@ module pe_array_tb;
#10; // adjust based on PE array latency
end
//*/
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// for (cycle_count = 0; cycle_count < (PE_ROWS + PE_COLS -1); cycle_count = cycle_count + 1) begin
// north_inputs = 0;
// west_inputs = 0;
// //enable = 0;
// $display("\n==================Cycle %0d feeding start===================:", cycle_count);
// for (i = 0; i < PE_ROWS; i = i + 1) begin
// for (j = 0; j < PE_COLS; j = j + 1) begin
// if (i + j == cycle_count) begin
// enable[i*PE_COLS + j] = 1'b1;
// if (i < PE_ROWS && (cycle_count - i) < COMMON_ROW_COL && (cycle_count - i) >= 0) begin
// west_inputs[(i+1)*DATA_WIDTH-1 -: DATA_WIDTH] = matrix_A[i][cycle_count - i];
// end
// if (j < PE_COLS && (cycle_count - j) < COMMON_ROW_COL && (cycle_count - j) >= 0) begin
// north_inputs[(j+1)*DATA_WIDTH-1 -: DATA_WIDTH] = matrix_B[cycle_count - j][j];
// end
// $display(" PE(%0d,%0d): A[%0d][%0d]=%0h, B[%0d][%0d]=%0h", i, j, i, cycle_count-i, matrix_A[i][cycle_count-i], cycle_count-j, j, matrix_B[cycle_count-j][j]);
// end
// end
// end
// $display("west_inputs = %h ; north_inputs = %h", west_inputs, north_inputs);
// $display("==================Cycle %0d feeding end===================:", cycle_count);
// #10; // or #40 based on PE array timing
// end
// Wait for operations to complete
#100;
@ -302,8 +273,8 @@ module pe_array_tb;
// Display expected result matrix
$display("\nExpected matrix:");
for (i = 0; i < PE_ROWS; i = i + 1) begin
for (j = 0; j < PE_COLS; j = j + 1) begin
for (i = 0; i < OUTPUT_ROW; i = i + 1) begin
for (j = 0; j < OUTPUT_COL; j = j + 1) begin
$write("%0d ", expected_C[i][j]);
end
$display();

View file

@ -1,5 +1,7 @@
module matrix_multiplication_unit_new #(
parameter DATA_WIDTH = 16,
parameter MEM_ROWS = 20,//20 ->5bits //16
parameter MEM_COLS = 80,//80 ->7bits //32SS
parameter PE_ROWS = 16,
parameter PE_COLS = 32,
parameter COMMON_ROW_COL = 4,
@ -19,32 +21,35 @@ module matrix_multiplication_unit_new #(
input valid_mem_input_A,
input valid_mem_input_B,
input [$clog2(PE_ROWS)-1:0] rows_start_add_reading_A,
input [$clog2(PE_COLS)-1:0] cols_start_add_reading_A,
input [$clog2(PE_ROWS)-1:0] rows_start_add_reading_B,
input [$clog2(PE_COLS)-1:0] cols_start_add_reading_B,
input [$clog2(PE_ROWS)-1:0] rows_start_add_writing,
input [$clog2(PE_COLS)-1:0] cols_start_add_writing,
input [$clog2(MEM_ROWS)-1:0] rows_start_add_reading_A,
input [$clog2(MEM_COLS)-1:0] cols_start_add_reading_A,
input [$clog2(MEM_ROWS)-1:0] rows_start_add_reading_B,
input [$clog2(MEM_COLS)-1:0] cols_start_add_reading_B,
input [$clog2(MEM_ROWS)-1:0] rows_start_add_writing,
input [$clog2(MEM_COLS)-1:0] cols_start_add_writing,
input [$clog2(PE_ROWS)-1:0] rows_size_reading_A,
input [$clog2(PE_COLS)-1:0] cols_size_reading_A,
input [$clog2(PE_ROWS)-1:0] rows_size_reading_B,
input [$clog2(PE_COLS)-1:0] cols_size_reading_B,
input [$clog2(MEM_ROWS)-1:0] rows_size_reading_A,
input [$clog2(MEM_COLS)-1:0] cols_size_reading_A,
input [$clog2(MEM_ROWS)-1:0] rows_size_reading_B,
input [$clog2(MEM_COLS)-1:0] cols_size_reading_B,
output reg done,
output reg read_full_row_A,
output reg read_full_row_B,
output reg write_full_row_out,
output reg [$clog2(PE_ROWS)-1:0] row_addr_A,
output reg [$clog2(PE_COLS)-1:0] col_addr_A,
output reg [$clog2(PE_ROWS)-1:0] row_addr_B,
output reg [$clog2(PE_COLS)-1:0] col_addr_B,
output reg [$clog2(PE_ROWS)-1:0] row_addr_out,
output reg [$clog2(PE_COLS)-1:0] col_addr_out,
output reg [$clog2(MEM_ROWS)-1:0] row_addr_A,
output reg [$clog2(MEM_COLS)-1:0] col_addr_A,
output reg [$clog2(MEM_ROWS)-1:0] row_addr_B,
output reg [$clog2(MEM_COLS)-1:0] col_addr_B,
output reg [$clog2(MEM_ROWS)-1:0] row_addr_out,
output reg [$clog2(MEM_COLS)-1:0] col_addr_out,
output reg read_enable_A,
output reg read_enable_B,
output reg write_enable_out,
output reg [OUTPUT_ROW*DATA_WIDTH-1:0] Full_row_out,
output reg [DATA_WIDTH-1:0] data_out
);
@ -57,7 +62,9 @@ parameter IDLE = 0,
COMPUTE = 5,
WRITE = 6,
DONE = 7,
BEFORE_COMPUTE = 8;
BEFORE_COMPUTE = 8,
LOAD_ROW_A_COL_B = 9,
WAIT_VALID_ROW_A_COL_B = 10;
reg [3:0] current_state, next_state;
@ -66,7 +73,7 @@ reg enable_pe_array;
wire [OUTPUT_ROW*OUTPUT_COL-1:0] valid_pe_array;
reg [DATA_WIDTH*OUTPUT_COL-1:0] north_inputs;
reg [DATA_WIDTH*PE_ROWS-1:0] west_inputs;
wire [OUTPUT_ROW*OUTPUT_COL*2*DATA_WIDTH-1:0] acc_outputs;
wire [OUTPUT_ROW*DATA_WIDTH-1:0] acc_outputs;
reg [9:0] compute_counter;
reg [9:0] write_counter, read_counter;
@ -74,6 +81,7 @@ reg [9:0] write_counter, read_counter;
wire [2*DATA_WIDTH-1:0] selected_accum_value;
reg initialization;
reg output_enable;
// Instantiate PE Array
@ -91,13 +99,12 @@ pe_array #(
.initialization(initialization),
.north_inputs(north_inputs),
.west_inputs(west_inputs),
.output_enable(output_enable),
.enable(enable_pe_array),
.valid(valid_pe_array),
.acc_outputs(acc_outputs)
);
// Assign selected output slice for writing
assign selected_accum_value = acc_outputs[(write_counter+1)*2*DATA_WIDTH-1 -: 2*DATA_WIDTH];
// FSM Sequential
always @(posedge clk or posedge rst) begin
@ -109,6 +116,7 @@ end
// FSM Next-State Logic
always @(*) begin
case (current_state)
IDLE: next_state = enable ? LOAD_ROW_A : IDLE;
LOAD_ROW_A: next_state = WAIT_VALID_ROW_A;
@ -116,12 +124,13 @@ always @(*) begin
LOAD_COL_B: next_state = WAIT_VALID_COL_B;
WAIT_VALID_COL_B: next_state = valid_mem_input_B ? BEFORE_COMPUTE : WAIT_VALID_COL_B;//COMPUTE
BEFORE_COMPUTE : next_state = COMPUTE;
COMPUTE: next_state = (read_counter < COMMON_ROW_COL) ? LOAD_ROW_A :(compute_counter >= PE_ROWS + OUTPUT_COL - 1) ? WRITE : COMPUTE; //
// COMPUTE: next_state = (compute_counter >= PE_ROWS + OUTPUT_COL - 1) ? WRITE : COMPUTE; //
WRITE: next_state = (write_counter >= (PE_ROWS * OUTPUT_COL)) ? DONE : WRITE;
COMPUTE: next_state = (read_counter < COMMON_ROW_COL) ? LOAD_ROW_A :(compute_counter >= PE_ROWS + PE_COLS + COMMON_ROW_COL+1) ? WRITE : COMPUTE; // PE_ROWS+PE_COLS+4
WRITE: next_state = (write_counter >= ((OUTPUT_COL>OUTPUT_ROW)?OUTPUT_ROW-1:OUTPUT_COL-1)) ? DONE : WRITE;
DONE: next_state = DONE;
default: next_state = IDLE;
endcase
end
// FSM Outputs
@ -136,6 +145,9 @@ always @(posedge clk or posedge rst) begin
write_enable_out <= 0;
read_counter <= 0;
initialization <= 0;
output_enable <= 0;
write_full_row_out <= 0;
Full_row_out <= 0;
end else begin
case (current_state)
IDLE: begin
@ -147,6 +159,8 @@ always @(posedge clk or posedge rst) begin
done <= 0;
write_enable_out <= 0;
read_counter <= 0;
write_full_row_out <= 0;
Full_row_out <= 0;
$display("[IDLE] Waiting for enable...");
end
@ -154,6 +168,7 @@ always @(posedge clk or posedge rst) begin
row_addr_A <= rows_start_add_reading_A;
col_addr_A <= cols_start_add_reading_A + read_counter;//
read_full_row_A <= 1;
enable_pe_array <= 0;
$display("[LOAD_ROW_A] Reading full row A.");
end
@ -178,6 +193,7 @@ always @(posedge clk or posedge rst) begin
read_counter <= read_counter + 1;
end
end
BEFORE_COMPUTE: begin
north_inputs <= full_row_B[OUTPUT_COL*DATA_WIDTH-1:0];
west_inputs <= full_row_A;
@ -190,21 +206,24 @@ always @(posedge clk or posedge rst) begin
enable_pe_array <= 1;
compute_counter <= compute_counter + 1;
$display("[COMPUTE] Cycle %0d / %0d", compute_counter, PE_ROWS+OUTPUT_COL-1);
// Assert output_enable towards the PE array once compute_counter has
// reached COMMON_ROW_COL.
if (compute_counter >= COMMON_ROW_COL) begin
    output_enable <= 1;
    // $strobe reports the value after this clock edge's non-blocking
    // updates; $display at this point would still show the previous value.
    $strobe("output_enable = %h", output_enable);
end
end
WRITE: begin
enable_pe_array <= 1;
write_enable_out <= valid_pe_array[write_counter];
row_addr_out <= (write_counter / OUTPUT_COL) + rows_start_add_writing;
col_addr_out <= (write_counter % OUTPUT_COL) + cols_start_add_writing;/////
row_addr_out <= (write_counter ) + rows_start_add_writing;
col_addr_out <= cols_start_add_writing;/////
// row_addr_out <= (write_counter / OUTPUT_COL) + rows_start_add_writing;
// col_addr_out <= (write_counter % OUTPUT_COL) + cols_start_add_writing;/////
data_out <= selected_accum_value[DATA_WIDTH-1:0];
Full_row_out <= acc_outputs;////
write_full_row_out <= 1;
$display("[WRITE] Writing output[%0d][%0d] = %0h | Valid = %b",
(write_counter / OUTPUT_COL), (write_counter % OUTPUT_COL), data_out, valid_pe_array[write_counter]);
(write_counter), 0, acc_outputs, valid_pe_array[write_counter]);
// $display("[WRITE] Writing output[%0d][%0d] = %0h | Valid = %b",
// row_addr_out, col_addr_out, data_out, valid_pe_array[write_counter]);
@ -215,6 +234,7 @@ always @(posedge clk or posedge rst) begin
done <= 1;
enable_pe_array <= 0;
compute_counter <= 0;
write_full_row_out <= 0;
$display("[DONE] Matrix multiplication completed.");
end

View file

@ -4,10 +4,10 @@ module tb_matrix_multiplication;
// Parameters
parameter DATA_WIDTH = 16;
parameter ROWS = 20;//20 ->5bits //16
parameter COLS = 80;//80 ->7bits //32SS
/// parameter COLS_USED = 4;//3 // this given to rows_size_reading_B & cols_size_reading_A
// parameter OUTPUT_COLS = 10;
parameter MEM_ROWS = 20;//20 ->5bits //16
parameter MEM_COLS = 80;//80 ->7bits //32SS
parameter PE_ROWS = 20;//
parameter PE_COLS = 10;//
parameter COMMON_ROW_COL = 4;
parameter OUTPUT_COL = 10;
parameter OUTPUT_ROW = 20;
@ -26,57 +26,57 @@ module tb_matrix_multiplication;
reg valid_mem_input_B;
// Address offset configuration
reg [$clog2(ROWS)-1:0] rows_start_add_reading_A;
reg [$clog2(COLS)-1:0] cols_start_add_reading_A;
reg [$clog2(ROWS)-1:0] rows_start_add_reading_B;
reg [$clog2(COLS)-1:0] cols_start_add_reading_B;
reg [$clog2(ROWS)-1:0] rows_start_add_writing;
reg [$clog2(COLS)-1:0] cols_start_add_writing;
reg [$clog2(MEM_ROWS)-1:0] rows_start_add_reading_A;
reg [$clog2(MEM_COLS)-1:0] cols_start_add_reading_A;
reg [$clog2(MEM_ROWS)-1:0] rows_start_add_reading_B;
reg [$clog2(MEM_COLS)-1:0] cols_start_add_reading_B;
reg [$clog2(MEM_ROWS)-1:0] rows_start_add_writing;
reg [$clog2(MEM_COLS)-1:0] cols_start_add_writing;
// Size configuration
reg [$clog2(ROWS)-1:0] rows_size_reading_A;
reg [$clog2(COLS)-1:0] cols_size_reading_A;
reg [$clog2(ROWS)-1:0] rows_size_reading_B;
reg [$clog2(COLS)-1:0] cols_size_reading_B;
reg [$clog2(MEM_ROWS)-1:0] rows_size_reading_A;
reg [$clog2(MEM_COLS)-1:0] cols_size_reading_A;
reg [$clog2(MEM_ROWS)-1:0] rows_size_reading_B;
reg [$clog2(MEM_COLS)-1:0] cols_size_reading_B;
// Outputs
wire done;
wire [$clog2(ROWS)-1:0] row_addr_A;
wire [$clog2(COLS)-1:0] col_addr_A;
wire [$clog2(ROWS)-1:0] row_addr_B;
wire [$clog2(COLS)-1:0] col_addr_B;
wire [$clog2(ROWS)-1:0] row_addr_out;
wire [$clog2(COLS)-1:0] col_addr_out;
wire [$clog2(MEM_ROWS)-1:0] row_addr_A;
wire [$clog2(MEM_COLS)-1:0] col_addr_A;
wire [$clog2(MEM_ROWS)-1:0] row_addr_B;
wire [$clog2(MEM_COLS)-1:0] col_addr_B;
wire [$clog2(MEM_ROWS)-1:0] row_addr_out;
wire [$clog2(MEM_COLS)-1:0] col_addr_out;
wire read_enable_A;
wire read_enable_B;
wire write_enable_out;
wire read_full_row_A, read_full_row_B;
wire read_full_row_A, read_full_row_B,write_full_row_1, write_full_row_2, write_full_row_3;
reg read_full_row_3;
wire [DATA_WIDTH-1:0] data_out;
// Unused memory interfaces
reg [$clog2(ROWS)-1:0] row_addr_1, row_addr_2, row_addr_3; //reg
reg [$clog2(COLS)-1:0] col_addr_1, col_addr_2, col_addr_3;
reg [$clog2(MEM_ROWS)-1:0] row_addr_1, row_addr_2, row_addr_3; //reg
reg [$clog2(MEM_COLS)-1:0] col_addr_1, col_addr_2, col_addr_3;
wire [DATA_WIDTH-1:0] data_out_1, data_out_2, data_in_3;
reg [DATA_WIDTH-1:0] data_out_3;
wire valid_1, valid_2, valid_3;
reg write_enable_3, read_enable_3;//reg is instead of wire so that multiple inputs can be driven through them.
// wire [7:0] no_rows1, no_cols1;
wire [DATA_WIDTH*ROWS-1:0] full_row_output_1, full_row_output_2, full_row_output_3; ///
reg [DATA_WIDTH*ROWS-1:0]full_row_A, full_row_B;//reg
wire [DATA_WIDTH*PE_ROWS-1:0] full_row_output_1, full_row_output_2, full_row_output_3, full_row_input_1, full_row_input_2, full_row_input_3; ///
reg [DATA_WIDTH*PE_ROWS-1:0]full_row_A, full_row_B;//reg
integer cycle_count;
// Instantiate the memory module
top_module_mem #(
.ROWS1(ROWS),
.COLS1(COLS),
.ROWS2(ROWS),
.COLS2(COLS),
.ROWS3(ROWS),
.COLS3(COLS),
.ROWS1(MEM_ROWS),
.COLS1(MEM_COLS),
.ROWS2(MEM_ROWS),
.COLS2(MEM_COLS),
.ROWS3(MEM_ROWS),
.COLS3(MEM_COLS),
.DATA_WIDTH(DATA_WIDTH),
.COLS_USED(COMMON_ROW_COL)
) memory_inst (
@ -91,7 +91,9 @@ module tb_matrix_multiplication;
.data_output_1(data_out_1),
.valid_1(valid_1),
.read_full_row_1(1'b0),
.full_row_output_1(full_row_output_1),
.full_row_output_1(full_row_output_1),
.full_row_input_1(full_row_input_1),////
.write_full_row_1(write_full_row_1),///
// Memory 2 (not used)
.row_addr_2(row_addr_2),
@ -103,6 +105,8 @@ module tb_matrix_multiplication;
.valid_2(valid_2),
.read_full_row_2(1'b0),
.full_row_output_2(full_row_output_2),
.full_row_input_2(full_row_input_2),////
.write_full_row_2(write_full_row_2),///
// Memory 3 (used for matrix multiplication)
.row_addr_3(row_addr_3),
@ -113,18 +117,20 @@ module tb_matrix_multiplication;
.data_output_3(data_in_3),
.valid_3(valid_3),
.read_full_row_3(read_full_row_3),
.full_row_output_3(full_row_output_3)
.full_row_output_3(full_row_output_3),
.full_row_input_3(full_row_input_3),////
.write_full_row_3(write_full_row_3)///
);
// Instantiate the matrix multiplication unit
// (matrix_multiplication_unit_new, instance name mmu_inst). The two
// commented-out lines around the module name are alternative module names
// left in place by the author.
/// matrix_multiplication_unit #(
matrix_multiplication_unit_new #(
/////matrix_multiplication_unit_new_v1 #(
.DATA_WIDTH(DATA_WIDTH),
// NOTE(review): PE_ROWS/PE_COLS are overridden twice (with ROWS/COLS here and
// with PE_ROWS/PE_COLS further down). This looks like removed and added diff
// lines shown together; presumably the PE_ROWS/PE_COLS pair, together with
// the MEM_ROWS/MEM_COLS overrides, is current -- confirm against the real file.
.PE_ROWS(ROWS),
.PE_COLS(COLS),
.MEM_ROWS(MEM_ROWS),
.MEM_COLS(MEM_COLS),
.PE_ROWS(PE_ROWS),
.PE_COLS(PE_COLS),
.COMMON_ROW_COL(COMMON_ROW_COL),
// .OUTPUT_COLS(OUTPUT_COLS),
// .OUTPUT_MEM_COLS(OUTPUT_MEM_COLS),
.OUTPUT_COL(OUTPUT_COL),
.OUTPUT_ROW(OUTPUT_ROW)
) mmu_inst (
// The line starting with '@' is a diff hunk header; the port connections
// before it are not visible in this view.
@ -167,7 +173,9 @@ module tb_matrix_multiplication;
.full_row_A(full_row_A),
.full_row_B(full_row_B),
.read_full_row_A(read_full_row_A),
// NOTE(review): read_full_row_B appears twice, once without a trailing comma
// (previously the last port) and once with one -- old/new diff lines.
.read_full_row_B(read_full_row_B)
.read_full_row_B(read_full_row_B),
// Full-row result path: these two nets are also connected to memory_inst's
// write_full_row_3 and full_row_input_3 ports.
.write_full_row_out(write_full_row_3),
.Full_row_out(full_row_input_3)
);
// Clock Generation
@ -197,8 +205,6 @@ module tb_matrix_multiplication;
full_row_B = full_row_output_3;//<= changed to =
///inputs to hdpe
// full_row_A <= full_row_output_3;//<=
// full_row_B <= full_row_output_3;
valid_mem_input_A = valid_3;
valid_mem_input_B = valid_3;
@ -221,8 +227,6 @@ module tb_matrix_multiplication;
// Initialize control signals
rst = 1;
enable = 0;
//mode = 2'b01; // input-Stationary mode
// mode = 2'b10; // weight-Stationary mode
mode = 2'b00; // Output-Stationary mode
// Wait for a few clock cycles
@ -233,10 +237,7 @@ module tb_matrix_multiplication;
#20;
//assigning register properly
//inputs to hdpe
// data_input_A <= data_in_3;
// data_input_B <= data_in_3;
// valid_mem_input_A <= valid_3;
// valid_mem_input_B <= valid_3;
rows_start_add_reading_A <= 5'b0;
cols_start_add_reading_A <= 7'b0;
rows_start_add_reading_B <= 5'b0;
@ -249,17 +250,7 @@ module tb_matrix_multiplication;
rows_size_reading_B <= COMMON_ROW_COL;
cols_size_reading_B <= 7'd9; //(B-> 10X4)^T
//outputs to hdpe
// row_addr_3 <= row_addr_A;
// col_addr_3 <= col_addr_A;
// row_addr_3 <= row_addr_B;
// col_addr_3 <= col_addr_B;
// row_addr_3 <= row_addr_out;
// col_addr_3 <= row_addr_out;
// read_enable_3 <= read_enable_A;
// read_enable_3 <= read_enable_B;
// write_enable_3 <= write_enable_out;
// data_in_3 <= data_out;
// Enable the matrix multiplication
enable = 1;
@ -270,7 +261,7 @@ module tb_matrix_multiplication;
wait (done);
// Print number of cycles taken
$display("Operation completed in %0d cycles", cycle_count); //1261 cycles-> old o/p sationary implemenetaion
$display("Operation completed in %0d cycles", cycle_count); //261 cycles-> old o/p sationary implemenetaion
// Disable the enable signal

View file

@ -9,6 +9,9 @@ module processing_element #(
input enable,
input [1:0] mode,// 00: Output-Stationary, 01: Input-Stationary, 10: Weight-Stationary
input initialization,
input output_enable,
input [$clog2(OUTPUT_ROW+1)-1:0]pe_row_postion,
input [$clog2(OUTPUT_COL+1)-1:0]pe_col_postion,
// Inputs from the north and west
input [DATA_WIDTH-1:0] data_in_north, // A matrix element
@ -21,6 +24,9 @@ module processing_element #(
output reg enable_south,
output reg enable_east,
output reg output_enable_south,
output reg output_enable_east,
// Accumulated output
output reg [2*DATA_WIDTH-1:0] acc_out,
output reg valid,
@ -30,7 +36,7 @@ module processing_element #(
);
reg [2*DATA_WIDTH-1:0] acc;
reg [$clog2(COMMON_ROW_COL+1)-1:0] count_acc;
reg [$clog2(COMMON_ROW_COL+1):0] count_acc;//-1
reg [$clog2(OUTPUT_COL+1)-1:0] count_col;
reg [$clog2(OUTPUT_ROW+1)-1:0] count_row;
reg [DATA_WIDTH-1:0] data_in_west_reg, data_in_north_reg;
@ -50,6 +56,8 @@ always @(posedge clk or posedge rst) begin
data_in_north_reg <= 0;
enable_south <= 0;
enable_east <= 0;
output_enable_south <= 0;
output_enable_east <= 0;
end else begin
if (clear_acc) begin
acc <= 0;
@ -59,7 +67,10 @@ always @(posedge clk or posedge rst) begin
count_row <= 0;
data_in_west_reg <= 0;
data_in_north_reg <= 0;
// end else if ( (enable == 1) && (count_acc < COMMON_ROW_COL) ) begin///
enable_south <= 0;
enable_east <= 0;
output_enable_south <= 0;
output_enable_east <= 0;
end else begin
case(mode)
2'b00: begin//output staionary
@ -68,25 +79,47 @@ always @(posedge clk or posedge rst) begin
acc <= acc + data_in_north * data_in_west;
count_acc <= count_acc + 1;
// $display("north =%0h , west = %0h, count_acc %d, acc =%0h ", data_in_north, data_in_west, count_acc, acc);///////////////////
end
// $display("north =%0h , west = %0h, count_acc %d, acc =%0h ", data_in_north, data_in_west, count_acc, acc);///////////////////
// Forward the inputs
data_out_south <= data_in_north;
data_out_east <= data_in_west;
enable_south <= enable;
enable_east <= enable;
end else begin
enable_south <= enable;/////
enable_east <= enable;/////
end
// Display all the port values
// Forward the inputs
data_out_south <= data_in_north;
data_out_east <= data_in_west;
enable_south <= enable;
enable_east <= enable;
if(count_acc == COMMON_ROW_COL) begin ///COMMON_ROW_COL-1
// if(count_acc > COMMON_ROW_COL) begin ///COMMON_ROW_COL-1 //>
if ( (enable == 1) && (count_acc >= COMMON_ROW_COL-1)) begin
acc_out <= acc;
valid <= 1;
count_acc <= count_acc + 1;
if (OUTPUT_COL > OUTPUT_ROW) begin
data_out_south <= (output_enable == 1)? acc: data_in_north;
if (count_acc == (COMMON_ROW_COL+pe_row_postion+1)) begin //-1 reason fo r the XXX
output_enable_south <= output_enable;
end
if((pe_row_postion == 0) && (count_acc == (COMMON_ROW_COL+pe_row_postion+1)))begin
output_enable_east <= output_enable;
end else if (count_acc == (COMMON_ROW_COL+pe_row_postion)) begin
output_enable_east <= output_enable;
end
end else begin
data_out_east <= (output_enable == 1)? acc: data_in_west;
if (count_acc == (COMMON_ROW_COL+pe_col_postion+1)) begin //-1
output_enable_east <= output_enable;
end
if( (pe_col_postion == 0) && (count_acc == (COMMON_ROW_COL+pe_col_postion+1))) begin
output_enable_south <= output_enable;
end else if (count_acc == (COMMON_ROW_COL+pe_col_postion)) begin
output_enable_south <= output_enable;
end
end
if ( output_enable == 1)begin/////
valid <= 1;
end
count_acc <= count_acc + 1;
end
// $display("Time: %t, Data In North: %d, Data In West: %d, Data Out South: %d, Data Out East: %d, Accumulator: %d, Acc Out: %d",
// $time, data_in_north, data_in_west, data_out_south, data_out_east, acc, acc_out);
end
2'b01:begin //Input-Stationary
@ -101,13 +134,11 @@ always @(posedge clk or posedge rst) begin
count_col <= count_col + 1;
// $display("north =%0h , west = %0h, count_col %d, data_out_east =%0h ", data_in_north, data_in_west, count_col, data_out_east);///////////////////
// end
// Display all the port values
// Forward the inputs
data_out_south <= data_in_north;
//data_out_east <= acc[DATA_WIDTH-1:0];
enable_south <= enable;
enable_east <= enable;
end
@ -135,7 +166,6 @@ always @(posedge clk or posedge rst) begin
// Display all the port values
// Forward the inputs
//data_out_south <= acc[DATA_WIDTH-1:0];
data_out_east <= data_in_west;
enable_south <= enable;
enable_east <= enable;