diff --git a/2d_memory_block.v b/2d_memory_block.v index 7630ffc..aec8967 100644 --- a/2d_memory_block.v +++ b/2d_memory_block.v @@ -13,17 +13,14 @@ module matrix_memory_flexible #( input write_enable, input read_enable, input read_full_row, // 1 = read full row, 0 = read single element - input write_full_row, input [$clog2(ROWS)-1:0] row, input [$clog2(COLS)-1:0] col, // Data ports input [DATA_WIDTH-1:0] data_in, - input [DATA_WIDTH*ROWS-1:0] full_row_in, //COLS_USED output reg [DATA_WIDTH-1:0] data_out, output reg [DATA_WIDTH*ROWS-1:0] full_row_out, //COLS_USED - // Output valid signal output reg valid ); @@ -51,17 +48,17 @@ module matrix_memory_flexible #( always @(posedge clk) begin valid <= 0; // Default invalid unless read - if (write_full_row) begin - for (i = 0; i < ROWS; i = i + 1) begin //COLS_USED - mem[base_row_addr + i + col] <= full_row_in[i*DATA_WIDTH +: DATA_WIDTH]; - ///$display("Full Row Input(%d): %0h", i, full_row_out); - end - end else if (write_enable) begin + + if (write_enable) begin mem[addr_single] <= data_in; end - else if (read_full_row) begin + + if (read_full_row) begin for (i = 0; i < ROWS; i = i + 1) begin //COLS_USED + ////for (i = col; i < (COLS_USED + col); i = i + 1) begin + //for (i = integer'(col); i < (COLS_USED + integer'(col)); i = i + 1) begin full_row_out[i*DATA_WIDTH +: DATA_WIDTH] <= mem[base_row_addr + i + col]; + //// full_row_out[i*DATA_WIDTH +: DATA_WIDTH] <= mem[base_row_addr + i]; ///$display("Full Row Output(%d): %0h", i, full_row_out); end valid <= 1; diff --git a/Memory.v b/Memory.v index c69f6d5..c98fb3c 100644 --- a/Memory.v +++ b/Memory.v @@ -22,12 +22,10 @@ module top_module_mem #( input write_enable_1, write_enable_2, write_enable_3, input read_enable_1, read_enable_2, read_enable_3, input read_full_row_1, read_full_row_2, read_full_row_3, // NEW signals - input write_full_row_1, write_full_row_2, write_full_row_3, // NEW signals + // Data Inputs input [DATA_WIDTH-1:0] data_input_1, data_input_2, 
data_input_3, - input [DATA_WIDTH*ROWS1-1:0] full_row_input_1, - input [DATA_WIDTH*ROWS2-1:0] full_row_input_2, - input [DATA_WIDTH*ROWS3-1:0] full_row_input_3, + // Data Outputs output [DATA_WIDTH-1:0] data_output_1, data_output_2, data_output_3, output [DATA_WIDTH*ROWS1-1:0] full_row_output_1, // For full row reads //COLS1 @@ -42,15 +40,12 @@ module top_module_mem #( matrix_memory_flexible #( .ROWS(ROWS1), .COLS(COLS1), - .INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_input.hex"), - // .INIT_FILE("/mnt/d/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_input.hex"), + .INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_input.hex"), .DATA_WIDTH(DATA_WIDTH), .COLS_USED(COLS_USED) ) u_matrix_mem_1 ( .clk(clk), .write_enable(write_enable_1), - .write_full_row(write_full_row_1), - .full_row_in(full_row_input_1), .read_enable(read_enable_1), .read_full_row(read_full_row_1), .row(row_addr_1), @@ -66,15 +61,12 @@ module top_module_mem #( .ROWS(ROWS2), .COLS(COLS2), .INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_weight.hex"), - // .INIT_FILE("/mnt/d/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_weight.hex"), .DATA_WIDTH(DATA_WIDTH), .COLS_USED(COLS_USED) ) u_matrix_mem_2 ( .clk(clk), .write_enable(write_enable_2), .read_enable(read_enable_2), - .write_full_row(write_full_row_2), - .full_row_in(full_row_input_2), .read_full_row(read_full_row_2), .row(row_addr_2), .col(col_addr_2), @@ -88,16 +80,14 @@ module top_module_mem #( matrix_memory_flexible #( .ROWS(ROWS3), .COLS(COLS3), - .INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_temp_20_X_80.hex"), - // .INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_temp_20_X_80_ones.hex"), + // .INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_temp_20_X_80.hex"), + 
.INIT_FILE("D:/Project_verilog_item/project_files/hdl_verilog/phase4/matrix_data_temp_20_X_80_ones.hex"), .DATA_WIDTH(DATA_WIDTH), .COLS_USED(COLS_USED) ) u_matrix_mem_3 ( .clk(clk), .write_enable(write_enable_3), .read_enable(read_enable_3), - .write_full_row(write_full_row_3), - .full_row_in(full_row_input_3), .read_full_row(read_full_row_3), .row(row_addr_3), .col(col_addr_3), diff --git a/Memory_tb.v b/Memory_tb.v index 48b17ae..abf8d9d 100644 --- a/Memory_tb.v +++ b/Memory_tb.v @@ -25,20 +25,17 @@ module tb_top_module_mem; reg write_enable_1, write_enable_2, write_enable_3; reg read_enable_1, read_enable_2, read_enable_3; reg read_full_row_1, read_full_row_2, read_full_row_3; - reg write_full_row_1, write_full_row_2, write_full_row_3; reg [DATA_WIDTH-1:0] data_input_1, data_input_2, data_input_3; wire [DATA_WIDTH-1:0] data_output_1, data_output_2, data_output_3; - wire [DATA_WIDTH*ROWS1-1:0] full_row_output_1; //COLS1 + // wire [DATA_WIDTH*COLS1-1:0] full_row_output_1; //COLS1 + // wire [DATA_WIDTH*COLS2-1:0] full_row_output_2; //COLS2 + // wire [DATA_WIDTH*COLS3-1:0] full_row_output_3; //COLS3 + wire [DATA_WIDTH*ROWS1-1:0] full_row_output_1; //COLS1 wire [DATA_WIDTH*ROWS2-1:0] full_row_output_2; //COLS2 wire [DATA_WIDTH*ROWS3-1:0] full_row_output_3; //COLS3 - reg [DATA_WIDTH*ROWS1-1:0] full_row_input_1; //COLS1 - reg [DATA_WIDTH*ROWS2-1:0] full_row_input_2; //COLS2 - reg [DATA_WIDTH*ROWS3-1:0] full_row_input_3; //COLS3 wire valid_1, valid_2, valid_3; - integer i; - // Instantiate DUT top_module_mem #( .ROWS1(ROWS1), @@ -56,11 +53,9 @@ module tb_top_module_mem; .row_addr_3(row_addr_3), .col_addr_3(col_addr_3), .write_enable_1(write_enable_1), .write_enable_2(write_enable_2), .write_enable_3(write_enable_3), .read_enable_1(read_enable_1), .read_enable_2(read_enable_2), .read_enable_3(read_enable_3), - .write_full_row_1(write_full_row_1), .write_full_row_2(write_full_row_2), .write_full_row_3(write_full_row_3), .read_full_row_1(read_full_row_1), 
.read_full_row_2(read_full_row_2), .read_full_row_3(read_full_row_3), .data_input_1(data_input_1), .data_input_2(data_input_2), .data_input_3(data_input_3), .data_output_1(data_output_1), .data_output_2(data_output_2), .data_output_3(data_output_3), - .full_row_input_1(full_row_input_1), .full_row_input_2(full_row_input_2), .full_row_input_3(full_row_input_3), .full_row_output_1(full_row_output_1), .full_row_output_2(full_row_output_2), .full_row_output_3(full_row_output_3), .valid_1(valid_1), .valid_2(valid_2), .valid_3(valid_3) ); @@ -124,34 +119,13 @@ module tb_top_module_mem; read_enable_1 = 0; read_full_row_1 = 0; + // if (full_row_output_1[ (COLS1-1-3)*DATA_WIDTH +: DATA_WIDTH ] == 16'h1234) + // if (full_row_output_1[ (3)*DATA_WIDTH +: DATA_WIDTH ] == 16'h1234) + if (full_row_output_1[ 0*DATA_WIDTH +: DATA_WIDTH ] == 16'h1234) - // === Write full row into Matrix 1 === - write_full_row_1 = 1; - row_addr_1 = 5; - // Writing row 5 with pattern: 0x0100, 0x0200, 0x0300, ..., up to COLS1 - for (i = 0; i < COLS1; i = i + 1) begin - full_row_input_1[i*DATA_WIDTH +: DATA_WIDTH] = (i + 1) << 8; - end - #10; - write_full_row_1 = 0; - - // === Read full row back from Matrix 1 to verify === - read_enable_1 = 1; - read_full_row_1 = 1; - row_addr_1 = 5; - #10; - read_enable_1 = 0; - read_full_row_1 = 0; - - // === Verification === - for (i = 0; i < COLS1; i = i + 1) begin - if (full_row_output_1[i*DATA_WIDTH +: DATA_WIDTH] != ((i + 1) << 8)) begin - $display("FAIL: Full row write/read mismatch at column %0d: expected %h, got %h", i, (i + 1) << 8, full_row_output_1[i*DATA_WIDTH +: DATA_WIDTH]); - end else begin - $display("PASS: Full row element (%0d, %0d) correct: %h", 5, i, full_row_output_1[i*DATA_WIDTH +: DATA_WIDTH]); - end - end - + $display("PASS: Full row read from Matrix 1, element (2,3) is correct: %h", full_row_output_1[ 0*DATA_WIDTH +: DATA_WIDTH ]); + else + $display("FAIL: Full row read wrong value at (2,3)"); // === Finish Test === #20; diff --git 
a/PE_array_module.v b/PE_array_module.v index 3f85846..567fb1f 100644 --- a/PE_array_module.v +++ b/PE_array_module.v @@ -12,10 +12,9 @@ module pe_array #( input wire [PE_COLS*DATA_WIDTH-1:0] north_inputs, // Flattened inputs PE_ROWS input wire [PE_ROWS*DATA_WIDTH-1:0] west_inputs, // Flattened inputs PE_COLS input wire enable, - input wire output_enable, input wire initialization, output wire [OUTPUT_COL*OUTPUT_ROW-1:0] valid, - output reg [OUTPUT_ROW*DATA_WIDTH-1:0] acc_outputs // 2*DATA_WIDTH because of accumulation + output reg [OUTPUT_COL*OUTPUT_ROW*2*DATA_WIDTH-1:0] acc_outputs // 2*DATA_WIDTH because of accumulation ); // Internal wires for each PE @@ -30,12 +29,10 @@ module pe_array #( // Delayed north and west inputs reg [DATA_WIDTH-1:0] north_pipe [0:PE_COLS-1][0:PE_COLS-1]; // [which column][delay stages] reg [DATA_WIDTH-1:0] west_pipe [0:PE_ROWS-1][0:PE_ROWS-1]; // [which row][delay stages] - wire enable_var [0:PE_ROWS-1][0:PE_COLS-1]; - wire output_enable_var [0:PE_ROWS-1][0:PE_COLS-1]; + wire enable_var [0:PE_ROWS][0:PE_COLS]; reg [DATA_WIDTH-1:0] delayed_south[0:OUTPUT_COL-1][0:OUTPUT_COL-1]; // [row index][delay stages] reg [DATA_WIDTH-1:0] delayed_east[0:OUTPUT_ROW-1][0:OUTPUT_ROW-1]; // [row index][delay stages] - integer r, d; integer m, n; integer x, y, cycle ; @@ -53,10 +50,11 @@ module pe_array #( for (r = 0; r < PE_ROWS; r = r + 1) for (d = 0; d < PE_ROWS; d = d + 1) delayed_east[r][d] <= 0; - + // for (r = 0; r < (OUTPUT_COL*OUTPUT_ROW); r = r + 1) + // valid[r] = 0; acc_outputs = 0; cycle = 0; - end else begin/// use the clearing logic + end else begin // Update north pipeline for (m = 0; m < PE_COLS; m = m + 1) begin north_pipe[m][0] <= north_inputs[(m+1)*DATA_WIDTH-1 -: DATA_WIDTH]; @@ -69,7 +67,7 @@ module pe_array #( for (n = 1; n <= m; n = n + 1) west_pipe[m][n] <= west_pipe[m][n-1]; end - if((valid[0] == 1) && ((mode == 2'b01)||(mode == 2'b10))) begin //if condition + if(valid[0] == 1 ) begin //if condition for (r = 0; r < PE_COLS; r 
= r + 1) begin delayed_south[r][0] <= south_array[PE_ROWS-1][r];//[r][0]; // south_array from column 0 @@ -81,31 +79,20 @@ module pe_array #( for (d = 1; d <= PE_ROWS - r; d = d + 1)// r delayed_east[r][d] <= delayed_east[r][d-1]; end - - end else if((valid[((OUTPUT_COL>OUTPUT_ROW)?((OUTPUT_ROW/2)-1):((OUTPUT_COL/2)-1))] == 1) && (mode == 2'b00)) begin //if condition - for (r = 0; r < PE_COLS; r = r + 1) begin - delayed_south[r][0] <= south_array[PE_ROWS-1][r];//[r][0]; // south_array from column 0 - for (d = 1; d <= PE_COLS; d = d + 1)// d <= PE_COLS - r - delayed_south[r][d] <= delayed_south[r][d-1]; - end - for (r = 0; r < PE_ROWS; r = r + 1) begin - delayed_east[r][0] <= east_array[r][PE_COLS-1];//east_array[PE_ROWS-1][r];//[r][0]; // south_array from column 0 - for (d = 1; d <= PE_ROWS; d = d + 1)// d <=PE_ROWS - r - delayed_east[r][d] <= delayed_east[r][d-1]; - end - end + end //weight-stationary if (mode == 2'b10 && valid[(OUTPUT_COL*2)-1] == 1 && (cycle < OUTPUT_ROW)) begin // no need for all valid just last element in PE array should be high //(OUTPUT_COL*OUTPUT_ROW) + // for (cycle = 0; cycle < OUTPUT_ROW; cycle = cycle + 1) begin for (x = 0; x < PE_COLS; x = x + 1) begin // Now filling for a particular cycle and column - acc_outputs[((x + cycle*OUTPUT_COL) + 1)*DATA_WIDTH -1 -: DATA_WIDTH] = + acc_outputs[((x + cycle*OUTPUT_COL) + 1)*2*DATA_WIDTH -1 -: 2*DATA_WIDTH] = { {(DATA_WIDTH){1'b0}}, delayed_south[x][PE_COLS-x-1] };//cycle, PE_COLS-x-1 // Display what's being assigned $display("Cycle: %0d, Col: %0d, Index: %0d to %0d, delayed_south(%0d,%0d):= %0d", cycle, x, - ((x + cycle * OUTPUT_COL) + 1) * DATA_WIDTH - 1, - ((x + cycle * OUTPUT_COL) + 1) * DATA_WIDTH - DATA_WIDTH, + ((x + cycle * OUTPUT_COL) + 1) * 2 * DATA_WIDTH - 1, + ((x + cycle * OUTPUT_COL) + 1) * 2 * DATA_WIDTH - 2 * DATA_WIDTH, x, // First argument for the first %0d (PE_COLS - x - 1), // Second argument for the second %0d //x, @@ -116,12 +103,13 @@ module pe_array #( end cycle = cycle + 
1; end else if (mode == 2'b01 && valid[(OUTPUT_COL*2)] == 1 && (cycle < OUTPUT_COL)) begin // input-stationary (OUTPUT_COL*2) + // for (cycle = 0; cycle < OUTPUT_ROW; cycle = cycle + 1) begin for (x = 0; x < PE_ROWS; x = x + 1) begin // Now filling for a particular cycle and column - acc_outputs[((x*OUTPUT_COL + cycle) + 1)*DATA_WIDTH -1 -: DATA_WIDTH] = + acc_outputs[((x*OUTPUT_COL + cycle) + 1)*2*DATA_WIDTH -1 -: 2*DATA_WIDTH] = { {(DATA_WIDTH){1'b0}}, delayed_east[x][PE_ROWS-x-1] };//cycle, PE_COLS-x-1 // Display what's being assigned - /* $display("Cycle: %0d, Col: %0d, Index: %0d to %0d, delayed_east(%0d,%0d):= %0d", + $display("Cycle: %0d, Col: %0d, Index: %0d to %0d, delayed_east(%0d,%0d):= %0d", cycle, x, ((x*OUTPUT_COL + cycle) + 1) * 2 * DATA_WIDTH - 1, @@ -131,56 +119,16 @@ module pe_array #( //x, delayed_east[x][PE_ROWS - x - 1] // Third argument for the third %0d ); - */ + // end - end + end cycle = cycle + 1; - end else if (mode == 2'b00 && (valid[OUTPUT_COL*(OUTPUT_ROW)-(((OUTPUT_COL>OUTPUT_ROW)?((OUTPUT_ROW/2)):((OUTPUT_COL/2))))] == 1) && (cycle < ((OUTPUT_COL > OUTPUT_ROW)?OUTPUT_ROW:OUTPUT_COL))) begin // output-stationary for HDPE array module - - if (OUTPUT_COL > OUTPUT_ROW) begin - for (x = 0; x < PE_COLS; x = x + 1) begin - // Now filling for a particular cycle and column - acc_outputs[((x ) + 1)*DATA_WIDTH -1 -: DATA_WIDTH] = - { {(DATA_WIDTH){1'b0}}, delayed_south[x][PE_COLS-x-1] };//cycle, PE_COLS-x-1 - // Display what's being assigned - ///* - $display("o/p_Cycle: %0d, Col: %0d, Index: %0d to %0d, delayed_south(%0d,%0d):= %0h", - cycle, - x, - ((x ) + 1) * DATA_WIDTH - 1, - ((x ) + 1) * DATA_WIDTH - DATA_WIDTH, - x, // First argument for the first %0d - (PE_COLS - x - 1), // Second argument for the second %0d - //x, - delayed_south[x][PE_COLS - x - 1] // Third argument for the third %0d - ); - //*/ - end - cycle = cycle + 1; - end else begin - for (x = 0; x < PE_ROWS; x = x + 1) begin - // Now filling for a particular cycle and column - 
acc_outputs[((x ) + 1)*DATA_WIDTH -1 -: DATA_WIDTH] = - { {(DATA_WIDTH){1'b0}}, delayed_east[x][PE_ROWS-x-1] };//cycle, PE_COLS-x-1 - // Display what's being assigned - // /* - $display("o/p_Cycle: %0d, Row: %0d, Index: %0d to %0d, delayed_east(%0d,%0d):= %0h", - cycle, - x, - ((x) + 1) * DATA_WIDTH - 1, - ((x ) + 1) * DATA_WIDTH - DATA_WIDTH, - x, // First argument for the first %0d - (PE_ROWS - x - 1), // Second argument for the second %0d - //x, - delayed_east[x][PE_ROWS - x - 1] // Third argument for the third %0d - ); - // */ - end - cycle = cycle + 1; - end - end + end + + end end + genvar i, j; generate for (i = 0; i < PE_ROWS; i = i + 1) begin : row_gen //ROW @@ -195,33 +143,31 @@ module pe_array #( .rst(rst), .initialization(initialization), .enable((j==0) && (i==0) ? enable : enable_var[i][j]), - .output_enable((j==0) && (i==0) ? output_enable : output_enable_var[i][j]), .clear_acc(1'b0), // No accumulator clearing .data_in_north((i == 0) ? (initialization ? north_inputs[(j+1)*DATA_WIDTH-1 -: DATA_WIDTH] : north_pipe[j][j]) : south_array[i-1][j]), .data_in_west((j == 0) ? (initialization ? west_inputs[(i+1)*DATA_WIDTH-1 -: DATA_WIDTH] : west_pipe[i][i]) : east_array[i][j-1]), - .pe_row_postion(i[$clog2(OUTPUT_ROW+1)-1:0]), - .pe_col_postion(j[$clog2(OUTPUT_COL+1)-1:0]), - .output_enable_south(output_enable_var[i+1][j]), - .output_enable_east(output_enable_var[i][j+1]), - .enable_south(enable_var[i+1][j]), - .enable_east(enable_var[i][j+1]), + // .data_in_north((i == 0) ? north_pipe[j][j] : south_array[i-1][j]), + // .data_in_west((j == 0) ? 
west_pipe[i][i] : east_array[i][j-1]), .data_out_south(south_array[i][j]), .data_out_east(east_array[i][j]), + .enable_south(enable_var[i+1][j]), + .enable_east(enable_var[i][j+1]), .mode(mode), .acc_out(output_array[i][j]), .valid(valid[(i*PE_COLS) + j]) ); + /////// assign acc_outputs[((i*PE_COLS+j)+1)*2*DATA_WIDTH-1 -: 2*DATA_WIDTH] = output_array[i][j]; end end endgenerate -///* +/* //redundent logic for debuging simplicity genvar k; generate @@ -232,7 +178,19 @@ module pe_array #( assign east_array_end[k] = east_array[k][PE_COLS-1]; // Rightmost column end endgenerate -//*/ +*/ + + + // === Accumulator Output Assignment with Conditional Capture === + always @(*) begin // NOTE(review): level-sensitive block, not clocked; TODO confirm whether capture at the rising edge of the clock was intended + + if (mode == 2'b00) begin + for (x = 0; x < PE_ROWS; x = x + 1) + for (y = 0; y < PE_COLS; y = y + 1) + acc_outputs[((x*OUTPUT_COL + y)+1)*2*DATA_WIDTH-1 -: 2*DATA_WIDTH] = output_array[x][y]; + end + end + endmodule diff --git a/PE_array_tb.v b/PE_array_tb.v index 070d7ea..725d2eb 100644 --- a/PE_array_tb.v +++ b/PE_array_tb.v @@ -11,12 +11,15 @@ module pe_array_tb; reg rst; reg [PE_COLS*DATA_WIDTH-1:0] north_inputs; //PE_COLS + // reg [PE_COLS*DATA_WIDTH-1:0] north_inputs; //PE_COLS reg [PE_ROWS*DATA_WIDTH-1:0] west_inputs; //PE_ROWS reg enable; // Enable signal for PE 0,0 + // wire [PE_ROWS*PE_COLS*2*DATA_WIDTH-1:0] acc_outputs; // Accumulated outputs from all PEs wire [OUTPUT_COL*OUTPUT_ROW*2*DATA_WIDTH-1:0] acc_outputs; // Accumulated outputs from all PEs +// wire [PE_ROWS*PE_COLS-1:0] valid; // Valid signal for each PE wire [OUTPUT_COL*OUTPUT_ROW-1:0] valid; // Valid signal for each PE - reg initialization, output_enable; + reg initialization; integer i, j; integer cycle_count; @@ -26,7 +29,7 @@ module pe_array_tb; // Define your input matrices reg [DATA_WIDTH-1:0] matrix_A [0:OUTPUT_ROW-1][0:COMMON_ROW_COL-1]; // 4x4 matrix reg [DATA_WIDTH-1:0] matrix_B [0:COMMON_ROW_COL-1][0:OUTPUT_COL-1]; // 4x5 matrix - reg [DATA_WIDTH-1:0] expected_C 
[0:OUTPUT_ROW-1][0:OUTPUT_COL-1]; // 4x5 output matrix (for checking) + reg [DATA_WIDTH-1:0] expected_C [0:PE_ROWS-1][0:PE_COLS-1]; // 4x5 output matrix (for checking) pe_array #( .DATA_WIDTH(DATA_WIDTH), @@ -43,7 +46,6 @@ module pe_array_tb; .mode(mode), .initialization(initialization), .enable(enable), - .output_enable(output_enable), .valid(valid), .acc_outputs(acc_outputs) ); @@ -60,7 +62,6 @@ module pe_array_tb; west_inputs = 0; enable = 0; // Initially, disable all PEs initialization = 0; - output_enable = 0; cycle_count = 0; #20; rst = 0; @@ -91,10 +92,10 @@ module pe_array_tb; // Clear accumulators (after reset) #10; // Compute expected output C = A * B - for (i = 0; i < OUTPUT_ROW; i = i + 1) begin - for (j = 0; j < OUTPUT_COL; j = j + 1) begin + for (i = 0; i < PE_ROWS; i = i + 1) begin + for (j = 0; j < PE_COLS; j = j + 1) begin expected_C[i][j] = 0; - for (p = 0; p < OUTPUT_ROW; p = p + 1) begin // This should be based on PE_ROWS + for (p = 0; p < PE_ROWS; p = p + 1) begin // This should be based on PE_ROWS expected_C[i][j] = expected_C[i][j] + matrix_A[i][p] * matrix_B[p][j]; #20; // or #40 based on PE array timing @@ -155,7 +156,6 @@ module pe_array_tb; */ - /* //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // enable = 1;///////////////////////// @@ -205,10 +205,9 @@ module pe_array_tb; #10; // adjust based on PE array latency end */ -///* ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//Output stationary - for (cycle_count = 0; cycle_count < (PE_ROWS + PE_COLS +4); cycle_count = cycle_count + 1) begin//-1 + + for (cycle_count = 0; cycle_count < (PE_ROWS + PE_COLS +2); cycle_count = cycle_count + 1) begin//-1 enable = 1; north_inputs = 0; west_inputs = 0; @@ -238,10 +237,6 @@ module pe_array_tb; $display(" NORTH: PE(0,%0d): B[%0d][%0d] = %0h", j, cycle_count, j, 
matrix_B[cycle_count][j]); end end - if(cycle_count >= COMMON_ROW_COL) begin - output_enable <= 1; - $display("output_enable = %h", output_enable); - end $display("west_inputs = %h", west_inputs); $display("north_inputs = %h", north_inputs); @@ -250,10 +245,44 @@ module pe_array_tb; #10; // adjust based on PE array latency end -//*/ + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - + // for (cycle_count = 0; cycle_count < (PE_ROWS + PE_COLS -1); cycle_count = cycle_count + 1) begin + + // north_inputs = 0; + // west_inputs = 0; + // //enable = 0; + + // $display("\n==================Cycle %0d feeding start===================:", cycle_count); + + // for (i = 0; i < PE_ROWS; i = i + 1) begin + // for (j = 0; j < PE_COLS; j = j + 1) begin + // if (i + j == cycle_count) begin + // enable[i*PE_COLS + j] = 1'b1; + + // if (i < PE_ROWS && (cycle_count - i) < COMMON_ROW_COL && (cycle_count - i) >= 0) begin + // west_inputs[(i+1)*DATA_WIDTH-1 -: DATA_WIDTH] = matrix_A[i][cycle_count - i]; + // end + + // if (j < PE_COLS && (cycle_count - j) < COMMON_ROW_COL && (cycle_count - j) >= 0) begin + // north_inputs[(j+1)*DATA_WIDTH-1 -: DATA_WIDTH] = matrix_B[cycle_count - j][j]; + // end + + // $display(" PE(%0d,%0d): A[%0d][%0d]=%0h, B[%0d][%0d]=%0h", i, j, i, cycle_count-i, matrix_A[i][cycle_count-i], cycle_count-j, j, matrix_B[cycle_count-j][j]); + // end + // end + // end + + // $display("west_inputs = %h ; north_inputs = %h", west_inputs, north_inputs); + // $display("==================Cycle %0d feeding end===================:", cycle_count); + + // #10; // or #40 based on PE array timing + // end + + + + // Wait for operations to complete #100; @@ -273,8 +302,8 @@ module pe_array_tb; // Display expected result matrix $display("\nExpected matrix:"); - for (i = 0; i < OUTPUT_ROW; i = i + 1) begin - for (j = 0; j < OUTPUT_COL; j = j + 1) begin + for (i = 0; i < PE_ROWS; i = i + 1) begin + for 
(j = 0; j < PE_COLS; j = j + 1) begin $write("%0d ", expected_C[i][j]); end $display(); diff --git a/hdpe_unit_new.v b/hdpe_unit_new.v index 1e02447..db2234a 100644 --- a/hdpe_unit_new.v +++ b/hdpe_unit_new.v @@ -1,7 +1,5 @@ module matrix_multiplication_unit_new #( parameter DATA_WIDTH = 16, - parameter MEM_ROWS = 20,//20 ->5bits //16 - parameter MEM_COLS = 80,//80 ->7bits //32SS parameter PE_ROWS = 16, parameter PE_COLS = 32, parameter COMMON_ROW_COL = 4, @@ -21,35 +19,32 @@ module matrix_multiplication_unit_new #( input valid_mem_input_A, input valid_mem_input_B, - input [$clog2(MEM_ROWS)-1:0] rows_start_add_reading_A, - input [$clog2(MEM_COLS)-1:0] cols_start_add_reading_A, - input [$clog2(MEM_ROWS)-1:0] rows_start_add_reading_B, - input [$clog2(MEM_COLS)-1:0] cols_start_add_reading_B, - input [$clog2(MEM_ROWS)-1:0] rows_start_add_writing, - input [$clog2(MEM_COLS)-1:0] cols_start_add_writing, + input [$clog2(PE_ROWS)-1:0] rows_start_add_reading_A, + input [$clog2(PE_COLS)-1:0] cols_start_add_reading_A, + input [$clog2(PE_ROWS)-1:0] rows_start_add_reading_B, + input [$clog2(PE_COLS)-1:0] cols_start_add_reading_B, + input [$clog2(PE_ROWS)-1:0] rows_start_add_writing, + input [$clog2(PE_COLS)-1:0] cols_start_add_writing, - input [$clog2(MEM_ROWS)-1:0] rows_size_reading_A, - input [$clog2(MEM_COLS)-1:0] cols_size_reading_A, - input [$clog2(MEM_ROWS)-1:0] rows_size_reading_B, - input [$clog2(MEM_COLS)-1:0] cols_size_reading_B, - + input [$clog2(PE_ROWS)-1:0] rows_size_reading_A, + input [$clog2(PE_COLS)-1:0] cols_size_reading_A, + input [$clog2(PE_ROWS)-1:0] rows_size_reading_B, + input [$clog2(PE_COLS)-1:0] cols_size_reading_B, output reg done, output reg read_full_row_A, output reg read_full_row_B, - output reg write_full_row_out, - output reg [$clog2(MEM_ROWS)-1:0] row_addr_A, - output reg [$clog2(MEM_COLS)-1:0] col_addr_A, - output reg [$clog2(MEM_ROWS)-1:0] row_addr_B, - output reg [$clog2(MEM_COLS)-1:0] col_addr_B, - output reg [$clog2(MEM_ROWS)-1:0] 
row_addr_out, - output reg [$clog2(MEM_COLS)-1:0] col_addr_out, + output reg [$clog2(PE_ROWS)-1:0] row_addr_A, + output reg [$clog2(PE_COLS)-1:0] col_addr_A, + output reg [$clog2(PE_ROWS)-1:0] row_addr_B, + output reg [$clog2(PE_COLS)-1:0] col_addr_B, + output reg [$clog2(PE_ROWS)-1:0] row_addr_out, + output reg [$clog2(PE_COLS)-1:0] col_addr_out, output reg read_enable_A, output reg read_enable_B, output reg write_enable_out, - output reg [OUTPUT_ROW*DATA_WIDTH-1:0] Full_row_out, output reg [DATA_WIDTH-1:0] data_out ); @@ -62,9 +57,7 @@ parameter IDLE = 0, COMPUTE = 5, WRITE = 6, DONE = 7, - BEFORE_COMPUTE = 8, - LOAD_ROW_A_COL_B = 9, - WAIT_VALID_ROW_A_COL_B = 10; + BEFORE_COMPUTE = 8; reg [3:0] current_state, next_state; @@ -73,7 +66,7 @@ reg enable_pe_array; wire [OUTPUT_ROW*OUTPUT_COL-1:0] valid_pe_array; reg [DATA_WIDTH*OUTPUT_COL-1:0] north_inputs; reg [DATA_WIDTH*PE_ROWS-1:0] west_inputs; -wire [OUTPUT_ROW*DATA_WIDTH-1:0] acc_outputs; +wire [OUTPUT_ROW*OUTPUT_COL*2*DATA_WIDTH-1:0] acc_outputs; reg [9:0] compute_counter; reg [9:0] write_counter, read_counter; @@ -81,7 +74,6 @@ reg [9:0] write_counter, read_counter; wire [2*DATA_WIDTH-1:0] selected_accum_value; reg initialization; -reg output_enable; // Instantiate PE Array @@ -99,12 +91,13 @@ pe_array #( .initialization(initialization), .north_inputs(north_inputs), .west_inputs(west_inputs), - .output_enable(output_enable), .enable(enable_pe_array), .valid(valid_pe_array), .acc_outputs(acc_outputs) ); +// Assign selected output slice for writing +assign selected_accum_value = acc_outputs[(write_counter+1)*2*DATA_WIDTH-1 -: 2*DATA_WIDTH]; // FSM Sequential always @(posedge clk or posedge rst) begin @@ -116,7 +109,6 @@ end // FSM Next-State Logic always @(*) begin - case (current_state) IDLE: next_state = enable ? LOAD_ROW_A : IDLE; LOAD_ROW_A: next_state = WAIT_VALID_ROW_A; @@ -124,13 +116,12 @@ always @(*) begin LOAD_COL_B: next_state = WAIT_VALID_COL_B; WAIT_VALID_COL_B: next_state = valid_mem_input_B ? 
BEFORE_COMPUTE : WAIT_VALID_COL_B;//COMPUTE BEFORE_COMPUTE : next_state = COMPUTE; - COMPUTE: next_state = (read_counter < COMMON_ROW_COL) ? LOAD_ROW_A :(compute_counter >= PE_ROWS + PE_COLS + COMMON_ROW_COL+1) ? WRITE : COMPUTE; // PE_ROWS+PE_COLS+4 - WRITE: next_state = (write_counter >= ((OUTPUT_COL>OUTPUT_ROW)?OUTPUT_ROW-1:OUTPUT_COL-1)) ? DONE : WRITE; + COMPUTE: next_state = (read_counter < COMMON_ROW_COL) ? LOAD_ROW_A :(compute_counter >= PE_ROWS + OUTPUT_COL - 1) ? WRITE : COMPUTE; // + // COMPUTE: next_state = (compute_counter >= PE_ROWS + OUTPUT_COL - 1) ? WRITE : COMPUTE; // + WRITE: next_state = (write_counter >= (PE_ROWS * OUTPUT_COL)) ? DONE : WRITE; DONE: next_state = DONE; default: next_state = IDLE; endcase - - end // FSM Outputs @@ -145,9 +136,6 @@ always @(posedge clk or posedge rst) begin write_enable_out <= 0; read_counter <= 0; initialization <= 0; - output_enable <= 0; - write_full_row_out <= 0; - Full_row_out <= 0; end else begin case (current_state) IDLE: begin @@ -159,8 +147,6 @@ always @(posedge clk or posedge rst) begin done <= 0; write_enable_out <= 0; read_counter <= 0; - write_full_row_out <= 0; - Full_row_out <= 0; $display("[IDLE] Waiting for enable..."); end @@ -168,7 +154,6 @@ always @(posedge clk or posedge rst) begin row_addr_A <= rows_start_add_reading_A; col_addr_A <= cols_start_add_reading_A + read_counter;// read_full_row_A <= 1; - enable_pe_array <= 0; $display("[LOAD_ROW_A] Reading full row A."); end @@ -193,7 +178,6 @@ always @(posedge clk or posedge rst) begin read_counter <= read_counter + 1; end end - BEFORE_COMPUTE: begin north_inputs <= full_row_B[OUTPUT_COL*DATA_WIDTH-1:0]; west_inputs <= full_row_A; @@ -206,24 +190,21 @@ always @(posedge clk or posedge rst) begin enable_pe_array <= 1; compute_counter <= compute_counter + 1; $display("[COMPUTE] Cycle %0d / %0d", compute_counter, PE_ROWS+OUTPUT_COL-1); - if(compute_counter >= COMMON_ROW_COL) begin - output_enable <= 1; - $display("output_enable = %h", output_enable); 
- end end WRITE: begin enable_pe_array <= 1; + write_enable_out <= valid_pe_array[write_counter]; - row_addr_out <= (write_counter ) + rows_start_add_writing; - col_addr_out <= cols_start_add_writing;///// + row_addr_out <= (write_counter / OUTPUT_COL) + rows_start_add_writing; + col_addr_out <= (write_counter % OUTPUT_COL) + cols_start_add_writing;///// - - Full_row_out <= acc_outputs;//// - write_full_row_out <= 1; + // row_addr_out <= (write_counter / OUTPUT_COL) + rows_start_add_writing; + // col_addr_out <= (write_counter % OUTPUT_COL) + cols_start_add_writing;///// + data_out <= selected_accum_value[DATA_WIDTH-1:0]; $display("[WRITE] Writing output[%0d][%0d] = %0h | Valid = %b", - (write_counter), 0, acc_outputs, valid_pe_array[write_counter]); + (write_counter / OUTPUT_COL), (write_counter % OUTPUT_COL), data_out, valid_pe_array[write_counter]); // $display("[WRITE] Writing output[%0d][%0d] = %0h | Valid = %b", // row_addr_out, col_addr_out, data_out, valid_pe_array[write_counter]); @@ -234,7 +215,6 @@ always @(posedge clk or posedge rst) begin done <= 1; enable_pe_array <= 0; compute_counter <= 0; - write_full_row_out <= 0; $display("[DONE] Matrix multiplication completed."); end diff --git a/hdpe_unit_tb.v b/hdpe_unit_tb.v index 9c470fa..bc0c793 100644 --- a/hdpe_unit_tb.v +++ b/hdpe_unit_tb.v @@ -4,10 +4,10 @@ module tb_matrix_multiplication; // Parameters parameter DATA_WIDTH = 16; - parameter MEM_ROWS = 20;//20 ->5bits //16 - parameter MEM_COLS = 80;//80 ->7bits //32SS - parameter PE_ROWS = 20;// - parameter PE_COLS = 10;// + parameter ROWS = 20;//20 ->5bits //16 + parameter COLS = 80;//80 ->7bits //32SS + /// parameter COLS_USED = 4;//3 // this given to rows_size_reading_B & cols_size_reading_A + // parameter OUTPUT_COLS = 10; parameter COMMON_ROW_COL = 4; parameter OUTPUT_COL = 10; parameter OUTPUT_ROW = 20; @@ -26,57 +26,57 @@ module tb_matrix_multiplication; reg valid_mem_input_B; // Address offset configuration - reg [$clog2(MEM_ROWS)-1:0] 
rows_start_add_reading_A; - reg [$clog2(MEM_COLS)-1:0] cols_start_add_reading_A; - reg [$clog2(MEM_ROWS)-1:0] rows_start_add_reading_B; - reg [$clog2(MEM_COLS)-1:0] cols_start_add_reading_B; - reg [$clog2(MEM_ROWS)-1:0] rows_start_add_writing; - reg [$clog2(MEM_COLS)-1:0] cols_start_add_writing; + reg [$clog2(ROWS)-1:0] rows_start_add_reading_A; + reg [$clog2(COLS)-1:0] cols_start_add_reading_A; + reg [$clog2(ROWS)-1:0] rows_start_add_reading_B; + reg [$clog2(COLS)-1:0] cols_start_add_reading_B; + reg [$clog2(ROWS)-1:0] rows_start_add_writing; + reg [$clog2(COLS)-1:0] cols_start_add_writing; // Size configuration - reg [$clog2(MEM_ROWS)-1:0] rows_size_reading_A; - reg [$clog2(MEM_COLS)-1:0] cols_size_reading_A; - reg [$clog2(MEM_ROWS)-1:0] rows_size_reading_B; - reg [$clog2(MEM_COLS)-1:0] cols_size_reading_B; + reg [$clog2(ROWS)-1:0] rows_size_reading_A; + reg [$clog2(COLS)-1:0] cols_size_reading_A; + reg [$clog2(ROWS)-1:0] rows_size_reading_B; + reg [$clog2(COLS)-1:0] cols_size_reading_B; // Outputs wire done; - wire [$clog2(MEM_ROWS)-1:0] row_addr_A; - wire [$clog2(MEM_COLS)-1:0] col_addr_A; - wire [$clog2(MEM_ROWS)-1:0] row_addr_B; - wire [$clog2(MEM_COLS)-1:0] col_addr_B; - wire [$clog2(MEM_ROWS)-1:0] row_addr_out; - wire [$clog2(MEM_COLS)-1:0] col_addr_out; + wire [$clog2(ROWS)-1:0] row_addr_A; + wire [$clog2(COLS)-1:0] col_addr_A; + wire [$clog2(ROWS)-1:0] row_addr_B; + wire [$clog2(COLS)-1:0] col_addr_B; + wire [$clog2(ROWS)-1:0] row_addr_out; + wire [$clog2(COLS)-1:0] col_addr_out; wire read_enable_A; wire read_enable_B; wire write_enable_out; - wire read_full_row_A, read_full_row_B,write_full_row_1, write_full_row_2, write_full_row_3; + wire read_full_row_A, read_full_row_B; reg read_full_row_3; wire [DATA_WIDTH-1:0] data_out; // Unused memory interfaces - reg [$clog2(MEM_ROWS)-1:0] row_addr_1, row_addr_2, row_addr_3; //reg - reg [$clog2(MEM_COLS)-1:0] col_addr_1, col_addr_2, col_addr_3; + reg [$clog2(ROWS)-1:0] row_addr_1, row_addr_2, row_addr_3; //reg + 
reg [$clog2(COLS)-1:0] col_addr_1, col_addr_2, col_addr_3; wire [DATA_WIDTH-1:0] data_out_1, data_out_2, data_in_3; reg [DATA_WIDTH-1:0] data_out_3; wire valid_1, valid_2, valid_3; reg write_enable_3, read_enable_3;//reg is instead of wire so that multiple inputs can be driven through them. // wire [7:0] no_rows1, no_cols1; - wire [DATA_WIDTH*PE_ROWS-1:0] full_row_output_1, full_row_output_2, full_row_output_3, full_row_input_1, full_row_input_2, full_row_input_3; /// - reg [DATA_WIDTH*PE_ROWS-1:0]full_row_A, full_row_B;//reg + wire [DATA_WIDTH*ROWS-1:0] full_row_output_1, full_row_output_2, full_row_output_3; /// + reg [DATA_WIDTH*ROWS-1:0]full_row_A, full_row_B;//reg integer cycle_count; // Instantiate the memory module top_module_mem #( - .ROWS1(MEM_ROWS), - .COLS1(MEM_COLS), - .ROWS2(MEM_ROWS), - .COLS2(MEM_COLS), - .ROWS3(MEM_ROWS), - .COLS3(MEM_COLS), + .ROWS1(ROWS), + .COLS1(COLS), + .ROWS2(ROWS), + .COLS2(COLS), + .ROWS3(ROWS), + .COLS3(COLS), .DATA_WIDTH(DATA_WIDTH), .COLS_USED(COMMON_ROW_COL) ) memory_inst ( @@ -91,9 +91,7 @@ module tb_matrix_multiplication; .data_output_1(data_out_1), .valid_1(valid_1), .read_full_row_1(1'b0), - .full_row_output_1(full_row_output_1), - .full_row_input_1(full_row_input_1),//// - .write_full_row_1(write_full_row_1),/// + .full_row_output_1(full_row_output_1), // Memory 2 (not used) .row_addr_2(row_addr_2), @@ -105,8 +103,6 @@ module tb_matrix_multiplication; .valid_2(valid_2), .read_full_row_2(1'b0), .full_row_output_2(full_row_output_2), - .full_row_input_2(full_row_input_2),//// - .write_full_row_2(write_full_row_2),/// // Memory 3 (used for matrix multiplication) .row_addr_3(row_addr_3), @@ -117,20 +113,18 @@ module tb_matrix_multiplication; .data_output_3(data_in_3), .valid_3(valid_3), .read_full_row_3(read_full_row_3), - .full_row_output_3(full_row_output_3), - .full_row_input_3(full_row_input_3),//// - .write_full_row_3(write_full_row_3)/// + .full_row_output_3(full_row_output_3) ); // Instantiate the matrix 
multiplication unit +/// matrix_multiplication_unit #( matrix_multiplication_unit_new #( +/////matrix_multiplication_unit_new_v1 #( .DATA_WIDTH(DATA_WIDTH), - .MEM_ROWS(MEM_ROWS), - .MEM_COLS(MEM_COLS), - .PE_ROWS(PE_ROWS), - .PE_COLS(PE_COLS), + .PE_ROWS(ROWS), + .PE_COLS(COLS), .COMMON_ROW_COL(COMMON_ROW_COL), - // .OUTPUT_MEM_COLS(OUTPUT_MEM_COLS), + // .OUTPUT_COLS(OUTPUT_COLS), .OUTPUT_COL(OUTPUT_COL), .OUTPUT_ROW(OUTPUT_ROW) ) mmu_inst ( @@ -173,9 +167,7 @@ module tb_matrix_multiplication; .full_row_A(full_row_A), .full_row_B(full_row_B), .read_full_row_A(read_full_row_A), - .read_full_row_B(read_full_row_B), - .write_full_row_out(write_full_row_3), - .Full_row_out(full_row_input_3) + .read_full_row_B(read_full_row_B) ); // Clock Generation @@ -205,6 +197,8 @@ module tb_matrix_multiplication; full_row_B = full_row_output_3;//<= changed to = ///inputs to hdpe + // full_row_A <= full_row_output_3;//<= + // full_row_B <= full_row_output_3; valid_mem_input_A = valid_3; valid_mem_input_B = valid_3; @@ -227,6 +221,8 @@ module tb_matrix_multiplication; // Initialize control signals rst = 1; enable = 0; + //mode = 2'b01; // input-Stationary mode + // mode = 2'b10; // weight-Stationary mode mode = 2'b00; // Output-Stationary mode // Wait for a few clock cycles @@ -237,7 +233,10 @@ module tb_matrix_multiplication; #20; //assigning register properly //inputs to hdpe - + // data_input_A <= data_in_3; + // data_input_B <= data_in_3; + // valid_mem_input_A <= valid_3; + // valid_mem_input_B <= valid_3; rows_start_add_reading_A <= 5'b0; cols_start_add_reading_A <= 7'b0; rows_start_add_reading_B <= 5'b0; @@ -250,7 +249,17 @@ module tb_matrix_multiplication; rows_size_reading_B <= COMMON_ROW_COL; cols_size_reading_B <= 7'd9; //(B-> 10X4)^T //outputs to hdpe - + // row_addr_3 <= row_addr_A; + // col_addr_3 <= col_addr_A; + // row_addr_3 <= row_addr_B; + // col_addr_3 <= col_addr_B; + // row_addr_3 <= row_addr_out; + // col_addr_3 <= row_addr_out; + // read_enable_3 <= 
read_enable_A; + // read_enable_3 <= read_enable_B; + // write_enable_3 <= write_enable_out; + // data_in_3 <= data_out; + // Enable the matrix multiplication enable = 1; @@ -261,7 +270,7 @@ module tb_matrix_multiplication; wait (done); // Print number of cycles taken - $display("Operation completed in %0d cycles", cycle_count); //261 cycles-> old o/p sationary implemenetaion + $display("Operation completed in %0d cycles", cycle_count); //1261 cycles-> old o/p sationary implemenetaion // Disable the enable signal diff --git a/single_PE_module.v b/single_PE_module.v index c75dd8d..0509f4c 100644 --- a/single_PE_module.v +++ b/single_PE_module.v @@ -9,9 +9,6 @@ module processing_element #( input enable, input [1:0] mode,// 00: Output-Stationary, 01: Input-Stationary, 10: Weight-Stationary input initialization, - input output_enable, - input [$clog2(OUTPUT_ROW+1)-1:0]pe_row_postion, - input [$clog2(OUTPUT_COL+1)-1:0]pe_col_postion, // Inputs from the north and west input [DATA_WIDTH-1:0] data_in_north, // A matrix element @@ -24,9 +21,6 @@ module processing_element #( output reg enable_south, output reg enable_east, - output reg output_enable_south, - output reg output_enable_east, - // Accumulated output output reg [2*DATA_WIDTH-1:0] acc_out, output reg valid, @@ -36,7 +30,7 @@ module processing_element #( ); reg [2*DATA_WIDTH-1:0] acc; -reg [$clog2(COMMON_ROW_COL+1):0] count_acc;//-1 +reg [$clog2(COMMON_ROW_COL+1)-1:0] count_acc; reg [$clog2(OUTPUT_COL+1)-1:0] count_col; reg [$clog2(OUTPUT_ROW+1)-1:0] count_row; reg [DATA_WIDTH-1:0] data_in_west_reg, data_in_north_reg; @@ -56,8 +50,6 @@ always @(posedge clk or posedge rst) begin data_in_north_reg <= 0; enable_south <= 0; enable_east <= 0; - output_enable_south <= 0; - output_enable_east <= 0; end else begin if (clear_acc) begin acc <= 0; @@ -67,10 +59,7 @@ always @(posedge clk or posedge rst) begin count_row <= 0; data_in_west_reg <= 0; data_in_north_reg <= 0; - enable_south <= 0; - enable_east <= 0; - 
output_enable_south <= 0; - output_enable_east <= 0; + // end else if ( (enable == 1) && (count_acc < COMMON_ROW_COL) ) begin/// end else begin case(mode) 2'b00: begin//output staionary @@ -79,47 +68,25 @@ always @(posedge clk or posedge rst) begin acc <= acc + data_in_north * data_in_west; count_acc <= count_acc + 1; - // $display("north =%0h , west = %0h, count_acc %d, acc =%0h ", data_in_north, data_in_west, count_acc, acc);/////////////////// - // Forward the inputs - data_out_south <= data_in_north; - data_out_east <= data_in_west; - enable_south <= enable; - enable_east <= enable; - - end else begin - enable_south <= enable;///// - enable_east <= enable;///// - end + // $display("north =%0h , west = %0h, count_acc %d, acc =%0h ", data_in_north, data_in_west, count_acc, acc);/////////////////// + end + // Display all the port values - if ( (enable == 1) && (count_acc >= COMMON_ROW_COL-1)) begin + // Forward the inputs + data_out_south <= data_in_north; + data_out_east <= data_in_west; + enable_south <= enable; + enable_east <= enable; + if(count_acc == COMMON_ROW_COL) begin ///COMMON_ROW_COL-1 + // if(count_acc > COMMON_ROW_COL) begin ///COMMON_ROW_COL-1 //> - if (OUTPUT_COL > OUTPUT_ROW) begin - data_out_south <= (output_enable == 1)? acc: data_in_north; - if (count_acc == (COMMON_ROW_COL+pe_row_postion+1)) begin //-1 reason fo r the XXX - output_enable_south <= output_enable; - end - if((pe_row_postion == 0) && (count_acc == (COMMON_ROW_COL+pe_row_postion+1)))begin - output_enable_east <= output_enable; - end else if (count_acc == (COMMON_ROW_COL+pe_row_postion)) begin - output_enable_east <= output_enable; - end - end else begin - data_out_east <= (output_enable == 1)? 
acc: data_in_west; - if (count_acc == (COMMON_ROW_COL+pe_col_postion+1)) begin //-1 - output_enable_east <= output_enable; - end - if( (pe_col_postion == 0) && (count_acc == (COMMON_ROW_COL+pe_col_postion+1))) begin - output_enable_south <= output_enable; - end else if (count_acc == (COMMON_ROW_COL+pe_col_postion)) begin - output_enable_south <= output_enable; - end - end - if ( output_enable == 1)begin///// - valid <= 1; - end - count_acc <= count_acc + 1; + acc_out <= acc; + valid <= 1; + count_acc <= count_acc + 1; end + // $display("Time: %t, Data In North: %d, Data In West: %d, Data Out South: %d, Data Out East: %d, Accumulator: %d, Acc Out: %d", + // $time, data_in_north, data_in_west, data_out_south, data_out_east, acc, acc_out); end 2'b01:begin //Input-Stationary @@ -134,11 +101,13 @@ always @(posedge clk or posedge rst) begin count_col <= count_col + 1; // $display("north =%0h , west = %0h, count_col %d, data_out_east =%0h ", data_in_north, data_in_west, count_col, data_out_east);/////////////////// + // end // Display all the port values // Forward the inputs data_out_south <= data_in_north; + //data_out_east <= acc[DATA_WIDTH-1:0]; enable_south <= enable; enable_east <= enable; end @@ -166,6 +135,7 @@ always @(posedge clk or posedge rst) begin // Display all the port values // Forward the inputs + //data_out_south <= acc[DATA_WIDTH-1:0]; data_out_east <= data_in_west; enable_south <= enable; enable_east <= enable;