library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use work.real_vector_pkg.all;  -- defines my_real, my_real_vector, my_real_matrix, etc.

entity tile_unit is
  generic (
    N  : integer := 2;  -- Size of sub-vectors processed by each DPTC core
    Nv : integer := 2;  -- Rows in chunk from M1
    Nh : integer := 2;  -- Columns in chunk from M2
    Nm : integer := 4;  -- Common dimension (columns of M1 / rows of M2)
    Nd : integer := 2   -- Number of DPTC cores
  );
  port (
    clk        : in std_logic;
    reset_n    : in std_logic;
    enable     : in std_logic;
    
    m1_chunk   : in my_real_matrix(0 to Nv-1, 0 to Nm-1); -- Nv × Nm sub-matrix of M1
    m2_chunk   : in my_real_matrix(0 to Nh-1, 0 to Nm-1); -- Nh × Nm sub-matrix of M2 (transposed view)

    out_valid  : out std_logic;                           -- Result ready signal
    result_out : out real_matrix(0 to Nv-1, 0 to Nh-1)    -- Output quadrant, Final Nv × Nh tile result
  );
end entity;

architecture Behavioral of tile_unit is

  component dptc
    generic (
      Nv : integer := 2;
      Nh : integer := 2;
      N  : integer := 2
    );
    port (
      clk         : in std_logic;
      reset_n     : in std_logic;
      enable      : in std_logic;
      x_matrix    : in my_real_matrix(0 to Nv-1, 0 to N-1); --takes N columns from M1 
      y_matrix    : in my_real_matrix(0 to Nh-1, 0 to N-1); -- takes N rows from M2.
      out_valid   : out std_logic;
      result_matrix : out real_matrix(0 to Nv-1, 0 to Nh-1) -- Result(i,j)+=sum(M1chunk(i,k)∗M2c​hunk(j,k)) for k in tile
    );
  end component;

  -- Internals
  type matrix_array is array(0 to Nd-1) of real_matrix(0 to Nv-1, 0 to Nh-1);
  signal dptc_outputs : matrix_array;                 -- stores Nd partial results from each DPTC
  signal valid_signals : std_logic_vector(0 to Nd-1); -- stores Nd valid flags from each DPTC

  -- Sliced inputs for each DPTC (Each core gets N columns of M1 and N columns of M2)
  type chunk_matrix_array is array(0 to Nd-1) of my_real_matrix(0 to Nv-1, 0 to N-1);
  signal m1_slices : chunk_matrix_array;
  signal m2_slices : chunk_matrix_array;
  -- Internal signals
  signal sum_result_reg : real_matrix(0 to Nv-1, 0 to Nh-1); -- Final accumulated result
  signal sum_valid_reg  : std_logic := '0';

begin

  -- Slice M1 and M2 into chunks for each DPTC
  slicer_proc: process(m1_chunk, m2_chunk)
  begin
    -- M1 slicing
    for d in 0 to Nd-1 loop
      for i in 0 to Nv-1 loop
        for k in 0 to N-1 loop
          m1_slices(d)(i,k) <= m1_chunk(i, d*N + k); -- Take columns d*N ... d*N+N-1 and assign to m1_slices(d)
        end loop;
      end loop;
        -- M2 slicing
      for j in 0 to Nh-1 loop
        for k in 0 to N-1 loop
          --For each row of M2 (M2 is logically transposed here — columns treated as rows): Take columns d*N ... d*N+N-1
          m2_slices(d)(j,k) <= m2_chunk(j, d*N + k); 
        end loop;
      end loop;
    end loop;
  end process;

  -- Instantiate DPTC cores
  dptc_gen: for d in 0 to Nd-1 generate -- Instantiates Nd DPTC cores
    dptc_inst : dptc
      generic map (
        Nv => Nv,
        Nh => Nh,
        N  => N
      )
      port map (
        clk           => clk,
        reset_n       => reset_n,
        enable        => enable,
        x_matrix      => m1_slices(d),
        y_matrix      => m2_slices(d),
        out_valid     => valid_signals(d),
        result_matrix => dptc_outputs(d)
      );
  end generate;

  -- Sum all DPTC outputs element-wise
 sum_proc: process(clk, reset_n)
    variable acc : real_matrix(0 to Nv-1, 0 to Nh-1);
    variable all_valid : std_logic;
  begin
    if reset_n = '0' then
      sum_result_reg <= (others => (others => ZERO_REAL));
      sum_valid_reg  <= '0';

    elsif rising_edge(clk) then
      -- Wait until all DPTC outputs are valid
      all_valid := '1';
      for d in 0 to Nd-1 loop
        if valid_signals(d) /= '1' then
          all_valid := '0';
        end if;
      end loop;

      if all_valid = '1' then
        -- Compute the sum
        for i in 0 to Nv-1 loop
          for j in 0 to Nh-1 loop
            acc(i,j) := ZERO_REAL;
          end loop;
        end loop;

        for d in 0 to Nd-1 loop
          for i in 0 to Nv-1 loop
            for j in 0 to Nh-1 loop
              acc(i,j) := acc(i,j) + dptc_outputs(d)(i,j);
            end loop;
          end loop;
        end loop;

        -- Store result to register (output happens next cycle)
        sum_result_reg <= acc;
        sum_valid_reg  <= '1';

      else
        sum_valid_reg  <= '0';
      end if;
    end if;
  end process;

  -- Outputs
  result_out <= sum_result_reg;
  out_valid  <= sum_valid_reg;


end architecture;