diff --git a/hw/beh/hu.vhd b/hw/beh/hu.vhd
new file mode 100644
index 0000000..8c91373
--- /dev/null
+++ b/hw/beh/hu.vhd
@@ -0,0 +1,57 @@
+-- hu
+-- Update H using stream of weights
+
+use work.pkg_sbs.all;
+
+entity hu is
+
+  port (
+    clk, rstn : in  bit;
+    cfg_hu    : in  bit_vector(BW_HU_CFG -1 downto 0);  -- Config
+    ena_w     : in  bit;                -- New weight
+    is_ini    : in  bit;                -- First vector (get w and h when ena)
+    is_fst    : in  bit;                -- First component in vector
+    ena_ho    : out bit;                -- Signal a valid ho value
+    wi        : in  real;               -- stream of weights
+    hi        : in  real;               -- stream of state
+    ho        : out real);              -- stream of states
+
+end entity hu;
+
+
+
+architecture rtlf of hu is
+
+  signal ctr_hu : bit_vector(BW_HU_CTR-1 downto 0);
+  signal loc_h  : bit_vector(ADDR_H_MAX-1 downto 0);
+  signal eps    : real;
+
+begin  -- architecture rtlf
+
+  i_hu_dp: entity work.hu_dp
+    port map (
+      clk    => clk,
+      rstn   => rstn,
+      eps    => eps,
+      ctr_hu => ctr_hu,
+      loc_h  => loc_h,
+      wi     => wi,
+      hi     => hi,
+      ho     => ho);
+
+
+  i_hu_ctr: entity work.hu_ctr
+    port map (
+      clk    => clk,
+      rstn   => rstn,
+      eps    => eps,
+      cfg_hu => cfg_hu,
+      loc_h  => loc_h,
+      ena_w  => ena_w,
+      is_ini => is_ini,
+      is_fst => is_fst,
+      ena_ho => ena_ho,
+      ctr_hu => ctr_hu);
+
+
+end architecture rtlf;
diff --git a/hw/beh/hu_ctr.vhd b/hw/beh/hu_ctr.vhd
new file mode 100644
index 0000000..9fce6d2
--- /dev/null
+++ b/hw/beh/hu_ctr.vhd
@@ -0,0 +1,72 @@
+-- hu_ctr
+-- Control path for Update H using stream of weights
+-- Behavioural model: ctr_update_sum is derived from is_fst with a transport delay of 7*T
+use work.pkg_sbs.all;
+
+entity hu_ctr is
+  port (
+    clk, rstn : in  bit;
+    cfg_hu    : in  bit_vector(BW_HU_CFG -1 downto 0);  -- Config (not read by architecture beh)
+    ena_w     : in  bit;                -- New weight
+    is_ini    : in  bit;                -- First vector (get w and h when ena)
+    is_fst    : in  bit;                -- First component in vector
+    loc_h     : in  bit_vector(ADDR_H_MAX-1 downto 0);  -- Current location in H (not read by architecture beh)
+    ena_ho    : out bit;                -- Signal a valid ho value
+    eps       : out real;               -- Constant 0.2 in architecture beh
+    ctr_hu    : out bit_vector(BW_HU_CTR-1 downto 0));  -- Control for data path
+
+end entity hu_ctr;
+
+library ieee;
+use ieee.numeric_bit.all;
+
+architecture beh of hu_ctr is
+
+  signal ctr_sel_ini, ctr_sum_ini, ctr_update_sum, ctr_update_sum2 : bit;
+  signal ctr_addr_rst, ctr_addr_inc, ctr_write_hw                  : bit;
+  signal ctr_wr_hw, ctr_wr_hp                                      : bit;
+
+  -- Last location in H (N_H_MAX-1 fits in ADDR_H_MAX bits); only used by the commented-out compare below
+  constant MAX_LOC_H : bit_vector(ADDR_H_MAX-1 downto 0) := bit_vector(to_unsigned(N_H_MAX-1, ADDR_H_MAX));
+
+  constant T : time := 10 ns;           -- Clock period (same value as T in tst_hu); scales the 7*T delay below
+
+begin  -- architecture beh
+
+  eps <= 0.2;
+
+  ctr_hu(0) <= ctr_sel_ini ;
+  ctr_hu(1) <= ctr_sum_ini ;
+  ctr_hu(2) <= ctr_update_sum ;
+  ctr_hu(3) <= ctr_addr_rst ;
+  ctr_hu(4) <= ctr_addr_inc ;
+  ctr_hu(5) <= ctr_write_hw ;           -- ctr_write_hw is never driven here, so this bit keeps its default '0'
+  ctr_hu(6) <= ctr_wr_hw ;
+  --ctr_hu(7) <= ctr_wr_hp ; -- ctr_wr_hp and ctr_wr_hw are the same
+  ctr_hu(7) <= ctr_update_sum2 ;
+
+  -- Code in first approximation
+  ctr_sel_ini    <= is_ini;
+  ctr_wr_hp      <= ena_w; --is_ini;
+  ctr_wr_hw      <= ena_w;
+  ctr_sum_ini    <= is_fst;
+  ctr_update_sum <= transport is_fst after 7*T ;
+  --ctr_update_sum2 <= transport ctr_update_sum after T;
+  --ctr_update_sum <= '1' when (loc_h = MAX_LOC_H) else '0';
+
+  ctr_addr_rst <= ctr_update_sum;
+  ctr_addr_inc <= ena_w and not ctr_addr_rst;
+
+  ena_ho <= ena_w and not is_ini;
+  -- ctr_update_sum2 is ctr_update_sum delayed by one rising clock edge
+  rg: process (clk, rstn) is
+  begin
+    if rstn = '0' then                  -- asynchronous reset (active low)
+      ctr_update_sum2 <= '0';
+    elsif clk'event and clk = '1' then  -- rising clock edge
+      ctr_update_sum2 <= ctr_update_sum;
+    end if;
+  end process rg;
+
+
+end architecture beh;
diff --git a/hw/beh/hu_dp.vhd b/hw/beh/hu_dp.vhd
new file mode 100644
index 0000000..2606051
--- /dev/null
+++ b/hw/beh/hu_dp.vhd
@@ -0,0 +1,124 @@
+-- hu_dp
+-- Data path for Update H using stream of weights
+
+use work.pkg_sbs.all;
+
+entity hu_dp is
+  port (
+    clk, rstn : in  bit;
+    ctr_hu    : in  bit_vector(BW_HU_CTR-1 downto 0);   -- Control for data path
+    loc_h     : out bit_vector(ADDR_H_MAX-1 downto 0);  -- Current location in H
+    eps       : in  real;
+    wi        : in  real;               -- stream of weights
+    hi        : in  real;               -- stream of state
+    ho        : out real);              -- 
stream of states + +end entity hu_dp; + +library ieee; +use ieee.numeric_bit.all; + +architecture rtlf of hu_dp is + -- Memory + signal mem_hp : array_as_h; -- State (internal) + signal mem_hw : array_as_h; -- Copy of w*h + signal addr_wr, addr_nxt : bit_vector(ADDR_H_MAX-1 downto 0); -- Address + + -- Data path for hp (i.t. h un-normalized) and hw (hp*w) + signal hp_new, hp_new_rg, hp_p, h_eff : real := 0.0; + signal hw_p, hw_nxt : real := 0.0; + + -- Accumulators for normalization + signal sum_hw, sum_hw_nxt : real := 0.0; -- Running sum hw + signal sum_hw_p, sum_hw_p_nxt : real := 0.0; -- Saved sum hw of previous + signal sum_hp, sum_hp_nxt : real := 0.0; -- Running sum hp + signal sum_hp_p, sum_hp_p_nxt : real := 0.0; -- Saved sum hw of previous (normalization) + + -- Control signals + signal ctr_sel_ini, ctr_sum_ini, ctr_update_sum, ctr_update_sum2 : bit; + signal ctr_addr_rst, ctr_addr_inc, ctr_write_hw : bit; + signal ctr_wr_hw, ctr_wr_hp : bit; + +begin -- architecture rtlf + + -- Get control signals + ctr_sel_ini <= ctr_hu(0); + ctr_sum_ini <= ctr_hu(1); + ctr_update_sum <= ctr_hu(2); + ctr_addr_rst <= ctr_hu(3); + ctr_addr_inc <= ctr_hu(4); + ctr_write_hw <= ctr_hu(5); + ctr_wr_hw <= ctr_hu(6); + ctr_wr_hp <= ctr_hu(6); + ctr_update_sum2 <= ctr_hu(7); + + + -- Main calculation + hp_new <= hp_p * sum_hw_p + sum_hp_p * hw_p ; + + -- Mux to select first h or saved one + h_eff <= hi when ctr_sel_ini='1' else hp_new_rg; + + -- Calculate hw + hw_nxt <= h_eff * wi ; + + -- Output h (note latency of a complete group) + ho <= h_eff; + + -- Accumulate hw and hp + sum_hw_nxt <= hw_nxt when ctr_sum_ini='1' else sum_hw + hw_nxt; + sum_hw_p_nxt <= 0.0 when ctr_update_sum='1' else + sum_hw when ctr_update_sum2='1' else sum_hw_p; + + sum_hp_nxt <= h_eff when ctr_sum_ini='1' else sum_hp + h_eff; -- Accumulate h + --sum_hp_p_nxt <= sum_hp_nxt * eps when ctr_update_sum='1' else sum_hp_p; + sum_hp_p_nxt <= eps when ctr_update_sum='1' else + hp_new when ctr_update_sum2='1' 
else sum_hp_p; + + + -- Read from memory + --hw_p <= mem_hw(to_integer(unsigned(addr_nxt))); + hw_p <= mem_hw(to_integer(unsigned(addr_nxt))) + when ctr_update_sum2='0' else sum_hp; -- Put sum_hp in mult + hp_p <= mem_hp(to_integer(unsigned(addr_nxt))); + + -- Address calculation + addr_nxt <= (others => '0') when ctr_addr_rst='1' else + bit_vector(unsigned(addr_wr) + 1) when ctr_addr_inc='1' else addr_wr; + loc_h <= addr_wr; -- Output for ctrl path + + -- Registers + rg: process (clk, rstn) is + begin -- process pipe1 + if rstn = '0' then + hp_new_rg <= 0.0; + sum_hw <= 0.0; + sum_hw_p <= 0.0; + sum_hp <= 0.0; + sum_hp_p <= 0.0; + addr_wr <= (others => '0'); + elsif clk'event and clk = '1' then + hp_new_rg <= hp_new; + sum_hw <= sum_hw_nxt; + sum_hw_p <= sum_hw_p_nxt; + sum_hp <= sum_hp_nxt; + sum_hp_p <= sum_hp_p_nxt; + addr_wr <= addr_nxt; + end if; + end process rg; + + + -- Memory + mem: process (clk) is + begin -- process mem + if clk'event and clk = '1' then -- rising clock edge + if ctr_wr_hw='1' then + mem_hw(to_integer(unsigned(addr_wr))) <= hw_nxt; + end if; + if ctr_wr_hp='1' then + mem_hp(to_integer(unsigned(addr_wr))) <= h_eff; + end if; + end if; + end process mem; + +end architecture rtlf; diff --git a/hw/beh/mem_sync.vhd b/hw/beh/mem_sync.vhd new file mode 100644 index 0000000..a7c022b --- /dev/null +++ b/hw/beh/mem_sync.vhd @@ -0,0 +1,49 @@ +-- Implementation of a synchronous single port memory + +library ieee; +use ieee.numeric_bit.all; + + +entity mem_sync is + generic( + BA : natural := 7); -- log2 addresses + port( + clk : in bit; + wr, rd : in bit; + addr : in bit_vector(BA-1 downto 0); + dti : in real; + dto : out real); +end entity mem_sync; + + +library ieee; +use ieee.numeric_bit.all; + + +architecture beh of mem_sync is + + signal addr_rg : unsigned(BA-1 downto 0); + +begin -- architecture beh + + mem: process (clk) is + constant mem_size : natural := 2**(addr'length); + type mem_ty is array (0 to mem_size-1) of real; + variable w_mem : 
mem_ty; + begin -- process mem + if clk'event and clk = '1' then -- rising clock edge + addr_rg <= unsigned(addr); + if wr='1' then + w_mem(to_integer(addr_rg)) := dti; + end if; + if rd='1' then + dto <= w_mem(to_integer(addr_rg)); + end if; + end if; + end process mem; + +end architecture beh; + + -- Local Variables: + -- compile-command: "ghdl -a --std=00 --workdir=../do_sim/ mem_sync.vhd" + -- End: diff --git a/hw/beh/pkg_sbs.vhd b/hw/beh/pkg_sbs.vhd new file mode 100644 index 0000000..ffc839f --- /dev/null +++ b/hw/beh/pkg_sbs.vhd @@ -0,0 +1,15 @@ +package pkg_sbs is + + constant BW_HU_CTR : natural := 10; -- bits for control + constant BW_HU_CFG : natural := 15; -- bits for configuration + + constant N_H_MAX : natural := 8; -- Max size of H + constant ADDR_H_MAX : natural := 3; -- log2 of N_H_MAX; it is size of addr + -- bus + + --subtype hu_ctr is bit_vector 4 downto 0; + + -- Array of reals with max size of H + type array_as_h is array (N_H_MAX-1 downto 0) of real; + +end package pkg_sbs; diff --git a/hw/beh/pkg_ufp.vhd b/hw/beh/pkg_ufp.vhd new file mode 100644 index 0000000..3d7f644 --- /dev/null +++ b/hw/beh/pkg_ufp.vhd @@ -0,0 +1,99 @@ +-- Library of functions to work with unsigned FP numbers +library ieee; +use ieee.std_logic_1164.all; + +package pkg_ufp is + -- Format + -- [ ee ][ mm ] + -- with total BW bits, and BE exponent bits and offset of exponent EO + -- b(ee) in EO-[0..2**BE-1] + -- b(mm) in [0..2**(BW-BE)-1]/2**(BW-BE) + + -- Convert a number if unsigned floating point to real + function ufp_to_real ( + ee_mm : std_logic_vector; -- Data in format exponent_mantissa as bits + BW : natural; -- Bit width + BE : natural; -- Number of bits used for exponent + EO : natural) -- Offset of exponent + return real; + + -- Convert a real number to unsigned floating point + function real_to_ufp ( + r : real; -- Real number to convert + BW : natural; -- Bit width + BE : natural; -- Number of bits used for exponent + EO : natural) -- Offset of exponent + 
return std_logic_vector; + + + +end package pkg_ufp; + +library ieee; +use ieee.numeric_std.all; +use ieee.math_real.all; + +package body pkg_ufp is + + function ufp_to_real ( + ee_mm : std_logic_vector; -- Data in format exponent_mantissa as bits + BW : natural; -- Bit width + BE : natural; -- Number of bits used for exponent + EO : natural) -- Offset of exponent + return real + is + variable mm : unsigned(BW-BE-1 downto 0); + variable ee : unsigned(BE-1 downto 0); + variable d : real; + begin + ee := unsigned(ee_mm(BW-1 downto BW-BE)); + mm := unsigned(ee_mm(BW-BE-1 downto 0)); + d := real(to_integer(mm)) * 2.0**real(EO-to_integer(ee)-(BW-BE)); + return d; + end function ufp_to_real; + + + function real_to_ufp ( + r : real; -- Real number to convert + BW : natural; -- Bit width + BE : natural; -- Number of bits used for exponent + EO : natural) -- Offset of exponent + return std_logic_vector + is + variable BO, MAX_EXP, BM : integer; + variable R_MIN, BM_LIM : real; + variable mm : unsigned(BW-BE-1 downto 0); + variable ee : unsigned(BE-1 downto 0); + variable aa : integer; -- Scaling to normalize r into ufp representation + variable mm_id : integer; + begin + BO := BW-BE-EO; -- Exponent of Scaling factor + BM := BW - BE; -- Bits for mantissa + BM_lim := log2(2.0**BM-1.0); -- A bit less than BM + MAX_EXP := 2**BE-1; -- Max exponent + R_MIN := 2.0**(-MAX_EXP-BO); -- Min value (not equal zero) + if r '1'); + mm := (others => '0'); + else + --aa := BM - integer(ceil(log2(r))); + aa := integer(floor(BM_LIM - log2(r))); + if aa < BO then + aa := BO; + end if; + if aa > MAX_EXP+BO then + aa := MAX_EXP+BO; + end if; + --report "[TST] aa=" & integer'image(aa) severity note; + --report "[TST] rr=" & real'image( r * 2.0**aa ) severity note; + + mm_id := integer(round(r * 2.0**aa)); + --mm_id := integer(floor(r * 2.0**aa)); -- ???? 
+ ee := to_unsigned(aa-BO, ee'length); + mm := to_unsigned(mm_id, mm'length); + end if; + return std_logic_vector(ee) & std_logic_vector(mm); + end function real_to_ufp; + + +end package body pkg_ufp; diff --git a/hw/beh/tst_hu.vhd b/hw/beh/tst_hu.vhd new file mode 100644 index 0000000..df14dcc --- /dev/null +++ b/hw/beh/tst_hu.vhd @@ -0,0 +1,126 @@ +-- tst_hu +-- Testbench for Update H using stream of weights + +use work.pkg_sbs.all; + +entity tst_hu is + +end entity tst_hu; + + +architecture tst of tst_hu is + + constant T : time := 10 ns; -- Period + + signal clk, rstn : bit := '0'; + signal cfg_hu : bit_vector(BW_HU_CFG -1 downto 0); + signal ena_w : bit; + signal is_ini : bit; + signal is_fst : bit; + signal ena_ho : bit; + signal wi : real := 0.0; + signal hi : real := 0.0; + signal ho : real; + +begin -- architecture tst + + clk <= not clk after T/2; + rstn <= '0', '1' after T/2+T/4; + + i_hu: entity work.hu + port map ( + clk => clk, + rstn => rstn, + cfg_hu => cfg_hu, + ena_w => ena_w, + is_ini => is_ini, + is_fst => is_fst, + ena_ho => ena_ho, + wi => wi, + hi => hi, + ho => ho); + + process (clk) is + type array_sol is array (natural range <>) of real; + + -- Example of solution from python + constant h_sol : array_sol := ( + --0.1 , 0.2 , 0.3 , 0.0 , 0.01, 0.01, 0.1 , 0.28, + 0.01628 , 0.02056 , 0.03144 , 0.0 , 0.001228, 0.001588, 0.01228 , 0.039984, + 1.28343706e-04, 1.62085171e-04, 3.17669760e-04, 0.00000000e+00, 1.24077120e-05, 1.99630656e-05, 1.84671552e-04, 3.05349811e-04, + 2.17637063e-08, 2.74853687e-08, 5.38684101e-08, 0.00000000e+00, 2.10402060e-09, 3.38520923e-09, 3.13154230e-08, 5.17792718e-08 + ); + + variable idx : natural; + + begin -- process + if clk'event and clk = '1' then -- rising clock edge + if ena_ho='1' then + if idx 1.0e-09 then + report LF & ESC & "[31;1m [ERROR] h_sol= " & real'image(h_sol(idx)) & ESC & "[0m" & LF severity error; + end if; + idx := idx+1; + end if; + report LF & "[INFO] h_exp= " & real'image(ho) & LF severity 
note;
+      end if;
+    end if;
+  end process;
+
+
+  process is
+    constant h : array_as_h := (0.1, 0.2, 0.3, 0.0, 0.01, 0.01, 0.1, 0.28);
+  begin  -- process
+    hi     <= 0.0;
+    is_ini <= '0';
+    wait for T/4 + T/2 + T;
+
+    for n in 0 to 1 loop
+      for ki in h'range loop
+        hi     <= h(ki);
+        is_ini <= '1';
+        wait for T;
+      end loop;  -- ki
+      is_ini <= '0';
+      hi     <= 0.0;
+      --wait for T*(h'length+1)*3; -- note +1 for void cycle
+      wait for T*(2+(h'length+2)*3);  -- note +2 for void cycle
+    end loop;  -- n
+    wait;
+  end process;
+
+
+  process is
+    constant w0 : array_as_h := (0.3, 0.0, 0.01, 0.01, 0.1, 0.28, 0.1, 0.2);
+    constant w1 : array_as_h := (0.01, 0.01, 0.1, 0.28, 0.1, 0.2, 0.3, 0.0);
+    constant w2 : array_as_h := (0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125);
+
+    type array_w is array (0 to 2) of array_as_h;
+    constant w : array_w := (w0, w1, w2);
+
+  begin  -- process
+    ena_w <= '0';
+    wi    <= 0.0;
+    wait for T/4 + T/2 + T;
+
+    for n in 0 to 1 loop
+      for kj in w'range loop
+        is_fst <= '1', '0' after T;
+        for ki in w0'range loop
+          ena_w <= '1';
+          wi    <= w(kj)(ki);
+          wait for T;
+        end loop;  -- ki
+        ena_w <= '0';  -- void cycle
+        wait for 2*T;  --
+      end loop;  -- kj
+    end loop;  -- n
+
+    ena_w <= '0';
+    wait for 4*T;
+    report LF & LF & ESC & "[35;1m [TST] End simulation" & ESC & "[0m" & LF severity failure;
+
+  end process;
+
+
+end architecture tst;
diff --git a/hw/beh/tst_pkg_ufp.vhd b/hw/beh/tst_pkg_ufp.vhd
new file mode 100644
index 0000000..b0317cf
--- /dev/null
+++ b/hw/beh/tst_pkg_ufp.vhd
@@ -0,0 +1,136 @@
+use work.pkg_ufp.all;
+
+entity tst_pkg_ufp is
+
+end entity tst_pkg_ufp;
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+architecture tst of tst_pkg_ufp is
+
+
+
+begin  -- architecture tst
+
+  process
+    -- Define params of ufp
+    constant BW : natural := 10;
+    constant BE : natural := 3;
+    constant EO : natural := 1;
+
+    constant BO : natural := BW - BE -EO;
+
+    variable r, r2 : real;
+    variable ee_mm : std_logic_vector(BW-1 downto 0);
+    variable mm    : std_logic_vector(BW-1-BE downto 0);
+    variable ee    : std_logic_vector(BE-1 downto 0);
+
+    constant r_ini : real    := 0.0125;
+    constant r_inc : real    := 0.0125/4.0;
+    constant N     : natural := 10;
+    -- Report a real value next to an exponent/mantissa pair (as integers)
+    procedure print (
+      ar  : in real;
+      aee : in std_logic_vector(BE-1 downto 0);
+      amm : in std_logic_vector(BW-1-BE downto 0)) is
+    begin
+      report LF & "[TST] " &
+        "r=" & real'image(ar) & HT & HT &
+        "ufp= " & integer'image(to_integer(unsigned(amm))) &
+        " *2**(-" & integer'image(BO) & "- " & integer'image(to_integer(unsigned(aee))) & " )"
+        severity note;
+    end procedure;
+
+    -- Convert (aee, amm) to real, then back to ufp, reporting the result of each step
+    procedure tst_conv_r (
+      aee : in std_logic_vector(BE-1 downto 0);
+      amm : in std_logic_vector(BW-1-BE downto 0)) is
+      variable ar     : real;
+      variable aee_mm : std_logic_vector(BW-1 downto 0);
+    begin  -- procedure tst_conv_r
+      aee_mm := aee & amm;
+      ar     := ufp_to_real(aee_mm, BW, BE, EO);
+      print(ar, aee, amm);
+      -- Round trip: print the re-encoded exponent/mantissa, not the inputs again
+      aee_mm := real_to_ufp(ar, BW, BE, EO);
+      print(ar, aee_mm(BW-1 downto BW-BE), aee_mm(BW-BE-1 downto 0));
+
+      report LF & "[TST] ----------------------------------" severity note;
+
+    end procedure tst_conv_r;
+
+    constant ee_0min : std_logic_vector(BE-1 downto 0) := (others => '0');
+    constant ee_1min : std_logic_vector(BE-1 downto 0) := (0 => '1', others => '0');
+    constant ee_2min : std_logic_vector(BE-1 downto 0) := (1 => '1', others => '0');
+    constant ee_0max : std_logic_vector(BE-1 downto 0) := (others => '1');
+    constant ee_1max : std_logic_vector(BE-1 downto 0) := (0 => '0', others => '1');
+    constant ee_2max : std_logic_vector(BE-1 downto 0) := (1 => '0', others => '1');
+
+    constant mm_0min : std_logic_vector(BW-1-BE downto 0) := (others => '0');
+    constant mm_1min : std_logic_vector(BW-1-BE downto 0) := (0 => '1', others => '0');
+    constant mm_2min : std_logic_vector(BW-1-BE downto 0) := (1 => '1', others => '0');
+    constant mm_0max : std_logic_vector(BW-1-BE downto 0) := (others => '1');
+    constant mm_1max : std_logic_vector(BW-1-BE downto 0) := (0 => '0', others => '1');
+    constant mm_2max : 
std_logic_vector(BW-1-BE downto 0) := (1 => '0', others => '1');
+
+  begin
+    if true then
+      report LF & "[TST] Test corner examples =============================" severity note;
+
+      -- Conversion from ee_mm to real
+      tst_conv_r(ee_0max, mm_0min);
+      tst_conv_r(ee_0max, mm_1min);
+      tst_conv_r(ee_0max, mm_2min);
+      tst_conv_r(ee_0max, mm_2max);
+      tst_conv_r(ee_0max, mm_1max);
+      tst_conv_r(ee_0max, mm_0max);
+
+      tst_conv_r(ee_1max, mm_0min);
+      tst_conv_r(ee_1max, mm_1min);
+      tst_conv_r(ee_1max, mm_2min);
+      tst_conv_r(ee_1max, mm_2max);
+      tst_conv_r(ee_1max, mm_1max);
+      tst_conv_r(ee_1max, mm_0max);
+
+      tst_conv_r(ee_1min, mm_0min);
+      tst_conv_r(ee_1min, mm_1min);
+      tst_conv_r(ee_1min, mm_2min);
+      tst_conv_r(ee_1min, mm_2max);
+      tst_conv_r(ee_1min, mm_1max);
+      tst_conv_r(ee_1min, mm_0max);
+
+      tst_conv_r(ee_0min, mm_0min);
+      tst_conv_r(ee_0min, mm_1min);
+      tst_conv_r(ee_0min, mm_2min);
+      tst_conv_r(ee_0min, mm_2max);
+      tst_conv_r(ee_0min, mm_1max);
+      tst_conv_r(ee_0min, mm_0max);
+
+    end if;
+
+    if true then
+      report LF & "[TST] Test ramp =============================" severity note;
+      -- Conversion from real to ee_mm
+      r:= r_ini;
+      for ki in 0 to N-1 loop
+        ee_mm := real_to_ufp(r, BW, BE, EO);
+
+        ee := ee_mm(BW-1 downto BW-BE);
+        mm := ee_mm(BW-BE-1 downto 0);
+
+        print(r, ee, mm);
+        r2 := ufp_to_real(ee_mm, BW, BE, EO);
+        print(r2, ee, mm);
+        report LF & "[TST] ----------------------------------" severity note;
+
+        r := r + r_inc;
+      end loop;  -- ki
+    end if;
+
+    wait;
+  end process;
+
+
+end architecture tst;
diff --git a/hw/beh/wg_mem.vhd b/hw/beh/wg_mem.vhd
new file mode 100644
index 0000000..f7aa6c3
--- /dev/null
+++ b/hw/beh/wg_mem.vhd
@@ -0,0 +1,126 @@
+-- wg_mem
+--
+-- Generate weights using stream of idx: weights are read from an internal
+-- memory addressed by pos & idx & i and sent on w while ena_w is high.
+-- Inputs are spike index and location of kernel to read; the memory is
+-- written with w_init while do_init_str / do_init_nxt is asserted.
+-- Current implementation assumes that all weights are cached
+-- and that the sizes of H, KI and KO are powers of 2
+
+use work.pkg_sbs.all;
+
+entity wg_mem is
+  generic (
+    LOG2_H  : natural := 2;             -- size of H (number of output IPs per
+                                        -- output location)
+    LOG2_KI : natural := 4;             -- number IPs per input (thus spike index)
+    LOG2_KO : natural := 3);            -- number connections from IPi
+                                        -- block to IPo block (thus,
+                                        -- number of output IPs of full connected,
+                                        -- kernel size in conv)
+  port (
+    clk, rstn : in bit;
+
+    -- Initial update
+    do_init_str : in bit;               -- First step in init process
+    do_init_nxt : in bit;               -- Next step in init process
+    w_init      : in real;              -- Weight value to update
+
+    -- Normal
+    idx      : in  bit_vector(LOG2_KI-1 downto 0);  -- Index of spike
+    pos      : in  bit_vector(LOG2_KO-1 downto 0);  -- Location of output (edge, kernel)
+    ena_idx  : in  bit;
+    busy_idx : out bit;
+    ena_w    : out bit;                 -- Send a weight
+    w        : out real);               -- stream of weights
+
+end entity wg_mem;
+
+library ieee;
+use ieee.numeric_bit.all;
+
+architecture rtl of wg_mem is
+
+  signal busy_rg, busy_nxt, i_done, idx_done, pos_done : bit;
+  signal idx_rg, idx_nxt : unsigned(LOG2_KI-1 downto 0);
+  signal pos_rg, pos_nxt : unsigned(LOG2_KO-1 downto 0);
+  signal i_rg, i_nxt     : unsigned(LOG2_H-1 downto 0);
+
+  -- All these params could be configurable..
+  constant I_LAST   : unsigned(LOG2_H-1 downto 0)  := (others=>'1');
+  constant IDX_LAST : unsigned(LOG2_KI-1 downto 0) := (others=>'1');
+  constant POS_LAST : unsigned(LOG2_KO-1 downto 0) := (others=>'1');
+
+  constant I_ZERO   : unsigned(LOG2_H-1 downto 0)  := (others=>'0');
+  constant IDX_ZERO : unsigned(LOG2_KI-1 downto 0) := (others=>'0');
+  constant POS_ZERO : unsigned(LOG2_KO-1 downto 0) := (others=>'0');
+
+  -- Memory
+  signal mem_addr, mem_addr_nxt : unsigned(LOG2_H+LOG2_KI+LOG2_KO-1 downto 0);
+  signal mem_wr, mem_rd         : bit;
+
+begin  -- architecture rtl
+
+  busy_idx <= busy_rg;
+  ena_w    <= busy_rg;
+
+  i_done   <= '1' when i_rg = I_ZERO else '0';
+  idx_done <= '1' when idx_rg = IDX_ZERO else '0';
+  pos_done <= '1' when pos_rg = POS_ZERO else '0';  -- not read in this architecture
+
+  busy_nxt <= '1' when ena_idx='1' else
+              '0' when i_done='1' else
+              busy_rg;
+  -- Down-counters: each *_nxt steps from its register *_rg, never from itself (no zero-delay loop)
+  i_nxt <= I_LAST when (ena_idx='1') or (do_init_str='1') else
+           i_rg-1 when (busy_rg='1') or (do_init_nxt='1') else
+           i_rg;
+
+  idx_nxt <= unsigned(idx) when ena_idx='1' else
+             IDX_LAST      when do_init_str='1' else
+             idx_rg-1      when (do_init_nxt='1') and (i_done='1') else
+             idx_rg;
+  -- NOTE(review): pos steps on every init cycle while idx_rg = 0 (i_done is not checked) - confirm intended
+  pos_nxt <= unsigned(pos) when ena_idx='1' else
+             POS_LAST      when do_init_str='1' else
+             pos_rg-1      when (do_init_nxt='1') and (idx_done='1') else
+             pos_rg;
+
+  reg: process (clk, rstn) is
+  begin  -- process reg
+    if rstn = '0' then                  -- asynchronous reset (active low)
+      idx_rg  <= IDX_LAST;
+      pos_rg  <= POS_LAST;
+      i_rg    <= I_LAST;
+      busy_rg <= '0';
+    elsif clk'event and clk = '1' then  -- rising clock edge
+      idx_rg  <= idx_nxt;
+      pos_rg  <= pos_nxt;
+      i_rg    <= i_nxt;
+      busy_rg <= busy_nxt;
+    end if;
+  end process reg;
+
+
+  mem_addr_nxt <= pos_nxt & idx_nxt & i_nxt;
+  mem_wr       <= do_init_str or do_init_nxt;
+  mem_rd       <= '1';
+
+  -- Implementation of a synchronous single port memory
+  mem: process (clk) is
+    constant mem_size : natural := 2**(mem_addr'length);
+    type mem_ty is array (0 to mem_size-1) of real;
+    variable w_mem : mem_ty;
+  begin  -- process mem
+    if clk'event and clk = '1' then     -- rising clock edge
+      mem_addr <= mem_addr_nxt;         -- signal update: the accesses below still see the previous mem_addr
+      if mem_wr='1' then
+        w_mem(to_integer(mem_addr)) := w_init;
+      end if;
+      if mem_rd='1' then
+        w <= w_mem(to_integer(mem_addr));  -- w_mem is a variable, so a write on this edge is read back
+      end if;
+    end if;
+  end process mem;
+
+end architecture rtl;
diff --git a/hw/do_sim/ex.py b/hw/do_sim/ex.py
new file mode 100644
index 0000000..afcd35f
--- /dev/null
+++ b/hw/do_sim/ex.py
@@ -0,0 +1,51 @@
+import numpy as np
+
+h = np.array([0.1, 0.2, 0.3, 0.0, 0.01, 0.01, 0.1, 0.28])
+
+w0 = [0.3, 0.0, 0.01, 0.01, 0.1, 0.28, 0.1, 0.2]
+w1 = [0.01, 0.01, 0.1, 0.28, 0.1, 0.2, 0.3, 0.0]
+w2 = [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]
+
+w = np.array([w0, w1, w2])
+
+eps = 0.2
+
+h0 = h
+hw0 = h0 * w[0]
+sum_hw0 = np.sum(hw0)
+
+h1 = (h0 + eps*hw0/sum_hw0) /(1+eps)
+hw1 = h1 * w[1]
+sum_hw1 = np.sum(hw1)
+
+h2 = (h1 + eps*hw1/sum_hw1) /(1+eps)
+hw2 = h2 * w[2]
+sum_hw2 = np.sum(hw2)
+
+h3 = (h2 + eps*hw2/sum_hw2) /(1+eps)
+
+hp0 = h0
+hpw0 = hp0 * w[0]
+sum_hpw0 = np.sum(hpw0)
+norm_hp0 = np.sum(hp0)
+
+hp1 = (sum_hpw0 * hp0 + eps * norm_hp0 * hpw0)
+hpw1 = hp1 * w[1]
+sum_hpw1 = np.sum(hpw1)
+norm_hp1 = np.sum(hp1)
+
+hp2 = (sum_hpw1*hp1 + eps * norm_hp1 * hpw1)
+hpw2 = hp2 * w[2]
+sum_hpw2 = np.sum(hpw2)
+norm_hp2 = np.sum(hp2)
+
+hp3 = (sum_hpw2*hp2 + eps * norm_hp2 * hpw2)
+
+
+# Show that hp are just multiple of h
+print(hp1/h1)
+print(hp2/h2)
+print(hp3/h3)
+
+# This should be the output of HW model
+print(hp0, hp1, hp2, hp3)
diff --git a/hw/do_sim/ex_ufp.py b/hw/do_sim/ex_ufp.py
new file mode 100644
index 0000000..a4b23a9
--- /dev/null
+++ b/hw/do_sim/ex_ufp.py
@@ -0,0 +1,55 @@
+from math import log2, ceil, floor
+
+BW = 5
+BE = 3
+EO = 1
+
+BM = BW-BE
+BO = BM-EO # BW-BE-EO
+EXP_MAX = 2**BE-1
+R_MIN = ufp_to_r(EXP_MAX, 1, BW, BE, EO)
+
+
+def r_to_ufp(r, BW, BE, EO):
+    BM = BW-BE
+    BO = BM-EO # BW-BE-EO
+    EXP_MAX = 2**BE-1
+    R_MIN = ufp_to_r(EXP_MAX, 1, BW, BE, EO)
+
+    if r32,ports=>8.
Also possible to use a file + analyze -library work -autoread -recursive ../rtl -top $UNIT_NAME + elaborate -library work $UNIT_NAME + link + check_design + + # Set constraints + flow_def_rst $rst + flow_def_clock $T $clk + flow_def_timing [expr $T/8] [expr $T/8] + check_timing + + # Synthesize + compile_ultra ;# Run the synthesize + #change_names -rules vhdl -hier -verbose -log_changes ./log/change_names.log + + # Write reports + #set prefix ${UNIT_NAME} + set prefix ${UNIT_NAME}_T=${T}_TECH=${TECH} ;# Define prefix to identify reports. + flow_report_all $prefix ;# Write reports + flow_write_netlist $prefix ;# Write results +} + diff --git a/hw/do_synth/source.csh b/hw/do_synth/source.csh new file mode 100644 index 0000000..ebedf4a --- /dev/null +++ b/hw/do_synth/source.csh @@ -0,0 +1,5 @@ +#source /eda/synopsys/synopsys_lic_init_2015-2016.csh +#source /eda/synopsys/2015-16/scripts/SYN_2015.06-SP4_RHELx86.csh + +setenv SNPSLMD_LICENSE_FILE "28231@item0096" +setenv PATH "/usrf01/prog/synopsys/syn/R-2020.09-SP4/bin:${PATH}" diff --git a/hw/rtl/hu_dp.vhd b/hw/rtl/hu_dp.vhd new file mode 100644 index 0000000..d99dac6 --- /dev/null +++ b/hw/rtl/hu_dp.vhd @@ -0,0 +1,135 @@ +-- hu_dp +-- Data path for Update H using stream of weights +-- Trivial fix point implementation +library ieee; +use ieee.std_logic_1164.all; +use work.pkg_sbs.all; + +entity hu_dp is + generic ( + K : natural := 3; -- additional bits for sum + B : natural := 10); -- bitwidth of input + port ( + clk, rstn : in std_logic; + ctr_hu : in std_logic_vector(BW_HU_CTR-1 downto 0); -- Control for data path + loc_h : out std_logic_vector(ADDR_H_MAX-1 downto 0); -- Current location in H + eps : in std_logic_vector(B-1 downto 0); + wi : in std_logic_vector(B-1 downto 0); -- stream of weights + hi : in std_logic_vector(B-1 downto 0); -- stream of state + ho : out std_logic_vector(B-1 downto 0)); -- stream of states + +end entity hu_dp; + +library ieee; +use ieee.numeric_std.all; + +architecture rtl of hu_dp is + 
+-- Memory + subtype word is std_logic_vector(B-1 downto 0); + type array_as_h_w is array (N_H_MAX-1 downto 0) of word; + + signal mem_hp : array_as_h_w; -- State (internal) + signal mem_hw : array_as_h_w; -- Copy of w*h + signal addr_wr, addr_nxt : std_logic_vector(ADDR_H_MAX-1 downto 0); -- Address + + -- Data path for hp (i.t. h un-normalized) and hw (hp*w) + signal hp_new, hw_nxt : unsigned(2*B-1 downto 0); + + signal hp_new_rg, hp_p, h_eff : std_logic_vector(B-1 downto 0); + signal hw_p : std_logic_vector(B-1 downto 0); + + -- Accumulators for normalization + signal sum_hw, sum_hw_nxt : std_logic_vector(B-1 downto 0); -- Running sum hw + signal sum_hw_p, sum_hw_p_nxt : std_logic_vector(B-1 downto 0); -- Saved sum hw of previous + signal sum_hp, sum_hp_nxt : std_logic_vector(B-1 downto 0); -- Running sum hp + signal sum_hp_p, sum_hp_p_nxt : std_logic_vector(B-1 downto 0); -- Saved sum hw of previous (normalization) + + -- Control signals + signal ctr_sel_ini, ctr_sum_ini, ctr_update_sum, ctr_update_sum2 : std_logic; + signal ctr_addr_rst, ctr_addr_inc, ctr_write_hw : std_logic; + signal ctr_wr_hw, ctr_wr_hp : std_logic; + +begin -- architecture rtlf + + -- Get control signals + ctr_sel_ini <= ctr_hu(0); + ctr_sum_ini <= ctr_hu(1); + ctr_update_sum <= ctr_hu(2); + ctr_addr_rst <= ctr_hu(3); + ctr_addr_inc <= ctr_hu(4); + ctr_write_hw <= ctr_hu(5); + ctr_wr_hw <= ctr_hu(6); + ctr_wr_hp <= ctr_hu(6); + ctr_update_sum2 <= ctr_hu(7); + + + -- Main calculation + hp_new <= unsigned(hp_p) * unsigned(sum_hw_p) + unsigned(sum_hp_p) * unsigned(hw_p) ; + + -- Mux to select first h or saved one + h_eff <= hi when ctr_sel_ini='1' else hp_new_rg; + + -- Calculate hw + hw_nxt <= unsigned(h_eff) * unsigned(wi) ; + + -- Output h (note latency of a complete group) + ho <= h_eff; + + -- Accumulate hw and hp + sum_hw_nxt <= std_logic_vector(hw_nxt(2*B-1 downto B)) when ctr_sum_ini='1' else std_logic_vector(unsigned(sum_hw) + hw_nxt(2*B-1 downto B)); + sum_hw_p_nxt <= (others=>'0') 
when ctr_update_sum='1' else + sum_hw when ctr_update_sum2='1' else sum_hw_p; + + sum_hp_nxt <= h_eff when ctr_sum_ini='1' else std_logic_vector(unsigned(sum_hp) + unsigned(h_eff)); -- Accumulate h + --sum_hp_p_nxt <= sum_hp_nxt * eps when ctr_update_sum='1' else sum_hp_p; + sum_hp_p_nxt <= eps when ctr_update_sum='1' else + std_logic_vector(hp_new(2*B-1 downto B)) when ctr_update_sum2='1' else sum_hp_p; + + + -- Read from memory + --hw_p <= mem_hw(to_integer(unsigned(addr_nxt))); + hw_p <= mem_hw(to_integer(unsigned(addr_nxt))) + when ctr_update_sum2='0' else sum_hp; -- Put sum_hp in mult + hp_p <= mem_hp(to_integer(unsigned(addr_nxt))); + + -- Address calculation + addr_nxt <= (others => '0') when ctr_addr_rst='1' else + std_logic_vector(unsigned(addr_wr) + 1) when ctr_addr_inc='1' else addr_wr; + loc_h <= addr_wr; -- Output for ctrl path + + -- Registers + rg: process (clk, rstn) is + begin -- process pipe1 + if rstn = '0' then + hp_new_rg <= (others=>'0'); + sum_hw <= (others=>'0'); + sum_hw_p <= (others=>'0'); + sum_hp <= (others=>'0'); + sum_hp_p <= (others=>'0'); + addr_wr <= (others => '0'); + elsif clk'event and clk = '1' then + hp_new_rg <= std_logic_vector(hp_new(2*B-1 downto B)); + sum_hw <= sum_hw_nxt; + sum_hw_p <= sum_hw_p_nxt; + sum_hp <= sum_hp_nxt; + sum_hp_p <= sum_hp_p_nxt; + addr_wr <= addr_nxt; + end if; + end process rg; + + + -- Memory + mem: process (clk) is + begin -- process mem + if clk'event and clk = '1' then -- rising clock edge + if ctr_wr_hw='1' then + mem_hw(to_integer(unsigned(addr_wr))) <= std_logic_vector(hw_nxt(2*B-1 downto B)); + end if; + if ctr_wr_hp='1' then + mem_hp(to_integer(unsigned(addr_wr))) <= h_eff; + end if; + end if; + end process mem; + +end architecture rtl; diff --git a/hw/rtl/mua.vhd b/hw/rtl/mua.vhd new file mode 100644 index 0000000..ee348ad --- /dev/null +++ b/hw/rtl/mua.vhd @@ -0,0 +1,52 @@ +-- Simple multiply with adder to check speed + +library ieee; +use ieee.std_logic_1164.all; +use 
ieee.numeric_std.all; + + +entity mua is + + generic ( + B : natural := 10); -- bitwidth + + port ( + clk, arstn : in std_logic; + dt_mv, dt_mc : in std_logic_vector(B-1 downto 0); -- input for multiplicatin + dt_add : in std_logic_vector(2*B-1 downto 0); -- constant to add + dt_mua : out std_logic_vector(2*B-1 downto 0)); -- output + +end entity mua; + + +library ieee; +use ieee.numeric_std.all; + +architecture rtl of mua is + + signal dt_mv_rg, dt_mc_rg : unsigned(B-1 downto 0); + signal dt_add_rg : unsigned(2*B-1 downto 0); + signal dt_mua_rg, dt_mua_nxt : unsigned(2*B-1 downto 0); + +begin -- architecture rtl + + dt_mua_nxt <= dt_mv_rg * dt_mc_rg + dt_add_rg; + + dt_mua <= std_logic_vector(dt_mua_rg); + + reg: process (clk, arstn) is + begin -- process reg + if arstn = '0' then -- asynchronous reset (active low) + dt_mv_rg <= (others=>'0'); + dt_mc_rg <= (others=>'0'); + dt_add_rg <= (others=>'0'); + dt_mua_rg <= (others=>'0'); + elsif clk'event and clk = '1' then -- rising clock edge + dt_mv_rg <= unsigned(dt_mv); + dt_mc_rg <= unsigned(dt_mc); + dt_add_rg <= unsigned(dt_add); + dt_mua_rg <= dt_mua_nxt; + end if; + end process reg; + +end architecture rtl; diff --git a/hw/rtl/mult_unsgn_pp_trunc.vhd b/hw/rtl/mult_unsgn_pp_trunc.vhd new file mode 100644 index 0000000..fd53a35 --- /dev/null +++ b/hw/rtl/mult_unsgn_pp_trunc.vhd @@ -0,0 +1,65 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +entity mult_unsgn_pp_trunc is + + generic ( + BWa : natural := 16; -- Bit width of Multiplier + BWb : natural := 16; + K : natural := 15); -- Vertical truncation + + port ( + da : in std_logic_vector(BWa-1 downto 0); + db : in std_logic_vector(BWb-1 downto 0); + dout : out std_logic_vector(BWa+BWb-1 downto 0)); + +end mult_unsgn_pp_trunc; + +architecture str of mult_unsgn_pp_trunc is + + type stlv_array is array (0 to BWa-1) of std_logic_vector(BWa+BWb-2 downto 0); + signal pp : stlv_array; + --signal pp_res : std_logic_vector(BWa+BWb-1 downto 0); + 
+begin -- str
+
+  ppGen : process (da, db)              -- builds the partial-product rows (combinational)
+    variable ppt : stlv_array;
+  begin
+    ppt := (others => (others => '0')); -- any bit not written below stays '0'
+    -- partial products da(i)db(j) EX (row i is written at bit positions i+j):
+    -- da(0)db(3) da(0)db(2) da(0)db(1) da(0)db(0)
+    -- da(1)db(3) da(1)db(2) da(1)db(1) da(1)db(0)
+    -- da(2)db(3) da(2)db(2) da(2)db(1) da(2)db(0)
+    -- da(3)db(3) da(3)db(2) da(3)db(1) da(3)db(0)
+    for i in 0 to BWa-1 loop
+      for j in 0 to BWb-1 loop
+        if (i+j > K-1) then             -- keep only bits at position i+j >= K
+          ppt(i)(i+j) := da(i) and db(j);
+        end if;
+      end loop;
+    end loop;
+    PP <= ppt;                          -- same signal as pp (VHDL identifiers are case-insensitive)
+  end process ppGen;
+
+
+  CSA_tree : process (pp)               -- adds all rows; NOTE(review): label suggests a carry-save tree, the code is a running sum
+    variable pp_add : std_logic_vector(BWa+BWb-1 downto 0);  -- accumulator, one bit wider than a row
+  begin -- process CSA_tree
+    for i in 0 to BWa-1 loop
+      if i = 0 then
+        pp_add := '0' & pp(0)(BWa+BWb-2 downto 0);  -- start from row 0, zero-extended by one bit
+      else
+        pp_add := std_logic_vector(unsigned('0'&pp(i)(BWa+BWb-2 downto 0)) + unsigned(pp_add));  -- add zero-extended row i
+      end if;
+    end loop; -- i
+    --pp_res <= pp_add;
+    dout <= pp_add;
+  end process CSA_tree;
+
+  -- dout <= pp_res;
+  --dout(BWa+BWb-1 downto 16) <= pp_res(BWa+BWb-1 downto 16);
+  --dout(15 downto 0) <= (others => '0');
+
+end str;
diff --git a/hw/rtl/two_mult_unsgn_pp_trunc.vhd b/hw/rtl/two_mult_unsgn_pp_trunc.vhd
new file mode 100644
index 0000000..39d2cc4
--- /dev/null
+++ b/hw/rtl/two_mult_unsgn_pp_trunc.vhd
@@ -0,0 +1,86 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+-- two_mult_unsgn_pp_trunc: sums the truncated partial-product arrays of da*db and dc*dd in one combinational adder.
+entity two_mult_unsgn_pp_trunc is
+
+  generic (
+    BWa : natural := 16;                -- Bit width of Multiplier (operands da and dc)
+    BWb : natural := 16;                -- Bit width of operands db and dd
+    K   : natural := 15);               -- Vertical truncation: partial-product bits at position i+j < K are left at '0'
+
+  port (
+    da   : in  std_logic_vector(BWa-1 downto 0);      -- first product, operand a
+    db   : in  std_logic_vector(BWb-1 downto 0);      -- first product, operand b
+    dc   : in  std_logic_vector(BWa-1 downto 0);      -- second product, operand a
+    dd   : in  std_logic_vector(BWb-1 downto 0);      -- second product, operand b
+    dout : out std_logic_vector(BWa+BWb downto 0));   -- sum of all kept partial products (BWa+BWb+1 bits)
+
+end two_mult_unsgn_pp_trunc;
+
+architecture str of two_mult_unsgn_pp_trunc is
+
+  type stlv_array is array (0 to 2*BWa-1) of std_logic_vector(BWa+BWb-2 downto 0);  -- BWa rows per product
+  signal pp : stlv_array;               -- rows 0..BWa-1 come from ppGen1, rows BWa..2*BWa-1 from ppGen2
+  --signal pp_res : std_logic_vector(BWa+BWb downto 0);
+
+begin -- str
+
+  ppGen1 : process (da, db)             -- partial products of da and db
+    variable ppt : stlv_array;          -- full-size array; only rows 0..BWa-1 are used in this process
+  begin
+    ppt := (others => (others => '0')); -- any bit not written below stays '0'
+    -- partial products da(i)db(j) EX:
+    -- da(0)db(3) da(0)db(2) da(0)db(1) da(0)db(0)
+    -- da(1)db(3) da(1)db(2) da(1)db(1) da(1)db(0)
+    -- da(2)db(3) da(2)db(2) da(2)db(1) da(2)db(0)
+    -- da(3)db(3) da(3)db(2) da(3)db(1) da(3)db(0)
+    for i in 0 to BWa-1 loop
+      for j in 0 to BWb-1 loop
+        if (i+j > K-1) then             -- keep only bits at position i+j >= K
+          ppt(i)(i+j) := da(i) and db(j);
+        end if;
+      end loop;
+    end loop;
+    PP(0 to BWa-1) <= ppt(0 to BWa-1);  -- this process drives only the lower half of pp
+  end process ppGen1;
+
+  ppGen2 : process (dc, dd)             -- partial products of dc and dd
+    variable ppt : stlv_array;          -- full-size array; only rows 0..BWa-1 are used in this process
+  begin
+    ppt := (others => (others => '0')); -- any bit not written below stays '0'
+    -- partial products dc(i)dd(j) EX:
+    -- dc(0)dd(3) dc(0)dd(2) dc(0)dd(1) dc(0)dd(0)
+    -- dc(1)dd(3) dc(1)dd(2) dc(1)dd(1) dc(1)dd(0)
+    -- dc(2)dd(3) dc(2)dd(2) dc(2)dd(1) dc(2)dd(0)
+    -- dc(3)dd(3) dc(3)dd(2) dc(3)dd(1) dc(3)dd(0)
+    for i in 0 to BWa-1 loop
+      for j in 0 to BWb-1 loop
+        if (i+j > K-1) then             -- keep only bits at position i+j >= K
+          ppt(i)(i+j) := dc(i) and dd(j);
+        end if;
+      end loop;
+    end loop;
+    PP(BWa to 2*BWa-1) <= ppt(0 to BWa-1);  -- this process drives only the upper half of pp
+  end process ppGen2;
+
+
+  CSA_tree : process (pp)               -- adds all 2*BWa rows; NOTE(review): label suggests a carry-save tree, the code is a running sum
+    variable pp_add : std_logic_vector(BWa+BWb downto 0);  -- accumulator, two bits wider than a row
+  begin -- process CSA_tree
+    for i in 0 to 2*BWa-1 loop
+      if i = 0 then
+        pp_add := "00" & pp(0)(BWa+BWb-2 downto 0);  -- start from row 0, zero-extended by two bits
+      else
+        pp_add := std_logic_vector(unsigned(pp(i)(BWa+BWb-2 downto 0)) + unsigned(pp_add));  -- row is narrower than pp_add; relies on numeric_std "+" returning the wider operand's length
+      end if;
+    end loop; -- i
+    --pp_res <= pp_add;
+    dout <= pp_add;
+  end process CSA_tree;
+
+  -- dout <= pp_res;
+  --dout(BWa+BWb downto 16) <= pp_res(BWa+BWb-1 downto 16);
+  --dout(15 downto 0) <= (others => '0');
+
+end str;
diff --git a/hw/sbs_hw.org b/hw/sbs_hw.org
new file mode 100644
index 0000000..07922e1
--- /dev/null
+++ b/hw/sbs_hw.org
@@ -0,0 +1,29 @@
+#+TITLE: sbs_hw.org
+
+* Idea
+ Start with HW implementation.
+
+* Modules
+ hu_ : h update
+ wg_ : weight generator
+ sg_ : spikes generator
+
+* HU
+This block updates H according to the SbS equations.
+- First version done.
+- Need to decide if reciprocal or multiplication + + +* WG +This block generates a stream of weights given a stream of spikes +- instead of a stream of spikes scanned per region, it could be better to receive a spike identifier, and + then all the edges (from location to location) that use this particular spike. + + + + +* New strategy for read + +When doing a convolution, we can read + +