Add initial code from old version of SbS core

This commit is contained in:
Alberto Garcia-Ortiz 2024-10-20 19:03:44 +02:00
parent 768d16789e
commit a365b9b01d
20 changed files with 1332 additions and 0 deletions

57
hw/beh/hu.vhd Normal file
View file

@ -0,0 +1,57 @@
-- hu
-- Update H using stream of weights
use work.pkg_sbs.all;

-- hu: top level of the H-update unit.
-- The entity only exposes the streaming interface; the visible architecture
-- wires a control path (hu_ctr) to a data path (hu_dp).
entity hu is
  port (
    clk    : in  bit;
    rstn   : in  bit;
    cfg_hu : in  bit_vector(BW_HU_CFG-1 downto 0);  -- configuration word
    ena_w  : in  bit;   -- a new weight is present on wi
    is_ini : in  bit;   -- first vector (w and h are both taken when enabled)
    is_fst : in  bit;   -- first component of a vector
    ena_ho : out bit;   -- ho carries a valid value
    wi     : in  real;  -- incoming stream of weights
    hi     : in  real;  -- incoming stream of state values
    ho     : out real   -- outgoing stream of state values
  );
end entity hu;
-- Structural architecture: instantiates the control path and the data path
-- and connects them through eps, loc_h and the ctr_hu control bus.
architecture rtlf of hu is
  signal eps    : real;                                -- hu_ctr -> hu_dp
  signal loc_h  : bit_vector(ADDR_H_MAX-1 downto 0);   -- hu_dp  -> hu_ctr
  signal ctr_hu : bit_vector(BW_HU_CTR-1 downto 0);    -- hu_ctr -> hu_dp
begin  -- architecture rtlf

  -- Control path: turns the stream qualifiers into data-path controls
  i_hu_ctr : entity work.hu_ctr
    port map (
      clk    => clk,
      rstn   => rstn,
      cfg_hu => cfg_hu,
      ena_w  => ena_w,
      is_ini => is_ini,
      is_fst => is_fst,
      loc_h  => loc_h,
      ena_ho => ena_ho,
      eps    => eps,
      ctr_hu => ctr_hu);

  -- Data path: consumes wi/hi and produces ho
  i_hu_dp : entity work.hu_dp
    port map (
      clk    => clk,
      rstn   => rstn,
      ctr_hu => ctr_hu,
      loc_h  => loc_h,
      eps    => eps,
      wi     => wi,
      hi     => hi,
      ho     => ho);

end architecture rtlf;

72
hw/beh/hu_ctr.vhd Normal file
View file

@ -0,0 +1,72 @@
-- hu_ctr
-- Control path for Update H using stream of weights
use work.pkg_sbs.all;

-- hu_ctr: control path of the H-update unit.
-- Receives the stream qualifiers and produces the control bus, eps and the
-- output-valid flag for the data path.
entity hu_ctr is
  port (
    clk    : in  bit;
    rstn   : in  bit;
    cfg_hu : in  bit_vector(BW_HU_CFG-1 downto 0);   -- configuration word
    ena_w  : in  bit;   -- a new weight is present
    is_ini : in  bit;   -- first vector (w and h are both taken when enabled)
    is_fst : in  bit;   -- first component of a vector
    loc_h  : in  bit_vector(ADDR_H_MAX-1 downto 0);  -- current location in H
    ena_ho : out bit;   -- ho carries a valid value
    eps    : out real;
    ctr_hu : out bit_vector(BW_HU_CTR-1 downto 0)    -- control bus for the data path
  );
end entity hu_ctr;
library ieee;
use ieee.numeric_bit.all;

-- Behavioural ("first approximation") control path for hu.
--
-- Most controls are plain combinational copies of the stream qualifiers.
-- ctr_update_sum is a delayed copy of is_fst built with a transport delay,
-- so this architecture is a simulation model, not synthesisable RTL.
--
-- Layout of ctr_hu (bits 8 and above are left undriven, i.e. '0'):
--   0 ctr_sel_ini   1 ctr_sum_ini   2 ctr_update_sum   3 ctr_addr_rst
--   4 ctr_addr_inc  5 ctr_write_hw  6 ctr_wr_hw        7 ctr_update_sum2
architecture beh of hu_ctr is
  signal ctr_sel_ini, ctr_sum_ini, ctr_update_sum, ctr_update_sum2 : bit;
  signal ctr_addr_rst, ctr_addr_inc, ctr_write_hw : bit;  -- ctr_write_hw is never driven here (stays '0')
  signal ctr_wr_hw, ctr_wr_hp : bit;

  -- Last location in H (currently fixed). The previous value
  -- to_unsigned(8, ADDR_H_MAX) did not fit in ADDR_H_MAX = 3 bits and was
  -- truncated to 0; the last valid index is N_H_MAX-1.
  -- Only referenced by the commented-out alternative for ctr_update_sum.
  constant MAX_LOC_H : bit_vector(ADDR_H_MAX-1 downto 0) :=
    bit_vector(to_unsigned(N_H_MAX-1, ADDR_H_MAX));

  -- Clock period assumed by the transport delay below. It was commented out
  -- although still used, which made the unit fail to compile.
  -- NOTE(review): must equal the clock period of the surrounding simulation
  -- (tst_hu uses 10 ns).
  constant T : time := 10 ns;
begin  -- architecture beh

  eps <= 0.2;

  -- Pack the individual controls into the bus read by the data path
  ctr_hu(0) <= ctr_sel_ini;
  ctr_hu(1) <= ctr_sum_ini;
  ctr_hu(2) <= ctr_update_sum;
  ctr_hu(3) <= ctr_addr_rst;
  ctr_hu(4) <= ctr_addr_inc;
  ctr_hu(5) <= ctr_write_hw;
  ctr_hu(6) <= ctr_wr_hw;
  --ctr_hu(7) <= ctr_wr_hp ; -- ctr_wr_hp and ctr_wr_hw are the same
  ctr_hu(7) <= ctr_update_sum2;

  -- Code in first approximation
  ctr_sel_ini <= is_ini;
  ctr_wr_hp   <= ena_w;  --is_ini;
  ctr_wr_hw   <= ena_w;
  ctr_sum_ini <= is_fst;

  -- is_fst delayed by seven clock periods
  ctr_update_sum <= transport is_fst after 7*T;
  --ctr_update_sum2 <= transport ctr_update_sum after T;
  --ctr_update_sum <= '1' when (loc_h = MAX_LOC_H) else '0';

  ctr_addr_rst <= ctr_update_sum;
  ctr_addr_inc <= ena_w and not ctr_addr_rst;

  -- An output is flagged for every weight except during the initial vector
  ena_ho <= ena_w and not is_ini;

  -- ctr_update_sum2 is ctr_update_sum delayed by one clock cycle
  rg : process (clk, rstn) is
  begin
    if rstn = '0' then                    -- asynchronous reset (active low)
      ctr_update_sum2 <= '0';
    elsif clk'event and clk = '1' then    -- rising clock edge
      ctr_update_sum2 <= ctr_update_sum;
    end if;
  end process rg;

end architecture beh;

124
hw/beh/hu_dp.vhd Normal file
View file

@ -0,0 +1,124 @@
-- hu_dp
-- Data path for Update H using stream of weights
use work.pkg_sbs.all;

-- hu_dp: data path of the H-update unit (behavioural, real-valued ports).
entity hu_dp is
  port (
    clk    : in  bit;
    rstn   : in  bit;
    ctr_hu : in  bit_vector(BW_HU_CTR-1 downto 0);   -- control bus from the control path
    loc_h  : out bit_vector(ADDR_H_MAX-1 downto 0);  -- current location in H
    eps    : in  real;
    wi     : in  real;  -- incoming stream of weights
    hi     : in  real;  -- incoming stream of state values
    ho     : out real   -- outgoing stream of state values
  );
end entity hu_dp;
library ieee;
use ieee.numeric_bit.all;
-- Behavioural (real-valued) data path of the H update.
--
-- Per element the un-normalized update is
--   hp_new = hp_p * sum_hw_p + sum_hp_p * hw_p
-- where hp_p / hw_p are read from the two memories at addr_nxt and
-- sum_hw_p / sum_hp_p are values saved from the previous vector.
-- The saved values are produced in two steps:
--   ctr_update_sum  = '1': sum_hw_p <- 0.0, sum_hp_p <- eps
--   ctr_update_sum2 = '1': hw_p is replaced by sum_hp, so hp_new evaluates to
--                          hp_p * sum_hw_p + sum_hp_p * sum_hp; that value is
--                          stored in sum_hp_p while sum_hw_p takes sum_hw.
architecture rtlf of hu_dp is
-- Memory (both indexed with ADDR_H_MAX-bit addresses)
signal mem_hp : array_as_h; -- State (internal): stores h_eff
signal mem_hw : array_as_h; -- Copy of w*h: stores hw_nxt
signal addr_wr, addr_nxt : bit_vector(ADDR_H_MAX-1 downto 0); -- Write address (registered) and its next value (used for reads)
-- Data path for hp (i.e. h un-normalized) and hw (hp*w)
signal hp_new, hp_new_rg, hp_p, h_eff : real := 0.0;
signal hw_p, hw_nxt : real := 0.0;
-- Accumulators for normalization
signal sum_hw, sum_hw_nxt : real := 0.0; -- Running sum hw
signal sum_hw_p, sum_hw_p_nxt : real := 0.0; -- Saved sum hw of previous vector
signal sum_hp, sum_hp_nxt : real := 0.0; -- Running sum hp
signal sum_hp_p, sum_hp_p_nxt : real := 0.0; -- Saved hp term of previous vector: loaded with eps, then with hp_new (normalization)
-- Control signals
signal ctr_sel_ini, ctr_sum_ini, ctr_update_sum, ctr_update_sum2 : bit;
signal ctr_addr_rst, ctr_addr_inc, ctr_write_hw : bit;
signal ctr_wr_hw, ctr_wr_hp : bit;
begin -- architecture rtlf
-- Get control signals (unpack the control bus)
ctr_sel_ini <= ctr_hu(0);
ctr_sum_ini <= ctr_hu(1);
ctr_update_sum <= ctr_hu(2);
ctr_addr_rst <= ctr_hu(3);
ctr_addr_inc <= ctr_hu(4);
ctr_write_hw <= ctr_hu(5); -- unpacked but not used in this architecture
ctr_wr_hw <= ctr_hu(6);
ctr_wr_hp <= ctr_hu(6); -- same bit as ctr_wr_hw: both memories are written together
ctr_update_sum2 <= ctr_hu(7);
-- Main calculation
hp_new <= hp_p * sum_hw_p + sum_hp_p * hw_p ;
-- Mux to select first h (from the input) or the saved (registered) one
h_eff <= hi when ctr_sel_ini='1' else hp_new_rg;
-- Calculate hw
hw_nxt <= h_eff * wi ;
-- Output h (note latency of a complete group)
ho <= h_eff;
-- Accumulate hw and hp; ctr_sum_ini restarts both running sums
sum_hw_nxt <= hw_nxt when ctr_sum_ini='1' else sum_hw + hw_nxt;
sum_hw_p_nxt <= 0.0 when ctr_update_sum='1' else
sum_hw when ctr_update_sum2='1' else sum_hw_p;
sum_hp_nxt <= h_eff when ctr_sum_ini='1' else sum_hp + h_eff; -- Accumulate h
--sum_hp_p_nxt <= sum_hp_nxt * eps when ctr_update_sum='1' else sum_hp_p;
sum_hp_p_nxt <= eps when ctr_update_sum='1' else
hp_new when ctr_update_sum2='1' else sum_hp_p;
-- Read from memory (combinational read at the next address)
--hw_p <= mem_hw(to_integer(unsigned(addr_nxt)));
hw_p <= mem_hw(to_integer(unsigned(addr_nxt)))
when ctr_update_sum2='0' else sum_hp; -- Put sum_hp in mult
hp_p <= mem_hp(to_integer(unsigned(addr_nxt)));
-- Address calculation: reset has priority over increment
addr_nxt <= (others => '0') when ctr_addr_rst='1' else
bit_vector(unsigned(addr_wr) + 1) when ctr_addr_inc='1' else addr_wr;
loc_h <= addr_wr; -- Output for ctrl path
-- Registers (asynchronous, active-low reset)
rg: process (clk, rstn) is
begin -- process rg
if rstn = '0' then
hp_new_rg <= 0.0;
sum_hw <= 0.0;
sum_hw_p <= 0.0;
sum_hp <= 0.0;
sum_hp_p <= 0.0;
addr_wr <= (others => '0');
elsif clk'event and clk = '1' then
hp_new_rg <= hp_new;
sum_hw <= sum_hw_nxt;
sum_hw_p <= sum_hw_p_nxt;
sum_hp <= sum_hp_nxt;
sum_hp_p <= sum_hp_p_nxt;
addr_wr <= addr_nxt;
end if;
end process rg;
-- Memory: synchronous write at addr_wr, no reset
mem: process (clk) is
begin -- process mem
if clk'event and clk = '1' then -- rising clock edge
if ctr_wr_hw='1' then
mem_hw(to_integer(unsigned(addr_wr))) <= hw_nxt;
end if;
if ctr_wr_hp='1' then
mem_hp(to_integer(unsigned(addr_wr))) <= h_eff;
end if;
end if;
end process mem;
end architecture rtlf;

49
hw/beh/mem_sync.vhd Normal file
View file

@ -0,0 +1,49 @@
-- Implementation of a synchronous single port memory
library ieee;
use ieee.numeric_bit.all;

-- mem_sync: synchronous single-port memory of real values with 2**BA words.
entity mem_sync is
  generic (
    BA : natural := 7                       -- log2 of the number of addresses
  );
  port (
    clk  : in  bit;
    wr   : in  bit;                         -- write request
    rd   : in  bit;                         -- read request
    addr : in  bit_vector(BA-1 downto 0);
    dti  : in  real;                        -- data to write
    dto  : out real                         -- data read
  );
end entity mem_sync;
library ieee;
use ieee.numeric_bit.all;

-- Behavioural model of the memory.
-- The address is registered on every rising edge; the write and the read of
-- a given edge both use the address captured at the previous edge (addr_rg),
-- and the read data is itself registered in dto.
architecture beh of mem_sync is
  signal addr_rg : unsigned(BA-1 downto 0);
begin  -- architecture beh

  mem : process (clk) is
    constant N_WORDS : natural := 2**(addr'length);
    type storage_t is array (0 to N_WORDS-1) of real;
    variable storage : storage_t;
  begin  -- process mem
    if rising_edge(clk) then
      -- capture the address for the next edge
      addr_rg <= unsigned(addr);

      -- write first, so a simultaneous read returns the new data
      if wr = '1' then
        storage(to_integer(addr_rg)) := dti;
      end if;

      if rd = '1' then
        dto <= storage(to_integer(addr_rg));
      end if;
    end if;
  end process mem;

end architecture beh;
-- Local Variables:
-- compile-command: "ghdl -a --std=00 --workdir=../do_sim/ mem_sync.vhd"
-- End:

15
hw/beh/pkg_sbs.vhd Normal file
View file

@ -0,0 +1,15 @@
-- pkg_sbs: constants and types shared by the SbS hardware blocks.
package pkg_sbs is

  -- Bus widths of the hu block
  constant BW_HU_CTR : natural := 10;  -- bits of the control bus
  constant BW_HU_CFG : natural := 15;  -- bits of the configuration bus

  -- Size of H
  constant N_H_MAX    : natural := 8;  -- maximum number of elements in H
  constant ADDR_H_MAX : natural := 3;  -- log2 of N_H_MAX, i.e. width of the
                                       -- H address bus

  --subtype hu_ctr is bit_vector 4 downto 0;

  -- Vector of reals with the maximum size of H
  type array_as_h is array (N_H_MAX-1 downto 0) of real;

end package pkg_sbs;

99
hw/beh/pkg_ufp.vhd Normal file
View file

@ -0,0 +1,99 @@
-- Helper functions for unsigned floating-point (ufp) numbers
library ieee;
use ieee.std_logic_1164.all;

package pkg_ufp is

  -- Word layout (BW bits in total):
  --   [ ee ][ mm ]
  -- ee uses BE bits, mm uses the remaining BW-BE bits; EO is the offset of
  -- the exponent.
  --   b(ee) in EO-[0..2**BE-1]
  --   b(mm) in [0..2**(BW-BE)-1]/2**(BW-BE)

  -- Decode an unsigned floating-point word into a real
  function ufp_to_real (
    ee_mm : std_logic_vector;  -- word in exponent_mantissa bit format
    BW    : natural;           -- total bit width
    BE    : natural;           -- bits used by the exponent
    EO    : natural)           -- exponent offset
    return real;

  -- Encode a real as an unsigned floating-point word
  function real_to_ufp (
    r  : real;                 -- value to encode
    BW : natural;              -- total bit width
    BE : natural;              -- bits used by the exponent
    EO : natural)              -- exponent offset
    return std_logic_vector;

end package pkg_ufp;
library ieee;
use ieee.numeric_std.all;
use ieee.math_real.all;

package body pkg_ufp is

  -- Decode an unsigned floating-point word:
  --   value = mm * 2**(EO - ee - (BW-BE))
  -- ee_mm must be BW bits long; its index range may have any bounds or
  -- direction (e.g. a literal or the result of a concatenation).
  function ufp_to_real (
    ee_mm : std_logic_vector;  -- Data in format exponent_mantissa as bits
    BW    : natural;           -- Bit width
    BE    : natural;           -- Number of bits used for exponent
    EO    : natural)           -- Offset of exponent
    return real
  is
    -- Normalized view of the argument. Slicing ee_mm directly with
    -- "downto" fails when the actual has an ascending index range.
    alias bits : std_logic_vector(BW-1 downto 0) is ee_mm;
    variable mm : unsigned(BW-BE-1 downto 0);
    variable ee : unsigned(BE-1 downto 0);
    variable d  : real;
  begin
    ee := unsigned(bits(BW-1 downto BW-BE));
    mm := unsigned(bits(BW-BE-1 downto 0));
    d  := real(to_integer(mm)) * 2.0**real(EO-to_integer(ee)-(BW-BE));
    return d;
  end function ufp_to_real;

  -- Encode a real as an unsigned floating-point word.
  -- * Values below the smallest non-zero code give exponent all ones and
  --   mantissa zero.
  -- * Values above the largest code saturate to the largest code (exponent
  --   field 0, mantissa all ones) instead of wrapping around.
  -- The result is indexed (BW-1 downto 0).
  function real_to_ufp (
    r  : real;                 -- Real number to convert
    BW : natural;              -- Bit width
    BE : natural;              -- Number of bits used for exponent
    EO : natural)              -- Offset of exponent
    return std_logic_vector
  is
    variable BO, MAX_EXP, BM : integer;
    variable R_MIN, BM_LIM   : real;
    variable MM_MAX          : real;     -- largest mantissa value
    variable scaled          : real;     -- rounded r * 2**aa
    variable mm    : unsigned(BW-BE-1 downto 0);
    variable ee    : unsigned(BE-1 downto 0);
    variable aa    : integer;  -- Scaling to normalize r into ufp representation
    variable mm_id : integer;
    variable res   : std_logic_vector(BW-1 downto 0);
  begin
    BO      := BW-BE-EO;                 -- Exponent of scaling factor
    BM      := BW-BE;                    -- Bits for mantissa
    MM_MAX  := 2.0**BM - 1.0;
    BM_LIM  := log2(MM_MAX);             -- A bit less than BM
    MAX_EXP := 2**BE-1;                  -- Max exponent
    R_MIN   := 2.0**(-MAX_EXP-BO);       -- Min value (not equal zero)

    if r < R_MIN then                    -- If too small, set to zero
      ee := (others => '1');
      mm := (others => '0');
    else
      -- Largest scaling that keeps r * 2**aa within the mantissa range,
      -- limited to the scalings the exponent field can express.
      aa := integer(floor(BM_LIM - log2(r)));
      if aa < BO then
        aa := BO;
      end if;
      if aa > MAX_EXP+BO then
        aa := MAX_EXP+BO;
      end if;

      -- When aa was clamped to BO the input is above the representable
      -- range. Saturate on the real value before converting, so that
      -- neither the integer conversion nor to_unsigned can overflow.
      scaled := round(r * 2.0**aa);
      if scaled > MM_MAX then
        scaled := MM_MAX;
      end if;
      mm_id := integer(scaled);

      ee := to_unsigned(aa-BO, ee'length);
      mm := to_unsigned(mm_id, mm'length);
    end if;

    res := std_logic_vector(ee) & std_logic_vector(mm);
    return res;
  end function real_to_ufp;

end package body pkg_ufp;

126
hw/beh/tst_hu.vhd Normal file
View file

@ -0,0 +1,126 @@
-- tst_hu
-- Testbench for Update H using stream of weights
use work.pkg_sbs.all;

entity tst_hu is
end entity tst_hu;

-- Testbench for hu: feeds an initial state vector and three weight vectors
-- (twice) and compares the values seen on ho, while ena_ho is high, against
-- reference values from the Python model.
-- The simulation is stopped on purpose with a "failure" report.
architecture tst of tst_hu is
  constant T : time := 10 ns;           -- Period
  signal clk, rstn : bit := '0';
  signal cfg_hu : bit_vector(BW_HU_CFG -1 downto 0);
  signal ena_w  : bit;
  signal is_ini : bit;
  signal is_fst : bit;
  signal ena_ho : bit;
  signal wi : real := 0.0;
  signal hi : real := 0.0;
  signal ho : real;
begin  -- architecture tst

  -- Clock and reset
  clk  <= not clk after T/2;
  rstn <= '0', '1' after T/2+T/4;

  -- Unit under test
  i_hu : entity work.hu
    port map (
      clk    => clk,
      rstn   => rstn,
      cfg_hu => cfg_hu,
      ena_w  => ena_w,
      is_ini => is_ini,
      is_fst => is_fst,
      ena_ho => ena_ho,
      wi     => wi,
      hi     => hi,
      ho     => ho);

  -- Checker: compares each flagged output with the next reference value
  process (clk) is
    type array_sol is array (natural range <>) of real;
    -- Example of solution from python
    constant h_sol : array_sol := (
      --0.1 , 0.2 , 0.3 , 0.0 , 0.01, 0.01, 0.1 , 0.28,
      0.01628 , 0.02056 , 0.03144 , 0.0 , 0.001228, 0.001588, 0.01228 , 0.039984,
      1.28343706e-04, 1.62085171e-04, 3.17669760e-04, 0.00000000e+00, 1.24077120e-05, 1.99630656e-05, 1.84671552e-04, 3.05349811e-04,
      2.17637063e-08, 2.74853687e-08, 5.38684101e-08, 0.00000000e+00, 2.10402060e-09, 3.38520923e-09, 3.13154230e-08, 5.17792718e-08
      );
    variable idx : natural;
  begin  -- process
    if clk'event and clk = '1' then     -- rising clock edge
      if ena_ho='1' then
        -- Check while reference values remain. The bound used to be
        -- h_sol'length-1, which skipped the last reference value.
        if idx < h_sol'length then
          if abs(ho-h_sol(idx)) > 1.0e-09 then
            report LF & ESC & "[31;1m [ERROR] h_sol= " & real'image(h_sol(idx)) & ESC & "[0m" & LF severity error;
          end if;
          idx := idx+1;
        end if;
        report LF & "[INFO] h_exp= " & real'image(ho) & LF severity note;
      end if;
    end if;
  end process;

  -- Stimulus: initial state vector on hi, flagged with is_ini
  process is
    constant h : array_as_h := (0.1, 0.2, 0.3, 0.0, 0.01, 0.01, 0.1, 0.28);
  begin  -- process
    hi     <= 0.0;
    is_ini <= '0';
    wait for T/4 + T/2 + T;
    for n in 0 to 1 loop
      for ki in h'range loop
        hi     <= h(ki);
        is_ini <= '1';
        wait for T;
      end loop;  -- ki
      is_ini <= '0';
      hi     <= 0.0;
      --wait for T*(h'length+1)*3; -- note +1 for void cycle
      wait for T*(2+(h'length+2)*3);    -- note +2 for void cycle
    end loop;  -- n
    wait;
  end process;

  -- Stimulus: weight vectors on wi, flagged with ena_w / is_fst
  process is
    constant w0 : array_as_h := (0.3, 0.0, 0.01, 0.01, 0.1, 0.28, 0.1, 0.2);
    constant w1 : array_as_h := (0.01, 0.01, 0.1, 0.28, 0.1, 0.2, 0.3, 0.0);
    constant w2 : array_as_h := (0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125);
    type array_w is array (0 to 2) of array_as_h;
    constant w : array_w := (w0, w1, w2);
  begin  -- process
    ena_w <= '0';
    wi    <= 0.0;
    wait for T/4 + T/2 + T;
    for n in 0 to 1 loop
      for kj in w'range loop
        is_fst <= '1', '0' after T;     -- one-cycle pulse at the first component
        for ki in w0'range loop
          ena_w <= '1';
          wi    <= w(kj)(ki);
          wait for T;
        end loop;  -- ki
        ena_w <= '0';                   -- void cycle
        wait for 2*T;
      end loop;  -- kj
    end loop;  -- n
    ena_w <= '0';
    wait for 4*T;
    report LF & LF & ESC & "[35;1m [TST] End simulation" & ESC & "[0m" & LF severity failure;
  end process;

end architecture tst;

136
hw/beh/tst_pkg_ufp.vhd Normal file
View file

@ -0,0 +1,136 @@
use work.pkg_ufp.all;

entity tst_pkg_ufp is
end entity tst_pkg_ufp;

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

-- Testbench for pkg_ufp: prints the result of decoding / re-encoding a set
-- of corner codes and of encoding / decoding a ramp of real values.
-- The results are only reported, not checked automatically.
architecture tst of tst_pkg_ufp is
begin  -- architecture tst
  process
    -- Define params of ufp
    constant BW : natural := 10;
    constant BE : natural := 3;
    constant EO : natural := 1;
    constant BO : natural := BW - BE -EO;
    variable r, r2 : real;
    variable ee_mm : std_logic_vector(BW-1 downto 0);
    variable mm    : std_logic_vector(BW-1-BE downto 0);
    variable ee    : std_logic_vector(BE-1 downto 0);
    -- Ramp: N values starting at r_ini, spaced by r_inc
    constant r_ini : real := 0.0125;
    constant r_inc : real := 0.0125/4.0;
    constant N     : natural := 10;

    -- Report a real value next to the exponent / mantissa fields of a code
    procedure print (
      ar  : in real;
      aee : in std_logic_vector(BE-1 downto 0);
      amm : in std_logic_vector(BW-1-BE downto 0)) is
    begin
      report LF & "[TST] " &
        "r=" & real'image(ar) & HT & HT &
        "ufp= " & integer'image(to_integer(unsigned(amm))) &
        " *2**(-" & integer'image(BO) & "- " & integer'image(to_integer(unsigned(aee))) & " )"
        severity note;
    end procedure;

    -- Decode the code (aee, amm), then encode the result again.
    -- The first line shows the original code and its value; the second line
    -- shows the re-encoded code and its value. It used to repeat the
    -- original fields, so the round-trip result was never visible.
    procedure tst_conv_r (
      aee : in std_logic_vector(BE-1 downto 0);
      amm : in std_logic_vector(BW-1-BE downto 0)) is
      variable ar     : real;
      variable aee_mm : std_logic_vector(BW-1 downto 0);
    begin  -- procedure tst_conv_r
      aee_mm := aee & amm;
      ar     := ufp_to_real(aee_mm, BW, BE, EO);
      print(ar, aee, amm);
      aee_mm := real_to_ufp(ar, BW, BE, EO);
      print(ufp_to_real(aee_mm, BW, BE, EO),
            aee_mm(BW-1 downto BW-BE),
            aee_mm(BW-BE-1 downto 0));
      report LF & "[TST] ----------------------------------" severity note;
    end procedure tst_conv_r;

    -- Corner patterns: all zeros, single low bits set, all ones, single low bits cleared
    constant ee_0min : std_logic_vector(BE-1 downto 0) := (others => '0');
    constant ee_1min : std_logic_vector(BE-1 downto 0) := (0 => '1', others => '0');
    constant ee_2min : std_logic_vector(BE-1 downto 0) := (1 => '1', others => '0');
    constant ee_0max : std_logic_vector(BE-1 downto 0) := (others => '1');
    constant ee_1max : std_logic_vector(BE-1 downto 0) := (0 => '0', others => '1');
    constant ee_2max : std_logic_vector(BE-1 downto 0) := (1 => '0', others => '1');
    constant mm_0min : std_logic_vector(BW-1-BE downto 0) := (others => '0');
    constant mm_1min : std_logic_vector(BW-1-BE downto 0) := (0 => '1', others => '0');
    constant mm_2min : std_logic_vector(BW-1-BE downto 0) := (1 => '1', others => '0');
    constant mm_0max : std_logic_vector(BW-1-BE downto 0) := (others => '1');
    constant mm_1max : std_logic_vector(BW-1-BE downto 0) := (0 => '0', others => '1');
    constant mm_2max : std_logic_vector(BW-1-BE downto 0) := (1 => '0', others => '1');
  begin
    if true then
      report LF & "[TST] Test corner examples =============================" severity note;
      -- Conversion from ee_mm to real
      tst_conv_r(ee_0max, mm_0min);
      tst_conv_r(ee_0max, mm_1min);
      tst_conv_r(ee_0max, mm_2min);
      tst_conv_r(ee_0max, mm_2max);
      tst_conv_r(ee_0max, mm_1max);
      tst_conv_r(ee_0max, mm_0max);
      tst_conv_r(ee_1max, mm_0min);
      tst_conv_r(ee_1max, mm_1min);
      tst_conv_r(ee_1max, mm_2min);
      tst_conv_r(ee_1max, mm_2max);
      tst_conv_r(ee_1max, mm_1max);
      tst_conv_r(ee_1max, mm_0max);
      tst_conv_r(ee_1min, mm_0min);
      tst_conv_r(ee_1min, mm_1min);
      tst_conv_r(ee_1min, mm_2min);
      tst_conv_r(ee_1min, mm_2max);
      tst_conv_r(ee_1min, mm_1max);
      tst_conv_r(ee_1min, mm_0max);
      tst_conv_r(ee_0min, mm_0min);
      tst_conv_r(ee_0min, mm_1min);
      tst_conv_r(ee_0min, mm_2min);
      tst_conv_r(ee_0min, mm_2max);
      tst_conv_r(ee_0min, mm_1max);
      tst_conv_r(ee_0min, mm_0max);
    end if;
    if true then
      report LF & "[TST] Test ramp =============================" severity note;
      -- Conversion from real to ee_mm
      r := r_ini;
      for ki in 0 to N-1 loop
        ee_mm := real_to_ufp(r, BW, BE, EO);
        ee    := ee_mm(BW-1 downto BW-BE);
        mm    := ee_mm(BW-BE-1 downto 0);
        print(r, ee, mm);
        r2 := ufp_to_real(ee_mm, BW, BE, EO);
        print(r2, ee, mm);
        report LF & "[TST] ----------------------------------" severity note;
        r := r + r_inc;
      end loop;  -- ki
    end if;
    -- (a stray backtick on this line used to break compilation)
    wait;
  end process;
end architecture tst;

126
hw/beh/wg_mem.vhd Normal file
View file

@ -0,0 +1,126 @@
-- wg_mem
--
-- Generate weights using stream of idx
--
-- Inputs are spike index and location of kernel to read
--
-- Current implementation assumes that all weights are cached
-- and that sizes of and KI and KO are powers of 2
use work.pkg_sbs.all;

-- wg_mem: weight generator backed by a memory.
-- The three generics are log2 sizes; together they define the width of the
-- internal memory address.
entity wg_mem is
  generic (
    LOG2_H  : natural := 2;   -- size of H (number of output IPs per
                              -- output location)
    LOG2_KI : natural := 4;   -- number of IPs per input (thus spike index)
    LOG2_KO : natural := 3    -- number of connections from IPi block to
                              -- IPo block (thus, number of output IPs of
                              -- full connected, kernel size in conv)
  );
  port (
    clk  : in bit;
    rstn : in bit;
    -- Initial update
    do_init_str : in bit;     -- first step of the init process
    do_init_nxt : in bit;     -- next step of the init process
    w_init      : in real;    -- weight value to store
    -- Normal operation
    idx      : in  bit_vector(LOG2_KI-1 downto 0);  -- index of spike
    pos      : in  bit_vector(LOG2_KO-1 downto 0);  -- location of output (edge, kernel)
    ena_idx  : in  bit;
    busy_idx : out bit;
    ena_w    : out bit;       -- a weight is being sent
    w        : out real       -- stream of weights
  );
end entity wg_mem;
library ieee;
use ieee.numeric_bit.all;

-- Weight generator: three down-counters (i, idx, pos) form the address
-- pos & idx & i of a synchronous memory of real weights.
--
-- * Normal mode: ena_idx loads idx/pos from the ports and restarts i at
--   I_LAST; i then counts down while busy_rg is set, and busy_rg is cleared
--   once i_rg has reached zero.
-- * Init mode: do_init_str sets all counters to their last value and
--   do_init_nxt steps them as nested counters (i fastest, pos slowest)
--   while w_init is written to the memory.
architecture rtl of wg_mem is
  signal busy_rg, busy_nxt, i_done, idx_done, pos_done : bit;
  signal idx_rg, idx_nxt : unsigned(LOG2_KI-1 downto 0);
  signal pos_rg, pos_nxt : unsigned(LOG2_KO-1 downto 0);
  signal i_rg, i_nxt     : unsigned(LOG2_H-1 downto 0);
  -- All these params could be configurable..
  constant I_LAST   : unsigned(LOG2_H-1 downto 0)  := (others=>'1');
  constant IDX_LAST : unsigned(LOG2_KI-1 downto 0) := (others=>'1');
  constant POS_LAST : unsigned(LOG2_KO-1 downto 0) := (others=>'1');
  constant I_ZERO   : unsigned(LOG2_H-1 downto 0)  := (others=>'0');
  constant IDX_ZERO : unsigned(LOG2_KI-1 downto 0) := (others=>'0');
  constant POS_ZERO : unsigned(LOG2_KO-1 downto 0) := (others=>'0');
  -- Memory
  signal mem_addr, mem_addr_nxt : unsigned(LOG2_H+LOG2_KI+LOG2_KO-1 downto 0);
  signal mem_wr, mem_rd : bit;
begin  -- architecture rtl

  busy_idx <= busy_rg;
  ena_w    <= busy_rg;

  -- Terminal-count flags of the registered counters
  i_done   <= '1' when i_rg   = I_ZERO   else '0';
  idx_done <= '1' when idx_rg = IDX_ZERO else '0';
  pos_done <= '1' when pos_rg = POS_ZERO else '0';  -- not used further in this architecture

  busy_nxt <= '1' when ena_idx='1' else
              '0' when i_done='1' else
              busy_rg;

  -- Next-state logic. The decrements must start from the REGISTERED values:
  -- the previous code used i_nxt-1 / idx_nxt-1 / pos_nxt-1, i.e. each signal
  -- was defined in terms of itself, which is a zero-delay combinational loop.
  i_nxt <= I_LAST when (ena_idx='1') or (do_init_str='1') else
           i_rg-1 when (busy_rg='1') or (do_init_nxt='1') else
           i_rg;

  idx_nxt <= unsigned(idx) when ena_idx='1' else
             IDX_LAST      when do_init_str='1' else
             idx_rg-1      when (do_init_nxt='1') and (i_done='1') else
             idx_rg;

  -- pos is the slowest counter: it steps only when both faster counters
  -- wrap. Without the i_done term pos stepped on every init step while
  -- idx_rg was zero, so part of the address space was never visited.
  -- NOTE(review): confirm this nested-counter order is the intended init sweep.
  pos_nxt <= unsigned(pos) when ena_idx='1' else
             POS_LAST      when do_init_str='1' else
             pos_rg-1      when (do_init_nxt='1') and (idx_done='1') and (i_done='1') else
             pos_rg;

  reg : process (clk, rstn) is
  begin  -- process reg
    if rstn = '0' then                  -- asynchronous reset (active low)
      idx_rg  <= IDX_LAST;
      pos_rg  <= POS_LAST;
      i_rg    <= I_LAST;
      busy_rg <= '0';
    elsif clk'event and clk = '1' then  -- rising clock edge
      idx_rg  <= idx_nxt;
      pos_rg  <= pos_nxt;
      i_rg    <= i_nxt;
      busy_rg <= busy_nxt;
    end if;
  end process reg;

  mem_addr_nxt <= pos_nxt & idx_nxt & i_nxt;
  mem_wr       <= do_init_str or do_init_nxt;
  mem_rd       <= '1';

  -- Implementation of a synchronous single port memory.
  -- mem_addr is the registered copy of mem_addr_nxt; the write and the read
  -- of a given edge use the value registered at the previous edge.
  mem : process (clk) is
    constant mem_size : natural := 2**(mem_addr'length);
    type mem_ty is array (0 to mem_size-1) of real;
    variable w_mem : mem_ty;
  begin  -- process mem
    if clk'event and clk = '1' then     -- rising clock edge
      mem_addr <= mem_addr_nxt;
      if mem_wr='1' then
        w_mem(to_integer(mem_addr)) := w_init;
      end if;
      if mem_rd='1' then
        w <= w_mem(to_integer(mem_addr));
      end if;
    end if;
  end process mem;

end architecture rtl;

51
hw/do_sim/ex.py Normal file
View file

@ -0,0 +1,51 @@
import numpy as np
# Reference model for the H update: three update steps, computed both in the
# normalized form (h) and in the un-normalized form (hp).

h = np.array([0.1, 0.2, 0.3, 0.0, 0.01, 0.01, 0.1, 0.28])
w0 = [0.3, 0.0, 0.01, 0.01, 0.1, 0.28, 0.1, 0.2]
w1 = [0.01, 0.01, 0.1, 0.28, 0.1, 0.2, 0.3, 0.0]
w2 = [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]
w = np.array([w0, w1, w2])
eps = 0.2


def _step_normalized(h_prev, w_row):
    """One normalized update: blend h with hw/sum(hw), then rescale by 1+eps."""
    hw = h_prev * w_row
    return (h_prev + eps*hw/np.sum(hw)) / (1+eps)


def _step_unnormalized(hp_prev, w_row):
    """One un-normalized update, built from products and sums only."""
    hpw = hp_prev * w_row
    return (np.sum(hpw) * hp_prev + eps * np.sum(hp_prev) * hpw)


# Normalized sequence h0..h3
_h_seq = [h]
for _w_row in w:
    _h_seq.append(_step_normalized(_h_seq[-1], _w_row))
h0, h1, h2, h3 = _h_seq

# Un-normalized sequence hp0..hp3
_hp_seq = [h0]
for _w_row in w:
    _hp_seq.append(_step_unnormalized(_hp_seq[-1], _w_row))
hp0, hp1, hp2, hp3 = _hp_seq

# Show that hp are just multiple of h
for _hp, _h in ((hp1, h1), (hp2, h2), (hp3, h3)):
    print(_hp/_h)

# This should be the output of HW model
print(hp0, hp1, hp2, hp3)

55
hw/do_sim/ex_ufp.py Normal file
View file

@ -0,0 +1,55 @@
from math import log2, ceil, floor
# Parameters of the unsigned floating-point (ufp) format exercised below.
BW = 5           # total bit width
BE = 3           # bits used by the exponent
EO = 1           # exponent offset
BM = BW-BE       # bits used by the mantissa
BO = BM-EO # BW-BE-EO
EXP_MAX = 2**BE-1
# Smallest non-zero value: mantissa 1 with the largest exponent field.
# Computed inline with the same formula as ufp_to_r(EXP_MAX, 1, BW, BE, EO):
# that function is only defined further down, so calling it here raised a
# NameError as soon as the script was run.
R_MIN = 1 * 2**(-BO-EXP_MAX)
def r_to_ufp(r, BW, BE, EO):
    """Encode the real value r as an unsigned floating-point pair (ee, mm).

    The encoded value is mm * 2**(-BO - ee) with BO = BW - BE - EO.

    r  -- value to encode
    BW -- total bit width
    BE -- number of exponent bits
    EO -- exponent offset

    Returns (ee, mm) with 0 <= ee <= 2**BE - 1 and 0 <= mm <= 2**(BW-BE) - 1.
    Values below the smallest non-zero code give (2**BE - 1, 0); values above
    the largest code saturate to (0, 2**(BW-BE) - 1).
    """
    BM = BW-BE
    BO = BM-EO # BW-BE-EO
    EXP_MAX = 2**BE-1
    MM_MAX = 2**BM-1
    # Smallest non-zero code (mantissa 1, largest exponent field); same
    # formula as ufp_to_r(EXP_MAX, 1, BW, BE, EO).
    R_MIN = 1 * 2**(-BO-EXP_MAX)
    if r<R_MIN: # Small values
        return EXP_MAX, 0
    # Scaling that brings r into the mantissa range, limited to what the
    # exponent field can express.
    aa = floor(BM - log2(r))
    aa = max(min(aa, EXP_MAX+BO), BO)
    mm = round(r*2**aa)
    # r*2**aa can round up to 2**BM, one more than the mantissa can hold
    # (e.g. r = 1.0 with BM = 2 gave mm = 4). Use the next coarser scaling
    # when one is available.
    if mm > MM_MAX and aa > BO:
        aa -= 1
        mm = round(r*2**aa)
    # Still too large: r is above the representable range, so saturate.
    mm = min(mm, MM_MAX)
    ee = aa - BO
    return ee, mm
def ufp_to_r(ee, mm, BW, BE, EO):
    """Decode the unsigned floating-point pair (ee, mm) into a number.

    The result is mm * 2**(-BO - ee), where BO = BW - BE - EO.
    """
    scale_exp = -(BW - BE - EO) - ee
    return mm * 2**scale_exp
# Round-trip every code: decode (ee, mm), encode the value again and decode
# the new code, printing both so that they can be compared by eye.
_LINE = "mm={} ee={} r={} "
_SEPARATOR = "---------------------------------------------------"

for mm in range(2**BM):
    for ee in range(2**BE):
        r_exp = ufp_to_r(ee, mm, BW, BE, EO)
        ee_exp, mm_exp = r_to_ufp(r_exp, BW, BE, EO)
        r_exp2 = ufp_to_r(ee_exp, mm_exp, BW, BE, EO)
        #print("r={} r_exp={} mm={} ee={}".format(r, r_exp, mm_exp, ee_exp))
        print(_LINE.format(mm, ee, r_exp))
        print(_LINE.format(mm_exp, ee_exp, r_exp2))
        print(_SEPARATOR)

# Single value kept for interactive inspection (nothing is printed)
r = 1.5625e-2
#r=3.1249999999999997e-2
ee, mm = r_to_ufp(r, BW, BE, EO)
r2 = ufp_to_r(ee, mm, BW, BE, EO)

View file

@ -0,0 +1,11 @@
# Synthesis set-up script: loads the procedure libraries used by the flow.
# Load library of functions...
# NOTE(review): both paths are relative to the user's home directory, so the
# script only works where that SVN checkout exists.
source ~/SVN/ids_setup/flow/flow_lib/lib_synth/flow_lib_synth.tcl
source ~/SVN/ids_setup/flow/flow_lib/lib_synth/flow_tech_lib_synth.tcl
# Load local library (path is relative to the directory the tool is started in)
source cmd/lib_synth.tcl
# Not executed to allow the user to change the defaults
# flow_setup_def
# flow_set_tech tcbn65lptc

View file

@ -0,0 +1,8 @@
# Synthesis run script: launches one synthesis run.
# Arguments as passed below: unit name, a number (presumably the clock
# period -- confirm against the called procedure), technology library,
# clock port, reset port.
# NOTE(review): the local synthesis library in this commit defines
# "do_synth", not "do_synth_def" -- confirm that do_synth_def is provided by
# one of the sourced flow libraries.
# NOTE(review): the hu_dp entities in this commit name their reset port
# "rstn" (and mua uses "arstn"), not "rst" -- confirm the reset argument.
# Load local library
# source cmd/lib_synth.tcl
#do_synth_def mua 10 tcbn65lptc clk rst
#do_synth_def mua 1 tcbn40lptc clk rst
do_synth_def hu_dp 0.6 tcbn40lptc clk rst

View file

@ -0,0 +1,31 @@
# the local library for synthesis
# do_synth -- run one synthesis of a unit.
#
#   UNIT_NAME  top-level design to analyze and elaborate (searched in ../rtl)
#   T          value passed to flow_def_clock; T/8 is passed twice to
#              flow_def_timing
#   TECH       technology library passed to flow_set_tech
#   clk, rst   names passed to flow_def_clock / flow_def_rst
#
# Reports and netlist are written with a prefix built from UNIT_NAME, T and
# TECH. The flow_* commands come from the externally sourced flow library.
proc do_synth { UNIT_NAME { T } { TECH tcbn65lptc } {clk clk} {rst rst} } {
    # Set-up the environment and the technology
    flow_setup_def
    flow_set_tech $TECH

    # Analyze and elaborate automatically. Params can be added using
    # -param width=>32,ports=>8. Also possible to use a file
    analyze -library work -autoread -recursive ../rtl -top $UNIT_NAME
    elaborate -library work $UNIT_NAME
    link
    check_design

    # Set constraints
    flow_def_rst $rst
    flow_def_clock $T $clk
    # One eighth of T for both timing arguments. Dividing by 8.0 forces
    # floating-point division: with an integer T the previous "$T/8" was an
    # integer division (T=1 gave 0, T=10 gave 1). The expression is braced so
    # that it is parsed only once.
    set io_delay [expr {$T / 8.0}]
    flow_def_timing $io_delay $io_delay
    check_timing

    # Synthesize
    compile_ultra ;# Run the synthesize
    #change_names -rules vhdl -hier -verbose -log_changes ./log/change_names.log

    # Write reports
    #set prefix ${UNIT_NAME}
    set prefix ${UNIT_NAME}_T=${T}_TECH=${TECH} ;# Define prefix to identify reports.
    flow_report_all $prefix ;# Write reports
    flow_write_netlist $prefix ;# Write results
}

5
hw/do_synth/source.csh Normal file
View file

@ -0,0 +1,5 @@
# Environment for the Synopsys synthesis tools (csh syntax; meant to be
# sourced into the current shell so that the variables persist).
# Older site-wide set-up scripts, kept for reference:
#source /eda/synopsys/synopsys_lic_init_2015-2016.csh
#source /eda/synopsys/2015-16/scripts/SYN_2015.06-SP4_RHELx86.csh
# License server in port@host form
setenv SNPSLMD_LICENSE_FILE "28231@item0096"
# Put the R-2020.09-SP4 synthesis binaries first in the search path
setenv PATH "/usrf01/prog/synopsys/syn/R-2020.09-SP4/bin:${PATH}"

135
hw/rtl/hu_dp.vhd Normal file
View file

@ -0,0 +1,135 @@
-- hu_dp
-- Data path for Update H using stream of weights
-- Trivial fix point implementation
library ieee;
use ieee.std_logic_1164.all;
use work.pkg_sbs.all;

-- hu_dp: fixed-point data path of the H-update unit.
-- All data ports are B-bit words.
entity hu_dp is
  generic (
    K : natural := 3;    -- additional bits for sum
    B : natural := 10    -- bitwidth of input
  );
  port (
    clk    : in  std_logic;
    rstn   : in  std_logic;
    ctr_hu : in  std_logic_vector(BW_HU_CTR-1 downto 0);   -- control bus from the control path
    loc_h  : out std_logic_vector(ADDR_H_MAX-1 downto 0);  -- current location in H
    eps    : in  std_logic_vector(B-1 downto 0);
    wi     : in  std_logic_vector(B-1 downto 0);           -- incoming stream of weights
    hi     : in  std_logic_vector(B-1 downto 0);           -- incoming stream of state values
    ho     : out std_logic_vector(B-1 downto 0)            -- outgoing stream of state values
  );
end entity hu_dp;
library ieee;
use ieee.numeric_std.all;
-- Fixed-point data path of the H update (unsigned arithmetic).
--
-- Same structure as the real-valued model: per element
--   hp_new = hp_p * sum_hw_p + sum_hp_p * hw_p
-- Products are 2*B bits wide; wherever a product is stored or accumulated
-- only its upper B bits (2*B-1 downto B) are kept.
-- Nothing in this architecture detects or saturates overflow of the sums.
-- The generic K is not referenced in this architecture.
architecture rtl of hu_dp is
-- Memory
subtype word is std_logic_vector(B-1 downto 0);
type array_as_h_w is array (N_H_MAX-1 downto 0) of word;
signal mem_hp : array_as_h_w; -- State (internal): stores h_eff
signal mem_hw : array_as_h_w; -- Copy of w*h: stores the upper half of hw_nxt
signal addr_wr, addr_nxt : std_logic_vector(ADDR_H_MAX-1 downto 0); -- Write address (registered) and its next value (used for reads)
-- Data path for hp (i.e. h un-normalized) and hw (hp*w)
signal hp_new, hw_nxt : unsigned(2*B-1 downto 0); -- Full-width results
signal hp_new_rg, hp_p, h_eff : std_logic_vector(B-1 downto 0);
signal hw_p : std_logic_vector(B-1 downto 0);
-- Accumulators for normalization
signal sum_hw, sum_hw_nxt : std_logic_vector(B-1 downto 0); -- Running sum hw
signal sum_hw_p, sum_hw_p_nxt : std_logic_vector(B-1 downto 0); -- Saved sum hw of previous vector
signal sum_hp, sum_hp_nxt : std_logic_vector(B-1 downto 0); -- Running sum hp
signal sum_hp_p, sum_hp_p_nxt : std_logic_vector(B-1 downto 0); -- Saved hp term of previous vector: loaded with eps, then with the upper half of hp_new (normalization)
-- Control signals
signal ctr_sel_ini, ctr_sum_ini, ctr_update_sum, ctr_update_sum2 : std_logic;
signal ctr_addr_rst, ctr_addr_inc, ctr_write_hw : std_logic;
signal ctr_wr_hw, ctr_wr_hp : std_logic;
begin -- architecture rtl
-- Get control signals (unpack the control bus)
ctr_sel_ini <= ctr_hu(0);
ctr_sum_ini <= ctr_hu(1);
ctr_update_sum <= ctr_hu(2);
ctr_addr_rst <= ctr_hu(3);
ctr_addr_inc <= ctr_hu(4);
ctr_write_hw <= ctr_hu(5); -- unpacked but not used in this architecture
ctr_wr_hw <= ctr_hu(6);
ctr_wr_hp <= ctr_hu(6); -- same bit as ctr_wr_hw: both memories are written together
ctr_update_sum2 <= ctr_hu(7);
-- Main calculation (two B x B products added at 2*B bits)
hp_new <= unsigned(hp_p) * unsigned(sum_hw_p) + unsigned(sum_hp_p) * unsigned(hw_p) ;
-- Mux to select first h (from the input) or the saved (registered) one
h_eff <= hi when ctr_sel_ini='1' else hp_new_rg;
-- Calculate hw
hw_nxt <= unsigned(h_eff) * unsigned(wi) ;
-- Output h (note latency of a complete group)
ho <= h_eff;
-- Accumulate hw and hp; ctr_sum_ini restarts both running sums
sum_hw_nxt <= std_logic_vector(hw_nxt(2*B-1 downto B)) when ctr_sum_ini='1' else std_logic_vector(unsigned(sum_hw) + hw_nxt(2*B-1 downto B));
sum_hw_p_nxt <= (others=>'0') when ctr_update_sum='1' else
sum_hw when ctr_update_sum2='1' else sum_hw_p;
sum_hp_nxt <= h_eff when ctr_sum_ini='1' else std_logic_vector(unsigned(sum_hp) + unsigned(h_eff)); -- Accumulate h
--sum_hp_p_nxt <= sum_hp_nxt * eps when ctr_update_sum='1' else sum_hp_p;
sum_hp_p_nxt <= eps when ctr_update_sum='1' else
std_logic_vector(hp_new(2*B-1 downto B)) when ctr_update_sum2='1' else sum_hp_p;
-- Read from memory (combinational read at the next address)
--hw_p <= mem_hw(to_integer(unsigned(addr_nxt)));
hw_p <= mem_hw(to_integer(unsigned(addr_nxt)))
when ctr_update_sum2='0' else sum_hp; -- Put sum_hp in mult
hp_p <= mem_hp(to_integer(unsigned(addr_nxt)));
-- Address calculation: reset has priority over increment
addr_nxt <= (others => '0') when ctr_addr_rst='1' else
std_logic_vector(unsigned(addr_wr) + 1) when ctr_addr_inc='1' else addr_wr;
loc_h <= addr_wr; -- Output for ctrl path
-- Registers (asynchronous, active-low reset)
rg: process (clk, rstn) is
begin -- process rg
if rstn = '0' then
hp_new_rg <= (others=>'0');
sum_hw <= (others=>'0');
sum_hw_p <= (others=>'0');
sum_hp <= (others=>'0');
sum_hp_p <= (others=>'0');
addr_wr <= (others => '0');
elsif clk'event and clk = '1' then
hp_new_rg <= std_logic_vector(hp_new(2*B-1 downto B));
sum_hw <= sum_hw_nxt;
sum_hw_p <= sum_hw_p_nxt;
sum_hp <= sum_hp_nxt;
sum_hp_p <= sum_hp_p_nxt;
addr_wr <= addr_nxt;
end if;
end process rg;
-- Memory: synchronous write at addr_wr, no reset
mem: process (clk) is
begin -- process mem
if clk'event and clk = '1' then -- rising clock edge
if ctr_wr_hw='1' then
mem_hw(to_integer(unsigned(addr_wr))) <= std_logic_vector(hw_nxt(2*B-1 downto B));
end if;
if ctr_wr_hp='1' then
mem_hp(to_integer(unsigned(addr_wr))) <= h_eff;
end if;
end if;
end process mem;
end architecture rtl;

52
hw/rtl/mua.vhd Normal file
View file

@ -0,0 +1,52 @@
-- Simple multiply with adder to check speed
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

-- mua: multiply-and-add unit used to check the achievable speed.
entity mua is
  generic (
    B : natural := 10                               -- bitwidth of the multiplier inputs
  );
  port (
    clk    : in  std_logic;
    arstn  : in  std_logic;
    dt_mv  : in  std_logic_vector(B-1 downto 0);    -- first multiplication input
    dt_mc  : in  std_logic_vector(B-1 downto 0);    -- second multiplication input
    dt_add : in  std_logic_vector(2*B-1 downto 0);  -- value to add to the product
    dt_mua : out std_logic_vector(2*B-1 downto 0)   -- result
  );
end entity mua;
library ieee;
use ieee.numeric_std.all;

-- Registered inputs, combinational multiply-add, registered output:
-- the multiply-add sits between two register stages.
architecture rtl of mua is
  -- Input registers
  signal dt_mv_rg   : unsigned(B-1 downto 0);
  signal dt_mc_rg   : unsigned(B-1 downto 0);
  signal dt_add_rg  : unsigned(2*B-1 downto 0);
  -- Output register and its next value
  signal dt_mua_rg  : unsigned(2*B-1 downto 0);
  signal dt_mua_nxt : unsigned(2*B-1 downto 0);
begin  -- architecture rtl

  reg : process (clk, arstn) is
  begin  -- process reg
    if arstn = '0' then                 -- asynchronous reset (active low)
      dt_mua_rg <= (others => '0');
      dt_add_rg <= (others => '0');
      dt_mc_rg  <= (others => '0');
      dt_mv_rg  <= (others => '0');
    elsif clk'event and clk = '1' then  -- rising clock edge
      dt_mua_rg <= dt_mua_nxt;
      dt_add_rg <= unsigned(dt_add);
      dt_mc_rg  <= unsigned(dt_mc);
      dt_mv_rg  <= unsigned(dt_mv);
    end if;
  end process reg;

  -- Multiply-add on the registered operands (result kept at 2*B bits)
  dt_mua_nxt <= dt_mv_rg * dt_mc_rg + dt_add_rg;

  dt_mua <= std_logic_vector(dt_mua_rg);

end architecture rtl;

View file

@ -0,0 +1,65 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

-- mult_unsgn_pp_trunc: unsigned multiplier built from partial products,
-- with the low-order partial-product columns removed (truncation).
entity mult_unsgn_pp_trunc is
  generic (
    BWa : natural := 16;   -- bit width of operand da
    BWb : natural := 16;   -- bit width of operand db
    K   : natural := 15    -- vertical truncation: columns below K are dropped
  );
  port (
    da   : in  std_logic_vector(BWa-1 downto 0);
    db   : in  std_logic_vector(BWb-1 downto 0);
    dout : out std_logic_vector(BWa+BWb-1 downto 0)
  );
end mult_unsgn_pp_trunc;
-- One partial-product row per bit of da. Row i holds da(i) and db(j) at
-- column i+j; only columns K and above are generated. The rows are then
-- summed one after the other.
architecture str of mult_unsgn_pp_trunc is
  type stlv_array is array (0 to BWa-1) of std_logic_vector(BWa+BWb-2 downto 0);
  signal pp : stlv_array;
  --signal pp_res : std_logic_vector(BWa+BWb-1 downto 0);
begin  -- str

  -- Partial-product generation, e.g. for 4 x 4 bits:
  --   da(0)db(3) da(0)db(2) da(0)db(1) da(0)db(0)
  --   da(1)db(3) da(1)db(2) da(1)db(1) da(1)db(0)
  --   da(2)db(3) da(2)db(2) da(2)db(1) da(2)db(0)
  --   da(3)db(3) da(3)db(2) da(3)db(1) da(3)db(0)
  ppGen : process (da, db)
    variable rows : stlv_array;
  begin
    rows := (others => (others => '0'));
    for ia in 0 to BWa-1 loop
      for ib in 0 to BWb-1 loop
        -- keep only the columns at or above the truncation point
        if ia+ib >= K then
          rows(ia)(ia+ib) := da(ia) and db(ib);
        end if;
      end loop;
    end loop;
    pp <= rows;
  end process ppGen;

  -- Sum of all rows; the first row initialises the accumulator
  CSA_tree : process (pp)
    variable acc : std_logic_vector(BWa+BWb-1 downto 0);
  begin  -- process CSA_tree
    for row in 0 to BWa-1 loop
      if row /= 0 then
        acc := std_logic_vector(unsigned(acc) + unsigned('0' & pp(row)(BWa+BWb-2 downto 0)));
      else
        acc := '0' & pp(0)(BWa+BWb-2 downto 0);
      end if;
    end loop;  -- row
    --pp_res <= acc;
    dout <= acc;
  end process CSA_tree;

  -- dout <= pp_res;
  --dout(BWa+BWb-1 downto 16) <= pp_res(BWa+BWb-1 downto 16);
  --dout(15 downto 0) <= (others => '0');
end str;

View file

@ -0,0 +1,86 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

-- two_mult_unsgn_pp_trunc: sum of two unsigned products (da*db + dc*dd)
-- built from partial products, with the low-order columns removed.
entity two_mult_unsgn_pp_trunc is
  generic (
    BWa : natural := 16;   -- bit width of operands da and dc
    BWb : natural := 16;   -- bit width of operands db and dd
    K   : natural := 15    -- vertical truncation: columns below K are dropped
  );
  port (
    da   : in  std_logic_vector(BWa-1 downto 0);
    db   : in  std_logic_vector(BWb-1 downto 0);
    dc   : in  std_logic_vector(BWa-1 downto 0);
    dd   : in  std_logic_vector(BWb-1 downto 0);
    dout : out std_logic_vector(BWa+BWb downto 0)   -- one bit wider than a single product
  );
end two_mult_unsgn_pp_trunc;
-- The partial-product array has 2*BWa rows: rows 0 .. BWa-1 belong to
-- da*db and rows BWa .. 2*BWa-1 to dc*dd. Each product has its own
-- generation process driving its own half of the array; all rows are then
-- summed one after the other.
architecture str of two_mult_unsgn_pp_trunc is
  type stlv_array is array (0 to 2*BWa-1) of std_logic_vector(BWa+BWb-2 downto 0);
  signal pp : stlv_array;
  --signal pp_res : std_logic_vector(BWa+BWb downto 0);
begin  -- str

  -- Partial products of da*db, e.g. for 4 x 4 bits:
  --   da(0)db(3) da(0)db(2) da(0)db(1) da(0)db(0)
  --   da(1)db(3) da(1)db(2) da(1)db(1) da(1)db(0)
  --   da(2)db(3) da(2)db(2) da(2)db(1) da(2)db(0)
  --   da(3)db(3) da(3)db(2) da(3)db(1) da(3)db(0)
  ppGen1 : process (da, db)
    variable rows : stlv_array;
  begin
    rows := (others => (others => '0'));
    for ia in 0 to BWa-1 loop
      for ib in 0 to BWb-1 loop
        -- keep only the columns at or above the truncation point
        if ia+ib >= K then
          rows(ia)(ia+ib) := da(ia) and db(ib);
        end if;
      end loop;
    end loop;
    pp(0 to BWa-1) <= rows(0 to BWa-1);
  end process ppGen1;

  -- Partial products of dc*dd (same layout as above), stored in the upper
  -- half of the array
  ppGen2 : process (dc, dd)
    variable rows : stlv_array;
  begin
    rows := (others => (others => '0'));
    for ic in 0 to BWa-1 loop
      for id in 0 to BWb-1 loop
        if ic+id >= K then
          rows(ic)(ic+id) := dc(ic) and dd(id);
        end if;
      end loop;
    end loop;
    pp(BWa to 2*BWa-1) <= rows(0 to BWa-1);
  end process ppGen2;

  -- Sum of all rows of both products; the first row initialises the
  -- accumulator, which is two bits wider than a row
  CSA_tree : process (pp)
    variable acc : std_logic_vector(BWa+BWb downto 0);
  begin  -- process CSA_tree
    for row in 0 to 2*BWa-1 loop
      if row /= 0 then
        acc := std_logic_vector(unsigned(acc) + unsigned(pp(row)(BWa+BWb-2 downto 0)));
      else
        acc := "00" & pp(0)(BWa+BWb-2 downto 0);
      end if;
    end loop;  -- row
    --pp_res <= acc;
    dout <= acc;
  end process CSA_tree;

  -- dout <= pp_res;
  --dout(BWa+BWb downto 16) <= pp_res(BWa+BWb-1 downto 16);
  --dout(15 downto 0) <= (others => '0');
end str;

29
hw/sbs_hw.org Normal file
View file

@ -0,0 +1,29 @@
#+TITLE: sbs_hw.org
* Idea
Start with HW implementation.
* Modules
hu_ : h update
wg_ : weight generator
sg_ : spikes generator
* HU
This block updates H according to the SbS equations.
- First version done.
- Need to decide if reciprocal or multiplication
* WG
This block generates a stream of weights given a stream of spikes
- Instead of a stream of spikes scanned per region, it could be better to receive a spike identifier and
then all the edges (from location to location) that use this particular spike.
* New strategy for read
When doing a convolution, we can read