Add initial code from old version of SbS core

2024-10-20 19:03:44 +02:00 · 2024-10-20 19:03:44 +02:00 · a365b9b01d
commit a365b9b01d
parent 768d16789e
20 changed files with 1332 additions and 0 deletions
--- a/hw/beh/hu.vhd
+++ b/hw/beh/hu.vhd
@ -0,0 +1,57 @@
+-- hu
+-- Update H using stream of weights
+
+use work.pkg_sbs.all;
+
+-- Top level of the H-update unit. Its architecture instantiates the
+-- control path (hu_ctr) and the data path (hu_dp).
+entity hu is
+  
+  port (
+    clk, rstn : in  bit;
+    cfg_hu     : in bit_vector(BW_HU_CFG -1 downto 0); -- Config
+    ena_w      : in bit;         -- New weight
+    is_ini     : in bit;         -- First vector (get w and h when ena)
+    is_fst     : in bit;         -- First component in vector
+    ena_ho     : out bit;        -- Signal a valid ho value
+    wi         : in  real;       -- stream of weights
+    hi         : in  real;       -- stream of state
+    ho         : out real);      -- stream of states
+
+end entity hu;
+
+
+
+-- Structural architecture: only wiring, no logic of its own.
+architecture rtlf of hu is
+
+  -- Internal links between control path and data path
+  signal ctr_hu    : bit_vector(BW_HU_CTR-1 downto 0);  -- control word, hu_ctr -> hu_dp
+  signal loc_h     : bit_vector(ADDR_H_MAX-1 downto 0); -- current H address, hu_dp -> hu_ctr
+  signal eps       : real;                              -- epsilon, hu_ctr -> hu_dp
+
+begin  -- architecture rtlf
+
+  -- Data path: consumes wi/hi, produces ho.
+  -- NOTE(review): the commit contains two hu_dp entities (hw/beh with bit/real
+  -- ports, hw/rtl with std_logic ports); these associations only match the
+  -- bit/real one -- confirm which is compiled into work.
+  i_hu_dp: entity work.hu_dp
+    port map (
+      clk    => clk,
+      rstn   => rstn,
+      eps    => eps,
+      ctr_hu => ctr_hu,
+      loc_h  => loc_h,
+      wi     => wi,
+      hi     => hi,
+      ho     => ho);
+
+  
+  -- Control path: derives ctr_hu, eps and ena_ho from the stream flags
+  i_hu_ctr: entity work.hu_ctr
+    port map (
+      clk    => clk,
+      rstn   => rstn,
+      eps    => eps,      
+      cfg_hu => cfg_hu,
+      loc_h  => loc_h,
+      ena_w  => ena_w,
+      is_ini => is_ini,
+      is_fst => is_fst,
+      ena_ho => ena_ho,
+      ctr_hu => ctr_hu);
+  
+
+end architecture rtlf;
--- a/hw/beh/hu_ctr.vhd
+++ b/hw/beh/hu_ctr.vhd
@ -0,0 +1,72 @@
+-- hu_ctr
+-- Control path for Update H using stream of weights
+
+use work.pkg_sbs.all;
+
+-- Interface of the control path: stream flags in, control word (ctr_hu),
+-- epsilon and output-valid flag out.
+entity hu_ctr is
+  port (
+    clk, rstn  : in  bit;
+    cfg_hu     : in bit_vector(BW_HU_CFG -1 downto 0); -- Config
+    ena_w      : in bit;         -- New weight
+    is_ini     : in bit;         -- First vector (get w and h when ena)
+    is_fst     : in bit;         -- First component in vector
+    loc_h      : in bit_vector(ADDR_H_MAX-1 downto 0); -- Current location in H
+    ena_ho     : out bit;        -- Signal a valid ho value
+    eps        : out real;       -- Epsilon forwarded to the data path
+    ctr_hu     : out bit_vector(BW_HU_CTR-1 downto 0));   -- Control for data path
+
+end entity hu_ctr;
+
+library ieee;
+use ieee.numeric_bit.all;
+
+-- Behavioural control path for the H update.
+-- Derives the individual control lines from ena_w / is_ini / is_fst and packs
+-- them into ctr_hu (bits 0..7; the remaining bits of ctr_hu are not driven).
+architecture beh of hu_ctr is
+
+  signal ctr_sel_ini, ctr_sum_ini, ctr_update_sum, ctr_update_sum2 : bit;
+  signal ctr_addr_rst,  ctr_addr_inc,  ctr_write_hw : bit;
+  signal ctr_wr_hw, ctr_wr_hp : bit;
+
+  -- Index of the last element in H (currently fixed).
+  -- The element count itself (8) does not fit into ADDR_H_MAX = 3 bits and
+  -- would be truncated to 0 by to_unsigned, so the last valid index is used.
+  constant MAX_LOC_H  : bit_vector(ADDR_H_MAX-1 downto 0) := bit_vector(to_unsigned(N_H_MAX-1, ADDR_H_MAX));
+
+  -- Clock period assumed by the behavioural delay on ctr_update_sum below.
+  -- It must be declared here: it is used in this architecture and is not
+  -- provided by pkg_sbs. Keep it equal to the period used by the testbench.
+  constant T : time := 10 ns;
+  
+begin  -- architecture beh
+
+  eps <= 0.2;
+
+  -- Pack control lines into the control word
+  ctr_hu(0) <= ctr_sel_ini    ;
+  ctr_hu(1) <= ctr_sum_ini    ;
+  ctr_hu(2) <= ctr_update_sum ;
+  ctr_hu(3) <= ctr_addr_rst   ;
+  ctr_hu(4) <= ctr_addr_inc   ;
+  ctr_hu(5) <= ctr_write_hw   ;  -- ctr_write_hw has no driver here, so it stays '0'
+  ctr_hu(6) <= ctr_wr_hw      ;
+  --ctr_hu(7) <= ctr_wr_hp      ; -- ctr_wr_hp and ctr_wr_hw are the same
+  ctr_hu(7) <=  ctr_update_sum2  ;
+
+  -- Code in first approximation
+  ctr_sel_ini <= is_ini; 
+  ctr_wr_hp <= ena_w; --is_ini;   
+  ctr_wr_hw <= ena_w;
+  ctr_sum_ini <= is_fst; 
+  -- Behavioural shortcut: the update pulse is is_fst delayed by 7 clock periods
+  ctr_update_sum <= transport is_fst  after 7*T ;
+  --ctr_update_sum2 <= transport ctr_update_sum after T;
+  --ctr_update_sum <= '1' when (loc_h = MAX_LOC_H) else '0';
+  
+  ctr_addr_rst <= ctr_update_sum; 
+  ctr_addr_inc <= ena_w and not ctr_addr_rst;
+
+  -- Output is valid for every weight except while the initial vector is loaded
+  ena_ho <= ena_w and not is_ini;
+
+  -- ctr_update_sum2 is ctr_update_sum registered once (one cycle later)
+  rg: process (clk, rstn) is
+  begin 
+    if rstn = '0' then                 -- asynchronous reset (active low)
+      ctr_update_sum2 <= '0';
+    elsif clk'event and clk = '1' then  -- rising clock edge
+      ctr_update_sum2 <= ctr_update_sum;
+    end if;
+  end process rg;
+  
+  
+end architecture beh;
--- a/hw/beh/hu_dp.vhd
+++ b/hw/beh/hu_dp.vhd
@ -0,0 +1,124 @@
+-- hu_dp
+-- Data path for Update H using stream of weights
+
+use work.pkg_sbs.all;
+
+-- Interface of the behavioural (real-valued) data path of the H update.
+entity hu_dp is
+  port (
+    clk, rstn : in  bit;
+    ctr_hu     : in bit_vector(BW_HU_CTR-1 downto 0);   -- Control for data path
+    loc_h      : out bit_vector(ADDR_H_MAX-1 downto 0); -- Current location in H
+    eps        : in  real;       -- Epsilon used in the normalization path
+    wi         : in  real;       -- stream of weights
+    hi         : in  real;       -- stream of state
+    ho         : out real);      -- stream of states
+
+end entity hu_dp;
+
+library ieee;
+use ieee.numeric_bit.all;
+
+-- Real-valued data path of the H update.
+-- Keeps two small memories (un-normalized state hp and product hw = hp*w),
+-- running sums of both, and computes the next un-normalized state as
+--   hp_new = hp_p * sum_hw_p + sum_hp_p * hw_p
+architecture rtlf of hu_dp is
+  -- Memory  
+  signal mem_hp : array_as_h;   -- State (internal)
+  signal mem_hw : array_as_h;   -- Copy of w*h
+  signal addr_wr,  addr_nxt : bit_vector(ADDR_H_MAX-1 downto 0); -- Address
+
+  -- Data path for hp (i.e. h un-normalized) and hw (hp*w)
+  signal hp_new, hp_new_rg, hp_p, h_eff : real := 0.0;    
+  signal hw_p, hw_nxt : real := 0.0; 
+  
+  -- Accumulators for normalization
+  signal sum_hw, sum_hw_nxt : real := 0.0;      -- Running sum hw
+  signal sum_hw_p, sum_hw_p_nxt : real := 0.0;  -- Saved sum hw of previous     
+  signal sum_hp, sum_hp_nxt : real := 0.0;      -- Running sum hp 
+  signal sum_hp_p, sum_hp_p_nxt  : real := 0.0; -- Saved factor for the hw_p term (eps, then hp_new)
+
+  -- Control signals
+  signal ctr_sel_ini, ctr_sum_ini, ctr_update_sum, ctr_update_sum2 : bit;
+  signal ctr_addr_rst,  ctr_addr_inc,  ctr_write_hw : bit;
+  signal ctr_wr_hw, ctr_wr_hp : bit;
+  
+begin  -- architecture rtlf
+
+  -- Get control signals (bit layout must match the packing in hu_ctr)
+  ctr_sel_ini <= ctr_hu(0);
+  ctr_sum_ini <= ctr_hu(1);
+  ctr_update_sum <= ctr_hu(2);
+  ctr_addr_rst <= ctr_hu(3);  
+  ctr_addr_inc <= ctr_hu(4);
+  ctr_write_hw <= ctr_hu(5);   -- unpacked but not used below
+  ctr_wr_hw <= ctr_hu(6);
+  ctr_wr_hp <= ctr_hu(6);      -- shares bit 6 with ctr_wr_hw
+  ctr_update_sum2 <= ctr_hu(7); 
+    
+  
+  -- Main calculation 
+  hp_new <= hp_p * sum_hw_p + sum_hp_p * hw_p ; 
+
+  -- Mux to select first h or saved one
+  h_eff <= hi when ctr_sel_ini='1' else hp_new_rg;  
+
+  -- Calculate hw 
+  hw_nxt <= h_eff * wi ;  
+
+  -- Output h (note latency of a complete group)
+  ho <= h_eff;
+  
+  -- Accumulate hw and hp
+  -- ctr_sum_ini restarts the running sums with the current value
+  sum_hw_nxt   <= hw_nxt when ctr_sum_ini='1' else sum_hw + hw_nxt; 
+  -- Saved hw sum: cleared on ctr_update_sum, loaded on ctr_update_sum2, else held
+  sum_hw_p_nxt <=  0.0  when ctr_update_sum='1' else  
+                   sum_hw when ctr_update_sum2='1' else sum_hw_p;
+
+  sum_hp_nxt   <= h_eff when ctr_sum_ini='1' else sum_hp + h_eff; -- Accumulate h
+  --sum_hp_p_nxt <= sum_hp_nxt * eps when ctr_update_sum='1' else sum_hp_p;
+  -- Loaded with eps on ctr_update_sum, then with hp_new on ctr_update_sum2
+  -- (in that cycle hw_p carries sum_hp, see the read mux below), else held
+  sum_hp_p_nxt <=  eps when ctr_update_sum='1' else  
+                   hp_new when ctr_update_sum2='1' else sum_hp_p;
+  
+
+  -- Read from memory
+  --hw_p <= mem_hw(to_integer(unsigned(addr_nxt)));
+  hw_p <= mem_hw(to_integer(unsigned(addr_nxt)))
+          when ctr_update_sum2='0' else  sum_hp; -- Put sum_hp in mult 
+  hp_p <= mem_hp(to_integer(unsigned(addr_nxt)));
+
+  -- Address calculation: reset has priority over increment
+  addr_nxt <= (others => '0') when ctr_addr_rst='1' else
+              bit_vector(unsigned(addr_wr) + 1) when ctr_addr_inc='1' else addr_wr;
+  loc_h <= addr_wr; -- Output for ctrl path
+    
+  -- Registers
+  rg: process (clk, rstn) is
+  begin  -- process rg
+    if rstn = '0' then         -- asynchronous reset (active low)
+      hp_new_rg <= 0.0;
+      sum_hw <= 0.0;
+      sum_hw_p <= 0.0;
+      sum_hp <= 0.0;
+      sum_hp_p <= 0.0;
+      addr_wr <= (others => '0');
+    elsif clk'event and clk = '1' then  
+      hp_new_rg <= hp_new;
+      sum_hw <= sum_hw_nxt;
+      sum_hw_p <= sum_hw_p_nxt;
+      sum_hp <= sum_hp_nxt;
+      sum_hp_p <= sum_hp_p_nxt;
+      addr_wr <= addr_nxt;
+    end if;
+  end process rg;
+
+
+  -- Memory: synchronous write at addr_wr (reads above are combinational at addr_nxt)
+  mem: process (clk) is
+  begin  -- process mem
+    if clk'event and clk = '1' then  -- rising clock edge
+      if ctr_wr_hw='1' then
+        mem_hw(to_integer(unsigned(addr_wr))) <= hw_nxt;
+      end if;
+      if ctr_wr_hp='1' then
+        mem_hp(to_integer(unsigned(addr_wr))) <= h_eff;
+      end if;      
+    end if;   
+  end process mem;
+
+end architecture rtlf;
--- a/hw/beh/mem_sync.vhd
+++ b/hw/beh/mem_sync.vhd
@ -0,0 +1,49 @@
+-- Implementation of a synchronous single port memory
+
+library ieee;
+use ieee.numeric_bit.all;
+
+
+-- Interface of a clocked single-port memory of reals with 2**BA entries.
+entity mem_sync is
+  generic(
+    BA : natural := 7); -- log2 addresses
+  port(
+    clk : in bit;
+    wr, rd : in bit;                      -- write / read enables
+    addr : in bit_vector(BA-1 downto 0);  -- address (registered inside the architecture)
+    dti : in real;                        -- data in
+    dto : out real);                      -- data out (updated on the clock edge when rd='1')
+end entity mem_sync;
+
+
+library ieee;
+use ieee.numeric_bit.all;
+
+
+architecture beh of mem_sync is
+
+  -- Address sampled on the previous rising edge
+  signal addr_rg : unsigned(BA-1 downto 0);
+
+begin  -- architecture beh
+
+  -- Storage lives in a process variable. On every rising edge the access
+  -- uses the address captured one edge earlier; a write is performed before
+  -- the read, so a simultaneous wr/rd returns the value just written.
+  mem: process (clk) is
+    constant DEPTH : natural := 2**(addr'length);
+    type storage_ty is array (0 to DEPTH-1) of real;
+    variable storage : storage_ty;
+    variable loc : natural;
+  begin  -- process mem
+    if clk'event and clk = '1' then  -- rising clock edge
+      loc := to_integer(addr_rg);    -- still the previously registered address
+      if wr='1' then
+        storage(loc) := dti;
+      end if;
+      if rd='1' then
+        dto <= storage(loc);
+      end if;
+      addr_rg <= unsigned(addr);     -- takes effect for the next edge
+    end if;
+  end process mem;
+
+end architecture beh;
+  
+ -- Local Variables:
+ -- compile-command: "ghdl -a --std=00  --workdir=../do_sim/  mem_sync.vhd"
+ -- End:
--- a/hw/beh/pkg_sbs.vhd
+++ b/hw/beh/pkg_sbs.vhd
@ -0,0 +1,15 @@
+-- Shared constants and types for the hu (H update) units.
+package pkg_sbs is
+
+  constant BW_HU_CTR : natural := 10;    -- bits for control
+  constant BW_HU_CFG : natural := 15;   -- bits for configuration
+
+  constant N_H_MAX : natural := 8;      -- Max size of H 
+  constant ADDR_H_MAX : natural := 3;   -- log2 of N_H_MAX; it is size of addr
+                                        -- bus (must be kept in sync with
+                                        -- N_H_MAX by hand)
+  
+  --subtype hu_ctr is bit_vector 4 downto 0;
+
+  -- Array of reals with max size of H
+  type array_as_h is array (N_H_MAX-1 downto 0) of real;
+  
+end package pkg_sbs;
--- a/hw/beh/pkg_ufp.vhd
+++ b/hw/beh/pkg_ufp.vhd
@ -0,0 +1,99 @@
+-- Library of functions to work with unsigned FP numbers
+library ieee;
+use ieee.std_logic_1164.all;
+
+package pkg_ufp is
+  -- Format 
+  -- [ ee ][ mm ]
+  -- with total BW bits, and BE exponent bits and offset of exponent EO
+  -- b(ee) in EO-[0..2**BE-1]
+  -- b(mm) in [0..2**(BW-BE)-1]/2**(BW-BE)
+  -- i.e. value = mm * 2**(EO - ee - (BW-BE)); a larger ee means a smaller value
+
+  -- Convert a number in unsigned floating point to real
+  function ufp_to_real (
+    ee_mm : std_logic_vector;  -- Data in format exponent_mantissa as bits
+    BW : natural;              -- Bit width
+    BE : natural;              -- Number of bits used for exponent
+    EO : natural)              -- Offset of exponent       
+    return real;
+
+  -- Convert a real number to unsigned floating point 
+  -- (result is exponent bits followed by mantissa bits)
+  function real_to_ufp (
+    r    : real;               -- Real number to convert
+    BW : natural;              -- Bit width
+    BE : natural;              -- Number of bits used for exponent
+    EO : natural)              -- Offset of exponent       
+    return std_logic_vector;
+
+
+  
+end package pkg_ufp;
+
+library ieee;
+use ieee.numeric_std.all;
+use ieee.math_real.all; 
+
+package body pkg_ufp is
+
+  -- Decode an unsigned floating point word into a real:
+  --   value = mm * 2**(EO - ee - (BW-BE))
+  -- where ee are the BE most significant bits and mm the remaining BW-BE bits.
+  -- The argument may have any index range (ascending, descending, offset):
+  -- it is re-indexed to (length-1 downto 0) before the fields are sliced.
+  function ufp_to_real (
+    ee_mm : std_logic_vector;  -- Data in format exponent_mantissa as bits
+    BW : natural;              -- Bit width
+    BE : natural;              -- Number of bits used for exponent
+    EO : natural)              -- Offset of exponent       
+    return real
+  is
+    -- Normalized view, so the slices below do not depend on the caller's range
+    alias bits : std_logic_vector(ee_mm'length-1 downto 0) is ee_mm;
+    variable mm : unsigned(BW-BE-1 downto 0);
+    variable ee : unsigned(BE-1 downto 0);
+    variable d : real;
+  begin
+    ee := unsigned(bits(BW-1 downto BW-BE));
+    mm := unsigned(bits(BW-BE-1 downto 0));
+    d  := real(to_integer(mm)) * 2.0**real(EO-to_integer(ee)-(BW-BE));
+    return d;
+  end function ufp_to_real;
+    
+
+  -- Encode a real as unsigned floating point (exponent bits & mantissa bits).
+  --  * r below the smallest non-zero value (including r <= 0) is flushed to
+  --    zero, encoded as maximum exponent with zero mantissa.
+  --  * Otherwise the scaling aa is chosen so that the mantissa uses as many
+  --    bits as possible, limited to the representable exponent range.
+  --  * If r is too large for the smallest scaling, the mantissa saturates at
+  --    its maximum instead of wrapping around.
+  function real_to_ufp (
+    r    : real;               -- Real number to convert
+    BW : natural;              -- Bit width
+    BE : natural;              -- Number of bits used for exponent
+    EO : natural)              -- Offset of exponent       
+    return std_logic_vector
+  is
+    variable BO, MAX_EXP, BM : integer;
+    variable R_MIN, BM_LIM : real;
+    variable mm : unsigned(BW-BE-1 downto 0);
+    variable ee : unsigned(BE-1 downto 0);
+    variable aa : integer;  -- Scaling to normalize r into ufp representation
+    variable mm_id : integer;
+    variable mm_max : integer;  -- Largest mantissa that fits in BM bits
+    variable scaled : real;     -- Rounded mantissa before range check
+  begin
+    BO := BW-BE-EO;   -- Exponent of Scaling factor
+    BM := BW - BE;    -- Bits for mantissa
+    BM_LIM := log2(2.0**BM-1.0); -- A bit less than BM
+    MAX_EXP := 2**BE-1;           -- Max exponent
+    R_MIN := 2.0**(-MAX_EXP-BO);  -- Min value (not equal zero)
+    mm_max := 2**BM-1;
+    if r<R_MIN then               -- If too small, set to zero
+      ee := (others => '1');
+      mm := (others => '0');
+    else
+      --aa := BM - integer(ceil(log2(r)));
+      aa := integer(floor(BM_LIM - log2(r)));
+      if aa < BO then
+         aa := BO;
+      end if;
+      if aa > MAX_EXP+BO then
+        aa := MAX_EXP+BO;
+      end if;
+      --report "[TST] aa=" & integer'image(aa) severity note;
+      --report "[TST] rr=" & real'image( r * 2.0**aa ) severity note;
+      
+      scaled := round(r * 2.0**aa);
+      --mm_id := integer(floor(r * 2.0**aa));      -- ????
+      -- Compare as real first: this also avoids an integer conversion
+      -- overflow for very large r.
+      if scaled > real(mm_max) then
+        mm_id := mm_max;          -- saturate, r is above the representable range
+      else
+        mm_id := integer(scaled);
+      end if;
+      ee := to_unsigned(aa-BO, ee'length);
+      mm := to_unsigned(mm_id, mm'length);
+    end if;
+    return std_logic_vector(ee) & std_logic_vector(mm);    
+  end function real_to_ufp;
+
+  
+end package body pkg_ufp;
--- a/hw/beh/tst_hu.vhd
+++ b/hw/beh/tst_hu.vhd
@ -0,0 +1,126 @@
+-- tst_hu
+-- Testbench for Update H using stream of weights
+
+use work.pkg_sbs.all;
+
+-- Self-checking testbench for hu: feeds an initial h vector and three weight
+-- vectors, and compares the ho stream against reference values.
+entity tst_hu is
+  
+end entity tst_hu;
+
+
+architecture tst of tst_hu is
+  
+  constant T : time := 10 ns;           -- Period
+  
+  signal clk, rstn : bit := '0';
+  signal cfg_hu    : bit_vector(BW_HU_CFG -1 downto 0);
+  signal ena_w     : bit;
+  signal is_ini    : bit;
+  signal is_fst    : bit;
+  signal ena_ho    : bit;
+  signal wi        : real := 0.0;
+  signal hi        : real := 0.0;
+  signal ho        : real;
+  
+begin  -- architecture tst
+
+  -- Free-running clock; reset is released between two rising edges
+  clk <= not clk after T/2;
+  rstn <= '0', '1' after T/2+T/4;
+  
+  i_hu: entity work.hu
+    port map (
+      clk    => clk,
+      rstn   => rstn,
+      cfg_hu => cfg_hu,
+      ena_w  => ena_w,
+      is_ini => is_ini,
+      is_fst => is_fst,
+      ena_ho => ena_ho,
+      wi     => wi,
+      hi     => hi,
+      ho     => ho);
+
+  -- Checker: on every valid output compare ho with the next reference value
+  process (clk) is
+    type array_sol is array (natural range <>) of real;
+
+    -- Example of solution from python
+    constant h_sol : array_sol := (
+      --0.1 , 0.2 , 0.3 , 0.0  , 0.01, 0.01, 0.1 , 0.28,
+      0.01628 , 0.02056 , 0.03144 , 0.0      , 0.001228, 0.001588,    0.01228 , 0.039984,
+      1.28343706e-04, 1.62085171e-04, 3.17669760e-04, 0.00000000e+00, 1.24077120e-05, 1.99630656e-05, 1.84671552e-04, 3.05349811e-04,
+      2.17637063e-08, 2.74853687e-08, 5.38684101e-08, 0.00000000e+00, 2.10402060e-09, 3.38520923e-09, 3.13154230e-08, 5.17792718e-08
+      );
+
+    variable idx : natural;
+    
+  begin  -- process
+    if clk'event and clk = '1' then  -- rising clock edge
+      if ena_ho='1' then
+        -- h_sol is indexed 0 .. length-1, so every reference value including
+        -- the last one is checked; outputs beyond the table are only logged.
+        if idx<h_sol'length then
+          if abs(ho-h_sol(idx)) > 1.0e-09 then
+             report LF & ESC &  "[31;1m  [ERROR] h_sol= " & real'image(h_sol(idx)) &  ESC & "[0m" & LF severity error;            
+          end if;
+          idx := idx+1;
+        end if;
+        report LF & "[INFO] h_exp= " & real'image(ho) & LF severity note;
+      end if;
+    end if;
+  end process;    
+
+
+  -- Stimulus for hi / is_ini: stream the initial h vector, then wait while
+  -- the weight vectors are processed; done twice
+  process  is
+    constant h : array_as_h := (0.1, 0.2, 0.3, 0.0, 0.01, 0.01, 0.1, 0.28);
+  begin  -- process    
+    hi <= 0.0;
+    is_ini <= '0';
+    wait for T/4 + T/2 + T;
+
+    for n in 0 to 1 loop
+      for ki in h'range loop
+        hi <= h(ki);
+        is_ini <= '1';        
+        wait for T;
+      end loop;  -- ki
+      is_ini <= '0';        
+      hi <= 0.0;
+      --wait for T*(h'length+1)*3;    -- note +1 for void cycle
+      wait for T*(2+(h'length+2)*3);    -- note +2 for void cycle
+    end loop;  -- n
+    wait;
+  end process;
+
+  
+  -- Stimulus for wi / ena_w / is_fst: stream w0, w1, w2 with two idle cycles
+  -- after each vector; done twice, then the simulation is stopped
+  process  is
+    constant w0 : array_as_h := (0.3, 0.0, 0.01, 0.01, 0.1, 0.28, 0.1, 0.2);
+    constant w1 : array_as_h := (0.01, 0.01, 0.1, 0.28, 0.1, 0.2, 0.3, 0.0);
+    constant w2 : array_as_h := (0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125);
+
+    type array_w is array (0 to 2) of array_as_h;
+    constant w : array_w := (w0, w1, w2);
+
+  begin  -- process    
+    ena_w <= '0';
+    wi <= 0.0;
+    wait for T/4 + T/2 + T;
+
+    for n in 0 to 1 loop
+      for kj in w'range loop
+        is_fst <= '1', '0' after T;   -- flag the first component of each vector
+        for ki in w0'range loop
+          ena_w <= '1';
+          wi <= w(kj)(ki);
+          wait for T;
+        end loop;  -- ki
+        ena_w <= '0'; -- void cycle
+        wait for 2*T; -- 
+      end loop; -- kj
+    end loop; -- n
+
+    ena_w <= '0';
+    wait for 4*T;
+    -- severity failure is used on purpose to terminate the simulation
+    report LF & LF & ESC & "[35;1m [TST] End simulation" & ESC & "[0m" & LF severity failure;
+    
+  end process;
+
+  
+end architecture tst;
--- a/hw/beh/tst_pkg_ufp.vhd
+++ b/hw/beh/tst_pkg_ufp.vhd
@ -0,0 +1,136 @@
+use work.pkg_ufp.all;
+
+-- Testbench for pkg_ufp: prints round-trip conversions for corner-case
+-- encodings and for a ramp of real values (results are inspected in the log).
+entity tst_pkg_ufp is
+
+end entity tst_pkg_ufp;
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+architecture tst of tst_pkg_ufp is
+
+
+
+begin  -- architecture tst
+
+  process
+    -- Define params of ufp
+    constant BW : natural := 10;
+    constant BE : natural := 3;
+    constant EO : natural := 1;
+
+    constant BO : natural := BW - BE -EO;
+
+    variable r, r2 : real;
+    variable ee_mm : std_logic_vector(BW-1 downto 0);
+    variable mm : std_logic_vector(BW-1-BE downto 0);
+    variable ee : std_logic_vector(BE-1 downto 0);
+
+    constant r_ini : real := 0.0125;
+    constant r_inc : real := 0.0125/4.0;
+    constant N : natural := 10;
+
+    -- Report a real value next to its exponent/mantissa fields
+    procedure print (
+      ar : in real;
+      aee : in std_logic_vector(BE-1 downto 0);
+      amm : in std_logic_vector(BW-1-BE downto 0)) is
+    begin
+      report LF & "[TST]  " &
+        "r=" & real'image(ar) & HT & HT &
+        "ufp= " & integer'image(to_integer(unsigned(amm))) &
+        " *2**(-" & integer'image(BO) & "- " & integer'image(to_integer(unsigned(aee))) & " )"
+        severity note;
+    end procedure;
+
+
+    -- Decode (aee, amm) to real, encode it again and print both so that the
+    -- round trip can be compared
+    procedure tst_conv_r (
+      aee : in std_logic_vector(BE-1 downto 0);
+      amm : in std_logic_vector(BW-1-BE downto 0)) is
+      variable ar : real;
+      variable aee_mm : std_logic_vector(BW-1 downto 0);
+    begin  -- procedure tst_conv_r
+      aee_mm := aee & amm;
+      ar := ufp_to_real(aee_mm, BW, BE, EO);
+      print(ar, aee, amm);
+
+      -- Print the re-encoded fields (not the inputs), otherwise both lines
+      -- are always identical and the round trip is not tested
+      aee_mm := real_to_ufp(ar, BW, BE, EO);
+      print(ufp_to_real(aee_mm, BW, BE, EO),
+            aee_mm(BW-1 downto BW-BE),
+            aee_mm(BW-BE-1 downto 0));
+      
+      report LF & "[TST]  ----------------------------------" severity note;
+      
+    end procedure tst_conv_r;
+
+    constant ee_0min : std_logic_vector(BE-1 downto 0) := (others => '0');
+    constant ee_1min : std_logic_vector(BE-1 downto 0) := (0 => '1', others => '0');
+    constant ee_2min : std_logic_vector(BE-1 downto 0) := (1 => '1', others => '0');
+    constant ee_0max : std_logic_vector(BE-1 downto 0) := (others => '1');
+    constant ee_1max : std_logic_vector(BE-1 downto 0) := (0 => '0', others => '1');
+    constant ee_2max : std_logic_vector(BE-1 downto 0) := (1 => '0', others => '1');
+
+    constant mm_0min : std_logic_vector(BW-1-BE downto 0) := (others => '0');
+    constant mm_1min : std_logic_vector(BW-1-BE downto 0) := (0 => '1', others => '0');
+    constant mm_2min : std_logic_vector(BW-1-BE downto 0) := (1 => '1', others => '0');
+    constant mm_0max : std_logic_vector(BW-1-BE downto 0) := (others => '1');
+    constant mm_1max : std_logic_vector(BW-1-BE downto 0) := (0 => '0', others => '1');
+    constant mm_2max : std_logic_vector(BW-1-BE downto 0) := (1 => '0', others => '1');
+
+  begin
+    if true then
+      report LF & "[TST] Test corner examples =============================" severity note;
+
+    -- Conversion from ee_mm to real
+    tst_conv_r(ee_0max, mm_0min);
+    tst_conv_r(ee_0max, mm_1min);
+    tst_conv_r(ee_0max, mm_2min);
+    tst_conv_r(ee_0max, mm_2max);
+    tst_conv_r(ee_0max, mm_1max);
+    tst_conv_r(ee_0max, mm_0max);
+
+    tst_conv_r(ee_1max, mm_0min);
+    tst_conv_r(ee_1max, mm_1min);
+    tst_conv_r(ee_1max, mm_2min);
+    tst_conv_r(ee_1max, mm_2max);
+    tst_conv_r(ee_1max, mm_1max);
+    tst_conv_r(ee_1max, mm_0max);
+
+    tst_conv_r(ee_1min, mm_0min);
+    tst_conv_r(ee_1min, mm_1min);
+    tst_conv_r(ee_1min, mm_2min);
+    tst_conv_r(ee_1min, mm_2max);
+    tst_conv_r(ee_1min, mm_1max);
+    tst_conv_r(ee_1min, mm_0max);
+
+    tst_conv_r(ee_0min, mm_0min);
+    tst_conv_r(ee_0min, mm_1min);
+    tst_conv_r(ee_0min, mm_2min);
+    tst_conv_r(ee_0min, mm_2max);
+    tst_conv_r(ee_0min, mm_1max);
+    tst_conv_r(ee_0min, mm_0max);
+
+    end if;
+
+    if true then
+      report LF & "[TST] Test ramp =============================" severity note;
+    -- Conversion from real to ee_mm
+    r:= r_ini;
+    for ki in 0 to N-1 loop
+      ee_mm := real_to_ufp(r, BW, BE, EO);
+
+      ee := ee_mm(BW-1 downto BW-BE);
+      mm := ee_mm(BW-BE-1 downto 0);
+
+      print(r, ee, mm);
+      r2 := ufp_to_real(ee_mm, BW, BE, EO);
+      print(r2, ee, mm);
+      report LF & "[TST]  ----------------------------------" severity note;
+      
+      r := r + r_inc;
+    end loop;  -- ki 
+    end if;
+
+    wait;
+  end process;
+
+
+end architecture tst;
--- a/hw/beh/wg_mem.vhd
+++ b/hw/beh/wg_mem.vhd
@ -0,0 +1,126 @@
+-- wg_mem
+-- 
+-- Generate weights using stream of idx
+--
+-- Inputs are spike index and location of kernel to read
+--
+-- Current implementation assumes that all weights are cached
+-- and that sizes of and KI and KO are powers of 2
+
+use work.pkg_sbs.all;
+
+-- Interface of the weight generator: an internal weight memory is filled
+-- through the init ports and read out as a stream after each spike index.
+entity wg_mem is
+  generic (
+    LOG2_H  : natural := 2;             -- size of H (number of output IPs per
+                                        -- output location)
+    LOG2_KI : natural := 4;             -- number IPs  per input (thus spike index)
+    LOG2_KO : natural := 3);            -- number connections from IPi
+                                        -- block to IPo block (thus,
+                                        -- number of output IPs of full connected,
+                                        -- kernel size in conv)
+  port (
+    clk, rstn : in  bit;
+
+    -- Initial update
+    do_init_str  : in bit;   -- First step in init process
+    do_init_nxt  : in bit;   -- Next step in init process
+    w_init       : in real;  -- Weight value to update
+
+    -- Normal 
+    idx : in bit_vector(LOG2_KI-1 downto 0);  -- Index of spike
+    pos : in bit_vector(LOG2_KO-1 downto 0);  -- Location of output (edge, kernel)
+    ena_idx  : in bit;    -- idx/pos are valid, start streaming weights
+    busy_idx : out bit;   -- A weight stream is in progress
+    ena_w    : out bit;   -- Send a weight
+    w        : out real);  -- stream of weights
+
+end entity wg_mem;
+
+library ieee;
+use ieee.numeric_bit.all;
+
+-- Weight generator with an internal weight memory addressed by pos & idx & i.
+--  * Init mode (do_init_str / do_init_nxt): the three counters form one nested
+--    down-counter (i innermost, then idx, then pos) and w_init is written.
+--  * Normal mode (ena_idx): idx and pos are loaded from the ports and i
+--    counts down from I_LAST to 0 while busy, streaming the weights on w.
+architecture rtl of wg_mem is
+
+  signal busy_rg, busy_nxt, i_done, idx_done, pos_done : bit;
+  signal idx_rg, idx_nxt : unsigned(LOG2_KI-1 downto 0);
+  signal pos_rg, pos_nxt : unsigned(LOG2_KO-1 downto 0);
+  signal i_rg, i_nxt : unsigned(LOG2_H-1 downto 0);
+
+  -- All these params could be configurable..
+  constant I_LAST : unsigned(LOG2_H-1 downto 0) := (others=>'1');
+  constant IDX_LAST : unsigned(LOG2_KI-1 downto 0) := (others=>'1');
+  constant POS_LAST : unsigned(LOG2_KO-1 downto 0) := (others=>'1');
+
+  constant I_ZERO : unsigned(LOG2_H-1 downto 0) := (others=>'0');
+  constant IDX_ZERO : unsigned(LOG2_KI-1 downto 0) := (others=>'0');
+  constant POS_ZERO : unsigned(LOG2_KO-1 downto 0) := (others=>'0');
+
+  -- Memory
+  signal mem_addr, mem_addr_nxt : unsigned(LOG2_H+LOG2_KI+LOG2_KO-1 downto 0);
+  signal mem_wr, mem_rd : bit;
+  
+begin  -- architecture rtl
+
+  busy_idx <= busy_rg;
+  ena_w <= busy_rg;
+  
+  -- A counter is "done" when it has reached zero
+  i_done   <= '1' when i_rg = I_ZERO else '0';
+  idx_done <= '1' when idx_rg = IDX_ZERO else '0';
+  pos_done <= '1' when pos_rg = POS_ZERO else '0';
+  
+  busy_nxt <= '1' when ena_idx='1' else
+              '0' when i_done='1' else
+              busy_rg;
+
+  -- Next-state logic. The decrements are taken from the registered values:
+  -- decrementing the *_nxt signal itself would be a zero-delay combinational
+  -- loop (the signal would depend on its own value).
+  i_nxt <= I_LAST when (ena_idx='1') or (do_init_str='1') else
+           i_rg-1 when (busy_rg='1') or (do_init_nxt='1') else
+           i_rg;
+
+  -- idx steps when the inner counter i wraps
+  idx_nxt <= unsigned(idx) when ena_idx='1' else
+             IDX_LAST when do_init_str='1' else
+             idx_rg-1 when (do_init_nxt='1') and (i_done='1') else
+             idx_rg;
+
+  -- pos steps only when both inner counters wrap; without the i_done term it
+  -- would step on every init cycle during which idx is zero
+  pos_nxt <= unsigned(pos) when ena_idx='1' else
+             POS_LAST when do_init_str='1' else
+             pos_rg-1 when (do_init_nxt='1') and (idx_done='1') and (i_done='1') else
+             pos_rg;
+  
+  reg: process (clk, rstn) is
+  begin  -- process reg
+    if rstn = '0' then                 -- asynchronous reset (active low)
+      idx_rg <= IDX_LAST;
+      pos_rg <= POS_LAST;
+      i_rg <= I_LAST;
+      busy_rg <= '0';
+    elsif clk'event and clk = '1' then  -- rising clock edge
+      idx_rg <= idx_nxt;
+      pos_rg <= pos_nxt;
+      i_rg <= i_nxt;
+      busy_rg <= busy_nxt;
+    end if;
+  end process reg;
+
+
+  mem_addr_nxt <= pos_nxt & idx_nxt & i_nxt;
+  mem_wr <= do_init_str or do_init_nxt;
+  mem_rd <= '1';   -- the memory is read on every clock edge
+
+  -- Implementation of a synchronous single port memory 
+  -- (accesses use mem_addr, i.e. the address registered on the previous edge;
+  -- a write is done before the read within the same edge)
+  mem: process (clk) is
+    constant mem_size : natural := 2**(mem_addr'length);
+    type mem_ty is array (0 to mem_size-1) of real;
+    variable w_mem : mem_ty;
+  begin  -- process mem
+    if clk'event and clk = '1' then  -- rising clock edge
+      mem_addr <= mem_addr_nxt;      
+      if mem_wr='1' then
+        w_mem(to_integer(mem_addr)) := w_init;
+      end if;
+      if mem_rd='1' then
+        w <= w_mem(to_integer(mem_addr));
+      end if;
+    end if;
+  end process mem;
+
+end architecture rtl;
--- a/hw/do_sim/ex.py
+++ b/hw/do_sim/ex.py
@ -0,0 +1,51 @@
+import numpy as np
+
+h = np.array([0.1, 0.2, 0.3, 0.0, 0.01, 0.01, 0.1, 0.28])
+
+w0 = [0.3, 0.0, 0.01, 0.01, 0.1, 0.28, 0.1, 0.2]
+w1 = [0.01, 0.01, 0.1, 0.28, 0.1, 0.2, 0.3, 0.0]
+w2 = [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]
+
+w = np.array([w0, w1, w2])
+
+eps = 0.2
+
+h0 = h
+hw0 = h0 * w[0]
+sum_hw0 = np.sum(hw0)
+
+h1 = (h0 + eps*hw0/sum_hw0) /(1+eps)
+hw1 = h1 * w[1]
+sum_hw1 = np.sum(hw1)
+
+h2 = (h1 + eps*hw1/sum_hw1) /(1+eps)
+hw2 = h2 * w[2]
+sum_hw2 = np.sum(hw2)
+
+h3 = (h2 + eps*hw2/sum_hw2) /(1+eps)
+
+hp0 = h0
+hpw0 = hp0 * w[0]
+sum_hpw0 = np.sum(hpw0)
+norm_hp0 = np.sum(hp0)
+
+hp1 = (sum_hpw0 * hp0 + eps * norm_hp0 * hpw0)
+hpw1 = hp1 * w[1]
+sum_hpw1 = np.sum(hpw1)
+norm_hp1 = np.sum(hp1)
+
+hp2 = (sum_hpw1*hp1 + eps * norm_hp1 * hpw1)
+hpw2 = hp2 * w[2]
+sum_hpw2 = np.sum(hpw2)
+norm_hp2 = np.sum(hp2)
+
+hp3 = (sum_hpw2*hp2 + eps * norm_hp2 * hpw2)
+
+
+# Show that hp are just multiple of h
+print(hp1/h1)
+print(hp2/h2)
+print(hp3/h3)
+
+# This should be the output of HW model
+print(hp0, hp1, hp2, hp3)
--- a/hw/do_sim/ex_ufp.py
+++ b/hw/do_sim/ex_ufp.py
@ -0,0 +1,55 @@
+from math import log2, ceil, floor
+
+BW = 5
+BE = 3
+EO = 1
+
+BM = BW-BE
+BO = BM-EO   # BW-BE-EO
+EXP_MAX = 2**BE-1
+R_MIN = ufp_to_r(EXP_MAX, 1, BW, BE, EO)
+
+
+def r_to_ufp(r, BW, BE, EO):
+    BM = BW-BE
+    BO = BM-EO   # BW-BE-EO
+    EXP_MAX = 2**BE-1
+    R_MIN = ufp_to_r(EXP_MAX, 1, BW, BE, EO)
+
+    if r<R_MIN: # Small values
+        ee = EXP_MAX
+        mm = 0
+        return ee, mm
+        
+    aa = floor(BM - log2(r))
+    aa = max(min(aa, EXP_MAX+BO), BO)
+    ee = aa - BO
+    
+    mm = round(r*2**aa)
+    return ee, mm
+    
+
+def ufp_to_r(ee, mm, BW, BE, EO):
+    BM = BW-BE
+    BO = BM-EO   # BW-BE-EO
+    
+    r = mm * 2**(-BO-ee)
+    return r
+
+
+for mm in range(2**BM):
+    for ee in range(2**BE):
+        r_exp =  ufp_to_r(ee, mm, BW, BE, EO)
+        ee_exp, mm_exp = r_to_ufp(r_exp, BW, BE, EO)
+        r_exp2 = ufp_to_r(ee_exp, mm_exp, BW, BE, EO)
+        #print("r={} r_exp={} mm={} ee={}".format(r, r_exp, mm_exp, ee_exp))
+
+        print("mm={} ee={} r={}  ".format(mm, ee, r_exp))
+        print("mm={} ee={} r={}  ".format(mm_exp, ee_exp, r_exp2))
+        print("---------------------------------------------------")
+
+
+r=1.5625e-2        
+#r=3.1249999999999997e-2
+ee, mm = r_to_ufp(r, BW, BE, EO)
+r2 = ufp_to_r(ee, mm, BW, BE, EO)
--- a/hw/do_synth/.synopsys_dc.setup
+++ b/hw/do_synth/.synopsys_dc.setup
@ -0,0 +1,11 @@
+# Start-up file for the synthesis tool: only loads procedure libraries,
+# nothing is configured or run here.
+# Load library of functions...
+# NOTE(review): these paths point into a user home directory (~/SVN/...);
+# they must exist on the machine running the synthesis -- confirm.
+source ~/SVN/ids_setup/flow/flow_lib/lib_synth/flow_lib_synth.tcl
+source ~/SVN/ids_setup/flow/flow_lib/lib_synth/flow_tech_lib_synth.tcl
+
+# Load local library
+# (relative path: the tool has to be started from hw/do_synth)
+source cmd/lib_synth.tcl
+
+# Not executed to allow the user to change the defaults 
+# flow_setup_def
+# flow_set_tech tcbn65lptc
+    
--- a/hw/do_synth/cmd/do_synth.tcl
+++ b/hw/do_synth/cmd/do_synth.tcl
@ -0,0 +1,8 @@
+# Synthesis run script: arguments are unit name, clock period, technology,
+# clock port and reset port.
+# Load local library
+# source cmd/lib_synth.tcl
+
+#do_synth_def mua 10 tcbn65lptc clk rst
+
+#do_synth_def mua 1 tcbn40lptc clk rst
+
+# NOTE(review): cmd/lib_synth.tcl defines `do_synth`, not `do_synth_def`;
+# presumably do_synth_def comes from the sourced flow library -- confirm.
+# NOTE(review): the reset port of hu_dp in hw/rtl/hu_dp.vhd is named `rstn`,
+# while `rst` is passed here -- confirm the reset constraint is applied.
+do_synth_def hu_dp 0.6 tcbn40lptc clk rst
--- a/hw/do_synth/cmd/lib_synth.tcl
+++ b/hw/do_synth/cmd/lib_synth.tcl
@ -0,0 +1,31 @@
+# the local library for synthesis 
+
+# do_synth -- synthesize one unit and write reports and netlist.
+#
+#   UNIT_NAME  top-level design unit to analyze and elaborate (from ../rtl)
+#   T          clock period passed to flow_def_clock
+#   TECH       technology library name (default tcbn65lptc)
+#   clk        name of the clock port (default clk)
+#   rst        name of the reset port (default rst)
+#
+# Input and output delays are set to one eighth of the clock period.
+proc do_synth { UNIT_NAME  { T }  { TECH tcbn65lptc } {clk clk}  {rst rst}  }  {
+
+    # Set-up the environment and the technology
+    flow_setup_def
+    flow_set_tech $TECH
+
+    # Analyze and elaborate automatically. Params can be added using -param width=>32,ports=>8. Also possible to use a file
+    analyze    -library work -autoread -recursive ../rtl -top $UNIT_NAME
+    elaborate  -library work                                  $UNIT_NAME
+    link
+    check_design
+
+    # Set constraints
+    flow_def_rst     $rst
+    flow_def_clock   $T  $clk 
+    # Force floating point division: with an integer period "expr $T/8" is an
+    # integer division (T=10 gives 1, T=1 gives 0). The braces also avoid a
+    # second substitution of $T inside expr.
+    set io_delay [expr {double($T) / 8}]
+    flow_def_timing  $io_delay   $io_delay
+    check_timing
+
+    # Synthesize
+    compile_ultra                       ;# Run the synthesize
+    #change_names -rules vhdl -hier -verbose -log_changes ./log/change_names.log
+
+    # Write reports
+    #set prefix ${UNIT_NAME}
+    set prefix ${UNIT_NAME}_T=${T}_TECH=${TECH}   ;# Define prefix to identify reports. 
+    flow_report_all     $prefix   ;# Write reports
+    flow_write_netlist  $prefix   ;# Write results
+}
+
--- a/hw/do_synth/source.csh
+++ b/hw/do_synth/source.csh
@ -0,0 +1,5 @@
+# Environment for the synthesis tool; meant to be sourced from a csh/tcsh
+# shell before starting the tool.
+#source /eda/synopsys/synopsys_lic_init_2015-2016.csh
+#source /eda/synopsys/2015-16/scripts/SYN_2015.06-SP4_RHELx86.csh
+
+# License server (port@host) and tool binaries; both are site specific
+setenv SNPSLMD_LICENSE_FILE "28231@item0096"
+setenv PATH "/usrf01/prog/synopsys/syn/R-2020.09-SP4/bin:${PATH}"
--- a/hw/rtl/hu_dp.vhd
+++ b/hw/rtl/hu_dp.vhd
@ -0,0 +1,135 @@
+-- hu_dp
+-- Data path for Update H using stream of weights
+-- Trivial fixed-point implementation
+library ieee;
+use ieee.std_logic_1164.all;
+use work.pkg_sbs.all;
+
+-- Interface of the h-update data path.
+-- All data ports are B bits wide; the widths of ctr_hu and loc_h come from
+-- constants of work.pkg_sbs (BW_HU_CTR, ADDR_H_MAX).
+entity hu_dp is
+  generic (
+    K : natural := 3;                   -- additional bits for sum (not referenced by architecture rtl)
+    B : natural := 10);                 -- bitwidth of input
+  port (
+    clk, rstn : in  std_logic;          -- clock (rising edge) and asynchronous reset (active low)
+    ctr_hu     : in std_logic_vector(BW_HU_CTR-1 downto 0);   -- Control for data path
+    loc_h      : out std_logic_vector(ADDR_H_MAX-1 downto 0); -- Current location in H
+    eps        : in  std_logic_vector(B-1 downto 0);       -- value loaded into sum_hp_p when ctr_hu(2)='1'
+    wi         : in  std_logic_vector(B-1 downto 0);       -- stream of weights
+    hi         : in  std_logic_vector(B-1 downto 0);       -- stream of state
+    ho         : out std_logic_vector(B-1 downto 0));      -- stream of states
+
+end entity hu_dp;
+
+library ieee;
+use ieee.numeric_std.all;
+
+-- Fixed-point data path of the h-update unit.
+-- Combinational part: two products (hp_new, hw_nxt), the next-state muxes of
+-- the accumulators and the address mux.
+-- Sequential part: process rg (registers with asynchronous active-low reset)
+-- and process mem (two memories, written on the rising clock edge and read
+-- combinationally).
+architecture rtl of hu_dp is
+
+-- Memory  
+  subtype word is std_logic_vector(B-1 downto 0);
+  type array_as_h_w is array (N_H_MAX-1 downto 0) of word;
+
+  signal mem_hp : array_as_h_w;   -- State (internal)
+  signal mem_hw : array_as_h_w;   -- Copy of w*h
+  -- addr_wr is the registered address (used for writes and driven out on
+  -- loc_h); addr_nxt is its next value (used for reads).
+  signal addr_wr,  addr_nxt : std_logic_vector(ADDR_H_MAX-1 downto 0); -- Address
+
+  -- Data path for hp (i.e. h un-normalized) and hw (hp*w)
+  -- Both products are kept at full 2*B width; only the upper B bits are used downstream.
+  signal hp_new, hw_nxt : unsigned(2*B-1 downto 0);    
+
+  signal hp_new_rg, hp_p, h_eff : std_logic_vector(B-1 downto 0);    
+  signal hw_p  : std_logic_vector(B-1 downto 0); 
+  
+  -- Accumulators for normalization
+  signal sum_hw, sum_hw_nxt : std_logic_vector(B-1 downto 0);      -- Running sum hw
+  signal sum_hw_p, sum_hw_p_nxt : std_logic_vector(B-1 downto 0);  -- Saved sum hw of previous     
+  signal sum_hp, sum_hp_nxt : std_logic_vector(B-1 downto 0);      -- Running sum hp 
+  signal sum_hp_p, sum_hp_p_nxt  : std_logic_vector(B-1 downto 0); -- Saved normalization term: loaded with eps or with the upper half of hp_new
+
+  -- Control signals
+  signal ctr_sel_ini, ctr_sum_ini, ctr_update_sum, ctr_update_sum2 : std_logic;
+  signal ctr_addr_rst,  ctr_addr_inc,  ctr_write_hw : std_logic;
+  signal ctr_wr_hw, ctr_wr_hp : std_logic;
+  
+begin  -- architecture rtl
+
+  -- Get control signals
+  -- NOTE(review): ctr_write_hw (bit 5) is decoded but never read in this
+  -- architecture, and ctr_wr_hw and ctr_wr_hp are both taken from bit 6 --
+  -- confirm against the control path that this is intended.
+  ctr_sel_ini <= ctr_hu(0);
+  ctr_sum_ini <= ctr_hu(1);
+  ctr_update_sum <= ctr_hu(2);
+  ctr_addr_rst <= ctr_hu(3);  
+  ctr_addr_inc <= ctr_hu(4);
+  ctr_write_hw <= ctr_hu(5);
+  ctr_wr_hw <= ctr_hu(6);
+  ctr_wr_hp <= ctr_hu(6);
+  ctr_update_sum2 <= ctr_hu(7); 
+    
+  
+  -- Main calculation 
+  -- Sum of two B x B products, 2*B bits wide; a carry out of bit 2*B-1 is not kept.
+  hp_new <= unsigned(hp_p) * unsigned(sum_hw_p) + unsigned(sum_hp_p) * unsigned(hw_p) ; 
+
+  -- Mux to select first h or saved one
+  h_eff <= hi when ctr_sel_ini='1' else hp_new_rg;  
+
+  -- Calculate hw 
+  hw_nxt <= unsigned(h_eff) * unsigned(wi) ;  
+
+  -- Output h (note latency of a complete group)
+  ho <= h_eff;
+  
+  -- Accumulate hw and hp
+  -- ctr_sum_ini='1' restarts the running sums with the current value instead of
+  -- adding to them. The sums are B bits wide, so the additions wrap on overflow.
+  sum_hw_nxt   <= std_logic_vector(hw_nxt(2*B-1 downto B)) when ctr_sum_ini='1' else std_logic_vector(unsigned(sum_hw) + hw_nxt(2*B-1 downto B)); 
+  -- Priority: ctr_update_sum clears, then ctr_update_sum2 loads sum_hw, otherwise hold.
+  sum_hw_p_nxt <=  (others=>'0')  when ctr_update_sum='1' else  
+                   sum_hw when ctr_update_sum2='1' else sum_hw_p;
+
+  sum_hp_nxt   <= h_eff when ctr_sum_ini='1' else std_logic_vector(unsigned(sum_hp) + unsigned(h_eff)); -- Accumulate h
+  --sum_hp_p_nxt <= sum_hp_nxt * eps when ctr_update_sum='1' else sum_hp_p;
+  -- Priority: ctr_update_sum loads eps, then ctr_update_sum2 loads the upper
+  -- half of hp_new, otherwise hold.
+  sum_hp_p_nxt <=  eps when ctr_update_sum='1' else  
+                   std_logic_vector(hp_new(2*B-1 downto B)) when ctr_update_sum2='1' else sum_hp_p;
+  
+
+  -- Read from memory
+  -- Reads are combinational and use addr_nxt (the next address), not addr_wr.
+  --hw_p <= mem_hw(to_integer(unsigned(addr_nxt)));
+  hw_p <= mem_hw(to_integer(unsigned(addr_nxt)))
+          when ctr_update_sum2='0' else  sum_hp; -- Put sum_hp in mult 
+  hp_p <= mem_hp(to_integer(unsigned(addr_nxt)));
+
+  -- Address calculation
+  -- Reset has priority over increment; with neither asserted the address holds.
+  addr_nxt <= (others => '0') when ctr_addr_rst='1' else
+              std_logic_vector(unsigned(addr_wr) + 1) when ctr_addr_inc='1' else addr_wr;
+  loc_h <= addr_wr; -- Output for ctrl path
+    
+  -- Registers
+  -- Asynchronous active-low reset clears all registers; otherwise each
+  -- register takes its next value on the rising clock edge.
+  rg: process (clk, rstn) is
+  begin  -- process rg
+    if rstn = '0' then         
+      hp_new_rg <= (others=>'0');
+      sum_hw <= (others=>'0');
+      sum_hw_p <= (others=>'0');
+      sum_hp <= (others=>'0');
+      sum_hp_p <= (others=>'0');
+      addr_wr <= (others => '0');
+    elsif clk'event and clk = '1' then  
+      hp_new_rg <= std_logic_vector(hp_new(2*B-1 downto B));  -- keep only the upper B bits
+      sum_hw <= sum_hw_nxt;
+      sum_hw_p <= sum_hw_p_nxt;
+      sum_hp <= sum_hp_nxt;
+      sum_hp_p <= sum_hp_p_nxt;
+      addr_wr <= addr_nxt;
+    end if;
+  end process rg;
+
+
+  -- Memory
+  -- Synchronous writes at addr_wr; the memories have no reset.
+  mem: process (clk) is
+  begin  -- process mem
+    if clk'event and clk = '1' then  -- rising clock edge
+      if ctr_wr_hw='1' then
+        mem_hw(to_integer(unsigned(addr_wr))) <= std_logic_vector(hw_nxt(2*B-1 downto B));
+      end if;
+      if ctr_wr_hp='1' then
+        mem_hp(to_integer(unsigned(addr_wr))) <= h_eff;
+      end if;      
+    end if;   
+  end process mem;
+  
+end architecture rtl;
--- a/hw/rtl/mua.vhd
+++ b/hw/rtl/mua.vhd
@ -0,0 +1,52 @@
+-- Simple multiply with adder to check speed
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+
+-- mua: registered multiply-add, dt_mua = dt_mv * dt_mc + dt_add.
+-- Inputs and result are each registered once, so the result appears on
+-- dt_mua two rising clock edges after the operands are applied.
+entity mua is
+  generic (
+    B : natural := 10);                              -- operand bit width
+  port (
+    clk    : in  std_logic;                          -- clock, rising edge
+    arstn  : in  std_logic;                          -- asynchronous reset, active low
+    dt_mv  : in  std_logic_vector(B-1 downto 0);     -- first multiplication operand
+    dt_mc  : in  std_logic_vector(B-1 downto 0);     -- second multiplication operand
+    dt_add : in  std_logic_vector(2*B-1 downto 0);   -- value added to the product
+    dt_mua : out std_logic_vector(2*B-1 downto 0));  -- registered result
+end entity mua;
+
+
+library ieee;
+use ieee.numeric_std.all;
+
+architecture rtl of mua is
+
+  -- Registered copies of the inputs
+  signal mv_q  : unsigned(B-1 downto 0);
+  signal mc_q  : unsigned(B-1 downto 0);
+  signal add_q : unsigned(2*B-1 downto 0);
+
+  -- Result register (mua_q) and its combinational next value (mua_d)
+  signal mua_d : unsigned(2*B-1 downto 0);
+  signal mua_q : unsigned(2*B-1 downto 0);
+
+begin
+
+  -- All four registers share the clock and the asynchronous reset
+  regs: process (clk, arstn) is
+  begin
+    if arstn = '0' then
+      mua_q <= (others => '0');
+      add_q <= (others => '0');
+      mc_q  <= (others => '0');
+      mv_q  <= (others => '0');
+    elsif clk'event and clk = '1' then
+      mua_q <= mua_d;
+      add_q <= unsigned(dt_add);
+      mc_q  <= unsigned(dt_mc);
+      mv_q  <= unsigned(dt_mv);
+    end if;
+  end process regs;
+
+  -- Multiply-add between the input registers and the result register
+  mua_d <= mv_q * mc_q + add_q;
+
+  -- Drive the output port from the result register
+  dt_mua <= std_logic_vector(mua_q);
+
+end architecture rtl;
--- a/hw/rtl/mult_unsgn_pp_trunc.vhd
+++ b/hw/rtl/mult_unsgn_pp_trunc.vhd
@ -0,0 +1,65 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+-- Unsigned combinational multiplier built from explicit partial products.
+-- Partial-product bits in columns below K are left at '0' (truncation).
+entity mult_unsgn_pp_trunc is
+  generic (
+    BWa : natural := 16;   -- bit width of operand da
+    BWb : natural := 16;   -- bit width of operand db
+    K   : natural := 15);  -- first column whose partial-product bits are kept
+  port (
+    da   : in  std_logic_vector(BWa-1 downto 0);
+    db   : in  std_logic_vector(BWb-1 downto 0);
+    dout : out std_logic_vector(BWa+BWb-1 downto 0));
+end mult_unsgn_pp_trunc;
+
+architecture str of mult_unsgn_pp_trunc is
+
+  -- One row per bit of da; each row is already shifted to its column position
+  type pp_rows_t is array (0 to BWa-1) of std_logic_vector(BWa+BWb-2 downto 0);
+  signal pp_rows : pp_rows_t;
+
+begin
+
+  -- Partial products da(a)db(b), placed in column a+b. Example for 4 x 4:
+  --                                     da(0)db(3) da(0)db(2) da(0)db(1) da(0)db(0)
+  --                          da(1)db(3) da(1)db(2) da(1)db(1) da(1)db(0)
+  --               da(2)db(3) da(2)db(2) da(2)db(1) da(2)db(0)
+  --    da(3)db(3) da(3)db(2) da(3)db(1) da(3)db(0)
+  ppGen : process (da, db)
+    variable rows : pp_rows_t;
+  begin
+    rows := (others => (others => '0'));
+    for a in 0 to BWa-1 loop
+      for b in 0 to BWb-1 loop
+        -- Keep only the bits that fall in column K or higher
+        if a + b >= K then
+          rows(a)(a+b) := da(a) and db(b);
+        end if;
+      end loop;
+    end loop;
+    pp_rows <= rows;
+  end process ppGen;
+
+  -- Sum all rows; the accumulator is one bit wider than a row
+  CSA_tree : process (pp_rows)
+    variable acc : unsigned(BWa+BWb-1 downto 0);
+  begin
+    for r in 0 to BWa-1 loop
+      if r /= 0 then
+        acc := acc + unsigned(pp_rows(r));
+      else
+        acc := '0' & unsigned(pp_rows(0));
+      end if;
+    end loop;
+    dout <= std_logic_vector(acc);
+  end process CSA_tree;
+
+end str;
--- a/hw/rtl/two_mult_unsgn_pp_trunc.vhd
+++ b/hw/rtl/two_mult_unsgn_pp_trunc.vhd
@ -0,0 +1,86 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+-- Sum of two unsigned products, da*db + dc*dd, built from explicit partial
+-- products. Partial-product bits in columns below K are left at '0'.
+entity two_mult_unsgn_pp_trunc is
+  generic (
+    BWa : natural := 16;   -- bit width of operands da and dc
+    BWb : natural := 16;   -- bit width of operands db and dd
+    K   : natural := 15);  -- first column whose partial-product bits are kept
+  port (
+    da   : in  std_logic_vector(BWa-1 downto 0);
+    db   : in  std_logic_vector(BWb-1 downto 0);
+    dc   : in  std_logic_vector(BWa-1 downto 0);
+    dd   : in  std_logic_vector(BWb-1 downto 0);
+    dout : out std_logic_vector(BWa+BWb downto 0));
+end two_mult_unsgn_pp_trunc;
+
+architecture str of two_mult_unsgn_pp_trunc is
+
+  -- Rows 0 .. BWa-1 hold the da*db partial products,
+  -- rows BWa .. 2*BWa-1 hold the dc*dd partial products.
+  type pp_rows_t is array (0 to 2*BWa-1) of std_logic_vector(BWa+BWb-2 downto 0);
+  signal pp_rows : pp_rows_t;
+
+begin
+
+  -- Partial products da(a)db(b), placed in column a+b. Example for 4 x 4:
+  --                                     da(0)db(3) da(0)db(2) da(0)db(1) da(0)db(0)
+  --                          da(1)db(3) da(1)db(2) da(1)db(1) da(1)db(0)
+  --               da(2)db(3) da(2)db(2) da(2)db(1) da(2)db(0)
+  --    da(3)db(3) da(3)db(2) da(3)db(1) da(3)db(0)
+  ppGen1 : process (da, db)
+    variable rows : pp_rows_t;
+  begin
+    rows := (others => (others => '0'));
+    for a in 0 to BWa-1 loop
+      for b in 0 to BWb-1 loop
+        -- Keep only the bits that fall in column K or higher
+        if a + b >= K then
+          rows(a)(a+b) := da(a) and db(b);
+        end if;
+      end loop;
+    end loop;
+    -- Only the lower half of the shared row array is driven here
+    pp_rows(0 to BWa-1) <= rows(0 to BWa-1);
+  end process ppGen1;
+
+  -- Same scheme for the second product: dc(a)dd(b) in column a+b
+  ppGen2 : process (dc, dd)
+    variable rows : pp_rows_t;
+  begin
+    rows := (others => (others => '0'));
+    for a in 0 to BWa-1 loop
+      for b in 0 to BWb-1 loop
+        if a + b >= K then
+          rows(a)(a+b) := dc(a) and dd(b);
+        end if;
+      end loop;
+    end loop;
+    -- Only the upper half of the shared row array is driven here
+    pp_rows(BWa to 2*BWa-1) <= rows(0 to BWa-1);
+  end process ppGen2;
+
+  -- Sum all rows of both products; the accumulator is two bits wider than a row
+  CSA_tree : process (pp_rows)
+    variable acc : unsigned(BWa+BWb downto 0);
+  begin
+    for r in 0 to 2*BWa-1 loop
+      if r /= 0 then
+        acc := acc + unsigned(pp_rows(r));
+      else
+        acc := "00" & unsigned(pp_rows(0));
+      end if;
+    end loop;
+    dout <= std_logic_vector(acc);
+  end process CSA_tree;
+
+end str;
--- a/hw/sbs_hw.org
+++ b/hw/sbs_hw.org
@ -0,0 +1,29 @@
+#+TITLE: sbs_hw.org
+
+* Idea
+  Start with HW implementation.
+
+* Modules
+  hu_   : h update
+  wg_   : weight generator
+  sg_   : spikes generator
+
+* HU 
+This block updates H according to the SbS equations.
+- First version done. 
+- Need to decide if reciprocal or multiplication
+
+
+* WG
+This block generates a stream of weights given a stream of spikes
+- Instead of a stream of spikes scanned per region, it could be better to receive a spike identifier and
+  then all the edges (from location to location) that use this particular spike.
+
+  
+
+
+* New strategy for read
+
+When doing a convolution, we can read
+
+