Add files via upload

parent 4426f05e61
commit 81ec320f5c

77 changed files with 8266 additions and 0 deletions

161  DATA_FASHION_MNIST/convert.py  Normal file
@@ -0,0 +1,161 @@
# MIT License
# Copyright 2022 University of Bremen
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#
# David Rotermund ( davrot@uni-bremen.de )
#
#
# Release history:
# ================
# 1.0.0 -- 01.05.2022: first release
#
#

import numpy as np

# [offset] [type]          [value]           [description]
# 0000     32 bit integer  0x00000801 (2049) magic number (MSB first)
# 0004     32 bit integer  60000             number of items
# 0008     unsigned byte   ??                label
# 0009     unsigned byte   ??                label
# ........
# xxxx     unsigned byte   ??                label
# The label values are 0 to 9.


class ReadLabel:
    """Class for reading the labels from an MNIST label file"""

    def __init__(self, filename):
        self.filename: str = filename
        self.data = self.read_from_file(filename)

    def read_from_file(self, filename):
        int32_data = np.dtype(np.uint32)
        int32_data = int32_data.newbyteorder(">")
        file = open(filename, "rb")

        magic_flag = np.frombuffer(file.read(4), int32_data)[0]

        if magic_flag != 2049:
            data = np.zeros(0)
            number_of_elements = 0
        else:
            number_of_elements = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_elements < 1:
            data = np.zeros(0)
        else:
            data = np.frombuffer(file.read(number_of_elements), dtype=np.uint8)

        file.close()

        return data


# [offset] [type]          [value]           [description]
# 0000     32 bit integer  0x00000803 (2051) magic number
# 0004     32 bit integer  60000             number of images
# 0008     32 bit integer  28                number of rows
# 0012     32 bit integer  28                number of columns
# 0016     unsigned byte   ??                pixel
# 0017     unsigned byte   ??                pixel
# ........
# xxxx     unsigned byte   ??                pixel
# Pixels are organized row-wise.
# Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).


class ReadPicture:
    """Class for reading the images from an MNIST image file"""

    def __init__(self, filename):
        self.filename: str = filename
        self.data = self.read_from_file(filename)

    def read_from_file(self, filename):
        int32_data = np.dtype(np.uint32)
        int32_data = int32_data.newbyteorder(">")
        file = open(filename, "rb")

        magic_flag = np.frombuffer(file.read(4), int32_data)[0]

        if magic_flag != 2051:
            data = np.zeros(0)
            number_of_elements = 0
        else:
            number_of_elements = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_elements < 1:
            data = np.zeros(0)
            number_of_rows = 0
        else:
            number_of_rows = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_rows != 28:
            data = np.zeros(0)
            number_of_columns = 0
        else:
            number_of_columns = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_columns != 28:
            data = np.zeros(0)
        else:
            data = np.frombuffer(
                file.read(number_of_elements * number_of_rows * number_of_columns),
                dtype=np.uint8,
            )
            data = data.reshape(number_of_elements, number_of_columns, number_of_rows)

        file.close()

        return data


def preprocess_data_set(test_mode):

    if test_mode is True:
        filename_out_pattern: str = "TestPatternStorage.npy"
        filename_out_label: str = "TestLabelStorage.npy"
        filename_in_image: str = "t10k-images-idx3-ubyte"
        filename_in_label = "t10k-labels-idx1-ubyte"
    else:
        filename_out_pattern = "TrainPatternStorage.npy"
        filename_out_label = "TrainLabelStorage.npy"
        filename_in_image = "train-images-idx3-ubyte"
        filename_in_label = "train-labels-idx1-ubyte"

    pictures = ReadPicture(filename_in_image)
    labels = ReadLabel(filename_in_label)

    # Scale the pixel values down to 0.0 ... 1.0
    max_value = np.max(pictures.data.astype(np.float32))
    d = np.float32(pictures.data.astype(np.float32) / max_value)

    label_storage = np.uint64(labels.data)
    pattern_storage = d.astype(np.float32)

    np.save(filename_out_pattern, pattern_storage)
    np.save(filename_out_label, label_storage)


preprocess_data_set(True)
preprocess_data_set(False)
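
As a quick sanity check of what convert.py writes out, the arrays can be loaded back with numpy; the shapes follow from the code above and the Fashion-MNIST training set size (60000 items of 28x28 pixels). This snippet is illustrative and not part of the commit:

import numpy as np

pattern = np.load("TrainPatternStorage.npy")  # float32, scaled to 0.0 ... 1.0
label = np.load("TrainLabelStorage.npy")      # uint64 class indices 0 ... 9

assert pattern.shape == (60000, 28, 28)
assert label.shape == (60000,)
print(pattern.dtype, label.dtype)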

8  DATA_FASHION_MNIST/data_url.txt  Normal file
@@ -0,0 +1,8 @@
https://github.com/zalandoresearch/fashion-mnist

We need:
t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz

Then:
gzip -d *.gz
python convert.py
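
For reference, the gzip -d step can also be done from Python with only the standard library, e.g. on a system without a convenient shell; a minimal sketch:

import glob
import gzip
import shutil

for name in glob.glob("*.gz"):
    # Decompress foo.gz to foo, like gzip -d (but keeps the .gz file).
    with gzip.open(name, "rb") as src, open(name[:-3], "wb") as dst:
        shutil.copyfileobj(src, dst)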

10  clean.sh  Normal file
@@ -0,0 +1,10 @@
read -p "Are you sure? " -n 1 -r
echo # (optional) move to a new line
if [[ ! $REPLY =~ ^[Yy]$ ]]
then
    exit 1
fi

rm -rf Log
rm -rf Parameters
rm *.txt

4  dataset.json  Normal file
@@ -0,0 +1,4 @@
{
    "data_path": "./DATA_FASHION_MNIST/",
    "data_mode": "MNIST_FASHION"
}

24  def.json  Normal file
@@ -0,0 +1,24 @@
{
    "epoch_id_max": 200,
    "number_of_spikes": [1600],
    "batch_size": 24,
    "forgetting_offset": 0.0,
    "stage_id": 0,
    "simulation_id": 0,
    "learning_parameters": {
        "learning_rate_threshold_w": 0.001,
        "eps_xy_intitial": 1.0,
        "number_of_batches_for_one_update": 20,
        "learning_rate_gamma_w": 0.001,
        "lr_scheduler_patience_w": 50,
        "adapt_learning_rate_after_minibatch": true,
        "w_trainable": [true]
    },
    "augmentation": {},
    "image_statistics": {},
    "approximation_setting": {}
}

24  def_sbs_L0.json  Normal file
@@ -0,0 +1,24 @@
{
    "epoch_id_max": 200,
    "number_of_spikes": [1600],
    "batch_size": 24,
    "forgetting_offset": 0.0,
    "stage_id": 0,
    "simulation_id": 0,
    "learning_parameters": {
        "learning_rate_threshold_w": 0.001,
        "eps_xy_intitial": 1.0,
        "number_of_batches_for_one_update": 20,
        "learning_rate_gamma_w": 1.0,
        "lr_scheduler_patience_w": 50,
        "adapt_learning_rate_after_minibatch": true,
        "w_trainable": [true]
    },
    "augmentation": {},
    "image_statistics": {},
    "approximation_setting": {}
}

47  get_perf.py  Normal file
@@ -0,0 +1,47 @@
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import matplotlib.pyplot as plt

from tensorboard.backend.event_processing import event_accumulator
import numpy as np
import json
from jsmin import jsmin
import glob

# -------------------------------

filename: str = "def.json"
with open(filename) as json_file:
    minified = jsmin(json_file.read())
data = json.loads(minified)
number_of_spikes = data["number_of_spikes"]


# -------------------------------


path_runs: str = "./Log/*"

temp = glob.glob(path_runs)
assert len(temp) == 1
path = temp[0]


acc = event_accumulator.EventAccumulator(path)
acc.Reload()

available_scalar = acc.Tags()["scalars"]
available_histograms = acc.Tags()["histograms"]

which_scalar = "Test Error"
te = acc.Scalars(which_scalar)

temp = []
for te_item in te:
    temp.append((te_item[1], te_item[2]))
temp = np.array(temp)

print(temp)
np.save(f"test_error_{number_of_spikes}.npy", temp)
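
get_perf.py imports matplotlib but never uses it, so a plotting step was presumably intended. A hedged sketch that draws the saved curve; the file name assumes number_of_spikes is [1600], matching def.json above:

import numpy as np
import matplotlib.pyplot as plt

# Columns are (step, value), as extracted from the "Test Error" scalar above.
temp = np.load("test_error_[1600].npy")
plt.plot(temp[:, 0], temp[:, 1])
plt.xlabel("Step")
plt.ylabel("Test Error")
plt.show()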

201  learn_it.py  Normal file
@@ -0,0 +1,201 @@
# %%
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import sys
import torch
import dataconf
import logging
from datetime import datetime

from network.Parameter import Config

from network.build_network import build_network
from network.build_optimizer import build_optimizer
from network.build_lr_scheduler import build_lr_scheduler
from network.build_datasets import build_datasets
from network.load_previous_weights import load_previous_weights

from network.loop_train_test import loop_test, loop_train, run_lr_scheduler

from torch.utils.tensorboard import SummaryWriter


# ######################################################################
# We want to log what is going on into a file and onto the screen
# ######################################################################

now = datetime.now()
dt_string_filename = now.strftime("%Y_%m_%d_%H_%M_%S")
logging.basicConfig(
    filename="log_" + dt_string_filename + ".txt",
    filemode="w",
    level=logging.INFO,
    format="%(asctime)s %(message)s",
)
logging.getLogger().addHandler(logging.StreamHandler())

# ######################################################################
# Load the config data from the json files
# ######################################################################

if len(sys.argv) < 2:
    raise Exception("Argument: Config file name is missing")

filename: str = sys.argv[1]

if os.path.exists(filename) is False:
    raise Exception(f"Config file not found! {filename}")

if os.path.exists("network.json") is False:
    raise Exception("Config file not found! network.json")

if os.path.exists("dataset.json") is False:
    raise Exception("Config file not found! dataset.json")


cfg = dataconf.multi.file("network.json").file("dataset.json").file(filename).on(Config)
logging.info(cfg)

logging.info(f"Using configuration file: {filename}")

logging.info(f"Number of spikes: {cfg.number_of_spikes}")
logging.info(f"Cooldown after spikes: {cfg.cooldown_after_number_of_spikes}")
logging.info(f"Reduction cooldown: {cfg.reduction_cooldown}")
logging.info("")
logging.info(f"Epsilon 0: {cfg.epsilon_0}")
logging.info(f"Batch size: {cfg.batch_size}")
logging.info(f"Data mode: {cfg.data_mode}")
logging.info("")
logging.info("*** Config loaded.")
logging.info("")

tb = SummaryWriter(log_dir=cfg.log_path)

# ###########################################
# GPU: yes / no?
# ###########################################
default_dtype = torch.float32
torch.set_default_dtype(default_dtype)
torch_device: str = "cuda:0" if torch.cuda.is_available() else "cpu"
use_gpu: bool = torch.cuda.is_available()
print(f"Using {torch_device} device")
device = torch.device(torch_device)

# ######################################################################
# Prepare the test and training data
# ######################################################################

the_dataset_train, the_dataset_test, my_loader_test, my_loader_train = build_datasets(
    cfg
)

logging.info("*** Data loaded.")

# ######################################################################
# Build the network, optimizer, and LR scheduler
# ######################################################################

network = build_network(
    cfg=cfg, device=device, default_dtype=default_dtype, logging=logging
)
logging.info("")

optimizer = build_optimizer(network=network, cfg=cfg, logging=logging)

lr_scheduler = build_lr_scheduler(optimizer=optimizer, cfg=cfg, logging=logging)

logging.info("*** Network generated.")

load_previous_weights(
    network=network,
    overload_path=cfg.learning_parameters.overload_path,
    logging=logging,
    device=device,
    default_dtype=default_dtype,
)

logging.info("")

last_test_performance: float = -1.0
with torch.no_grad():
    if cfg.learning_parameters.learning_active is True:
        while cfg.epoch_id < cfg.epoch_id_max:

            # ##############################################
            # Run a training data epoch
            # ##############################################
            network.train()
            (
                my_loss_for_batch,
                performance_for_batch,
                full_loss,
                full_correct,
            ) = loop_train(
                cfg=cfg,
                network=network,
                my_loader_train=my_loader_train,
                the_dataset_train=the_dataset_train,
                optimizer=optimizer,
                device=device,
                default_dtype=default_dtype,
                logging=logging,
                tb=tb,
                adapt_learning_rate=cfg.learning_parameters.adapt_learning_rate_after_minibatch,
                lr_scheduler=lr_scheduler,
                last_test_performance=last_test_performance,
            )

            # Let the torch learning rate scheduler update the
            # learning rates of the optimizers
            if cfg.learning_parameters.adapt_learning_rate_after_minibatch is False:
                run_lr_scheduler(
                    cfg=cfg,
                    lr_scheduler=lr_scheduler,
                    optimizer=optimizer,
                    performance_for_batch=performance_for_batch,
                    my_loss_for_batch=my_loss_for_batch,
                    tb=tb,
                    logging=logging,
                )

            # ##############################################
            # Run the test data
            # ##############################################
            network.eval()
            last_test_performance = loop_test(
                epoch_id=cfg.epoch_id,
                cfg=cfg,
                network=network,
                my_loader_test=my_loader_test,
                the_dataset_test=the_dataset_test,
                device=device,
                default_dtype=default_dtype,
                logging=logging,
                tb=tb,
            )

            # Next epoch
            cfg.epoch_id += 1
    else:
        # ##############################################
        # Run the test data
        # ##############################################
        network.eval()
        last_test_performance = loop_test(
            epoch_id=cfg.epoch_id,
            cfg=cfg,
            network=network,
            my_loader_test=my_loader_test,
            the_dataset_test=the_dataset_test,
            device=device,
            default_dtype=default_dtype,
            logging=logging,
            tb=tb,
        )

tb.close()


# %%
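
A minimal sketch of the dataconf pattern used at the top of learn_it.py: several JSON files are merged, with later files taking precedence, and the result is type-checked against a dataclass. The Toy dataclass and file names here are illustrative, not part of the repository:

from dataclasses import dataclass

import dataconf

@dataclass
class Toy:
    batch_size: int
    data_mode: str

# base.json supplies defaults; override.json (the analogue of the config file
# named on the command line above) overrides individual keys.
cfg = dataconf.multi.file("base.json").file("override.json").on(Toy)
print(cfg.batch_size, cfg.data_mode)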

5  make_new_previous.sh  Normal file
@@ -0,0 +1,5 @@
MAX_NUMBER=199

MyPATH="Previous++"
mkdir $MyPATH
cp Parameters/*_S$MAX_NUMBER.npy $MyPATH

91  network.json  Normal file
@@ -0,0 +1,91 @@
{
    "network_structure": {
        "number_of_output_neurons": 10,
        "layer_type": [
            "SbS",
            "MAX POOLING",
            "SbS",
            "MAX POOLING",
            "SbS",
            "SbS"
        ],
        "strides": [
            [1, 1], // "SbS"
            [2, 2], // POOLING
            [1, 1], // "SbS"
            [2, 2], // POOLING
            [1, 1], // "SbS"
            [1, 1]  // "SbS"
        ],
        "forward_neuron_numbers": [
            [1, 32],  // "SbS"
            [32, 32], // POOLING
            [32, 64], // "SbS"
            [64, 64], // POOLING
            [64, 96], // "SbS"
            [96, 10]  // "SbS"
        ],
        "forward_kernel_size": [
            [5, 5], // "SbS"
            [2, 2], // POOLING
            [5, 5], // "SbS"
            [2, 2], // POOLING
            [3, 3], // "SbS"
            [1, 1]  // "SbS"
        ]
    }
}
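
A small check that this layer stack fits a 28x28 Fashion-MNIST input. The arithmetic assumes valid (padding-free) windows, which is my assumption about the SbS convolutions rather than something the config states:

kernels = [(5, 5), (2, 2), (5, 5), (2, 2), (3, 3), (1, 1)]
strides = [(1, 1), (2, 2), (1, 1), (2, 2), (1, 1), (1, 1)]

x = y = 28
for (kx, ky), (sx, sy) in zip(kernels, strides):
    x = (x - kx) // sx + 1
    y = (y - ky) // sy + 1
    print(x, "x", y)
# prints 24x24, 12x12, 8x8, 4x4, 2x2, 2x2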

154  network/Adam.py  Normal file
@@ -0,0 +1,154 @@
import torch
import math


class Adam(torch.optim.Optimizer):

    sbs_setting: list[bool]
    lr: float
    beta1: float
    beta2: float
    eps: float
    maximize: bool

    def __init__(
        self,
        params,
        sbs_setting: list[bool],
        lr: float = 1e-3,
        beta1: float = 0.9,
        beta2: float = 0.999,
        eps: float = 1e-8,
        maximize: bool = False,
    ) -> None:

        assert lr > 0.0

        assert eps > 0.0

        assert beta1 > 0.0
        assert beta1 < 1.0

        assert beta2 > 0.0
        assert beta2 < 1.0

        assert len(sbs_setting) == len(params)

        self.sbs_setting = sbs_setting
        self.params = params
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.maximize = maximize

        defaults = dict(
            lr=lr,
            beta1=beta1,
            beta2=beta2,
            eps=eps,
            maximize=maximize,
        )
        super().__init__(params, defaults)

    def step(self):

        params_with_grad = []
        grads = []
        exp_avgs = []
        exp_avg_sqs = []
        state_steps = []
        sbs_setting = []

        for id, p in enumerate(self.params):
            if p.grad is not None:
                params_with_grad.append(p)
                grads.append(p.grad)
                sbs_setting.append(self.sbs_setting[id])

                state = self.state[p]

                # Lazy state initialization
                if len(state) == 0:

                    state["step"] = torch.tensor(0.0)

                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )

                exp_avgs.append(state["exp_avg"])
                exp_avg_sqs.append(state["exp_avg_sq"])
                state_steps.append(state["step"])

        self.adam(
            params_with_grad,
            grads,
            sbs_setting,
            exp_avgs,
            exp_avg_sqs,
            state_steps,
            beta1=self.beta1,
            beta2=self.beta2,
            lr=self.lr,
            eps=self.eps,
            maximize=self.maximize,
        )

    def adam(
        self,
        params: list[torch.Tensor],
        grads: list[torch.Tensor],
        sbs_setting: list[bool],
        exp_avgs: list[torch.Tensor],
        exp_avg_sqs: list[torch.Tensor],
        state_steps: list[torch.Tensor],
        beta1: float,
        beta2: float,
        lr: float,
        eps: float,
        maximize: bool,
    ) -> None:

        with torch.no_grad():

            for i, param in enumerate(params):

                if maximize is False:
                    grad = grads[i]
                else:
                    grad = -grads[i]

                exp_avg = exp_avgs[i]
                exp_avg_sq = exp_avg_sqs[i]
                step_t = state_steps[i]

                # increase step
                step_t += 1

                # Decay the first and second moment running average coefficient
                exp_avg *= beta1
                exp_avg += (1.0 - beta1) * grad

                exp_avg_sq *= beta2
                exp_avg_sq += (1.0 - beta2) * grad**2

                step_size: float = lr / (1.0 - beta1 ** float(step_t))

                denom = (
                    exp_avg_sq.sqrt() / math.sqrt(1.0 - beta2 ** float(step_t))
                ) + eps

                if sbs_setting[i] is False:
                    param -= step_size * (exp_avg / denom)
                else:
                    delta = torch.exp(-step_size * (exp_avg / denom))
                    print(
                        f"{float(delta.min()) - 1.0:.4e} {float(delta.max()) - 1.0:.4e}"
                    )
                    param *= delta
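
The branch on sbs_setting[i] is the interesting part of this optimizer: SbS weights are non-negative, so instead of the usual additive Adam step the update is applied multiplicatively via exp(-step), which cannot push a weight below zero. A tiny numpy illustration with made-up numbers:

import numpy as np

w = np.array([0.2, 0.5, 0.3])        # non-negative weights
update = np.array([0.4, -0.1, 0.2])  # step_size * exp_avg / denom

print(w - update)           # additive step: the first entry turns negative
print(w * np.exp(-update))  # multiplicative step: stays strictly positive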

714  network/CPP_Cuda/HDynamicCNNManyIP.cu  Normal file
@@ -0,0 +1,714 @@
#include "HDynamicCNNManyIP.h"

#include <omp.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <cassert>
#include <iostream>


HDynamicCNNManyIP::HDynamicCNNManyIP()
{

};

HDynamicCNNManyIP::~HDynamicCNNManyIP()
{

};

bool HDynamicCNNManyIP::update_entrypoint(
    int64_t h_pointer_addr,
    int64_t h_dim_0,
    int64_t h_dim_1,
    int64_t h_dim_2,
    int64_t h_dim_3,
    int64_t epsilon_xy_pointer_addr,
    int64_t epsilon_xy_dim_0,
    int64_t epsilon_xy_dim_1,
    int64_t epsilon_xy_dim_2,
    int64_t epsilon_t_pointer_addr,
    int64_t epsilon_t_dim_0,
    int64_t weights_pointer_addr,
    int64_t weights_dim_0,
    int64_t weights_dim_1,
    int64_t input_pointer_addr,
    int64_t input_dim_0,
    int64_t input_dim_1,
    int64_t input_dim_2,
    int64_t input_dim_3,
    int64_t init_vector_pointer_addr,
    int64_t init_vector_dim_0,
    int64_t number_of_processes,
    float forgetting_offset,
    int64_t gpu_tuning_factor)
{

    size_t number_of_pattern = input_dim_0;

    size_t h_dim = init_vector_dim_0;
    float* h_init_ptr = (float*)init_vector_pointer_addr;
    assert((h_init_ptr != nullptr));
    assert((h_dim > 0));

    float* h_pointer = (float*)h_pointer_addr;
    assert((h_pointer != nullptr));
    assert((h_dim_0 > 0));
    assert((h_dim_1 > 0));
    assert((h_dim_2 > 0));
    assert((h_dim_3 > 0));

    size_t h_dim_c0 = h_dim_1 * h_dim_2 * h_dim_3;
    size_t h_dim_c1 = h_dim_2 * h_dim_3;
    size_t h_dim_c2 = h_dim_3;

    float* epsilon_xy_pointer = (float*)epsilon_xy_pointer_addr;
    assert((epsilon_xy_pointer != nullptr));
    assert((epsilon_xy_dim_0 > 0));
    assert((epsilon_xy_dim_1 > 0));

    size_t epsilon_xy_dim_c0 = epsilon_xy_dim_2 * epsilon_xy_dim_1;
    size_t epsilon_xy_dim_c1 = epsilon_xy_dim_2;

    float* epsilon_t_pointer = (float*)epsilon_t_pointer_addr;
    assert((epsilon_t_pointer != nullptr));
    assert((epsilon_t_dim_0 > 0));

    float* weights_pointer = (float*)weights_pointer_addr;
    assert((weights_pointer != nullptr));
    assert((weights_dim_0 > 0));
    assert((weights_dim_1 > 0));

    size_t weights_dim_c0 = weights_dim_1;

    int64_t* input_pointer = (int64_t*)input_pointer_addr;
    assert((input_pointer != nullptr));
    assert((input_dim_0 > 0));
    assert((input_dim_1 > 0));
    assert((input_dim_2 > 0));
    assert((input_dim_3 > 0));

    size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
    size_t input_dim_c1 = input_dim_2 * input_dim_3;
    size_t input_dim_c2 = input_dim_3;

    assert((h_dim == weights_dim_1));
    size_t number_of_spikes = input_dim_1;
    size_t dim_x = input_dim_2;
    size_t dim_y = input_dim_3;

    float forgetting_offset_local = forgetting_offset / static_cast<float>(h_dim);


    // --------------------
    if (number_of_processes > 0)
    {
        omp_set_num_threads(number_of_processes);

        size_t pattern_id;
#pragma omp parallel for
        for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
        {
            update(
                h_init_ptr,
                h_pointer,
                h_dim_c0,
                h_dim_c1,
                h_dim_c2,
                h_dim,
                epsilon_xy_pointer,
                epsilon_xy_dim_c0,
                epsilon_xy_dim_c1,
                epsilon_t_pointer,
                weights_pointer,
                weights_dim_c0,
                input_pointer,
                input_dim_c0,
                input_dim_c1,
                input_dim_c2,
                number_of_spikes,
                dim_x,
                dim_y,
                forgetting_offset,
                forgetting_offset_local,
                pattern_id);
        }
    }
    else
    {
        gpu_update(
            h_init_ptr,
            h_pointer,
            h_dim_c0,
            h_dim_c1,
            h_dim_c2,
            h_dim,
            epsilon_xy_pointer,
            epsilon_xy_dim_c0,
            epsilon_xy_dim_c1,
            epsilon_t_pointer,
            weights_pointer,
            weights_dim_c0,
            input_pointer,
            input_dim_c0,
            input_dim_c1,
            input_dim_c2,
            number_of_spikes,
            dim_x,
            dim_y,
            forgetting_offset,
            forgetting_offset_local,
            number_of_pattern,
            gpu_tuning_factor);

    }
    return true;
};


bool HDynamicCNNManyIP::update(
    float* h_init_ptr,
    float* h_pointer,
    size_t h_dim_c0,
    size_t h_dim_c1,
    size_t h_dim_c2,
    size_t h_dim,
    float* epsilon_xy_pointer,
    size_t epsilon_xy_dim_c0,
    size_t epsilon_xy_dim_c1,
    float* epsilon_t_pointer,
    float* weights_pointer,
    size_t weights_dim_c0,
    int64_t* input_pointer,
    size_t input_dim_c0,
    size_t input_dim_c1,
    size_t input_dim_c2,
    size_t number_of_spikes,
    size_t dim_x,
    size_t dim_y,
    float forgetting_offset,
    float forgetting_offset_local,
    size_t pattern_id)
{

    float* h_ptr;
    float* epsilon_xy_ptr;
    int64_t* input_ptr;

    size_t counter_x;
    size_t counter_y;

    for (counter_x = 0; counter_x < dim_x; counter_x++)
    {
        for (counter_y = 0; counter_y < dim_y; counter_y++)
        {
            epsilon_xy_ptr = epsilon_xy_pointer +
                counter_x * epsilon_xy_dim_c1 + counter_y;

            h_ptr = h_pointer +
                pattern_id * h_dim_c0 + counter_x * h_dim_c2 + counter_y;

            input_ptr = input_pointer +
                pattern_id * input_dim_c0 + counter_x * input_dim_c2 + counter_y;

            update_one_ip(
                h_init_ptr,
                h_ptr,
                h_dim_c1,
                h_dim,
                weights_pointer,
                weights_dim_c0,
                input_ptr,
                input_dim_c1,
                epsilon_xy_ptr,
                epsilon_xy_dim_c0,
                epsilon_t_pointer,
                number_of_spikes,
                forgetting_offset,
                forgetting_offset_local);

        }
    }

    return true;
};

void HDynamicCNNManyIP::update_one_ip(
    float* h_init_ptr,
    float* h_pointer,
    size_t h_dim_c1,
    size_t h_dim,
    float* weights_pointer,
    size_t weights_dim_c0,
    int64_t* input_pointer,
    size_t input_dim_c1,
    float* epsilon_xy_pointer,
    size_t epsilon_xy_dim_c0,
    float* epsilon_t_pointer,
    size_t number_of_spikes,
    float forgetting_offset,
    float forgetting_offset_local)
{

    float* h_temp = new float[h_dim];
    float* h_subsegment = new float[h_dim];

    memcpy(h_subsegment, h_init_ptr, sizeof(float) * h_dim);

    size_t counter_spike;
    size_t counter;

    float h_temp_sum;
    float temp_value;

    float epsilon_subsegment;
    float epsilon_scale = 1.0;

    int64_t* spike;
    float* w_ptr;

    for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++)
    {
        if (epsilon_scale > 1E10)
        {
            temp_value = 1.0 / epsilon_scale;

#pragma omp simd
            for (counter = 0; counter < h_dim; counter++)
            {
                h_subsegment[counter] *= temp_value;
            }

            epsilon_scale = 1.0;
        }

        spike = input_pointer + counter_spike * input_dim_c1;

        if (*spike >= 0)
        {
            epsilon_subsegment =
                epsilon_xy_pointer[*spike * epsilon_xy_dim_c0] * epsilon_t_pointer[counter_spike];

            w_ptr = weights_pointer + *spike * weights_dim_c0;

            memcpy(h_temp, h_subsegment, sizeof(float) * h_dim);

#pragma omp simd
            for (counter = 0; counter < h_dim; counter++)
            {
                h_temp[counter] *= w_ptr[counter];
            }

            h_temp_sum = 0.0;
#pragma omp simd reduction(+ : h_temp_sum)
            for (counter = 0; counter < h_dim; counter++)
            {
                h_temp_sum += h_temp[counter];
            }

            if (h_temp_sum > 1E-10)
            {
                temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum;

#pragma omp simd
                for (counter = 0; counter < h_dim; counter++)
                {
                    h_temp[counter] *= temp_value;
                }

#pragma omp simd
                for (counter = 0; counter < h_dim; counter++)
                {
                    h_subsegment[counter] += h_temp[counter];
                }

                if (forgetting_offset_local > 0.0)
                {
                    temp_value =
                        epsilon_scale * epsilon_subsegment * forgetting_offset_local;

#pragma omp simd
                    for (counter = 0; counter < h_dim; counter++)
                    {
                        h_subsegment[counter] += temp_value;
                    }

                    epsilon_scale *=
                        1.0 + epsilon_subsegment * (1.0 + forgetting_offset);
                }
                else
                {
                    epsilon_scale *= 1.0 + epsilon_subsegment * 1.0;
                }
            }
        }
    }

    temp_value = 1.0 / epsilon_scale;
#pragma omp simd
    for (counter = 0; counter < h_dim; counter++)
    {
        h_pointer[counter * h_dim_c1] =
            h_subsegment[counter] * temp_value;
    }

    delete[] h_temp;
    delete[] h_subsegment;

    return;
};

__device__ void gpu_update_one_ip(
    float* __restrict__ h_init_ptr,
    float* __restrict__ h_pointer,
    size_t h_dim_c1,
    size_t h_dim,
    float* __restrict__ weights_pointer,
    size_t weights_dim_c0,
    int64_t* input_pointer,
    size_t input_dim_c1,
    float* __restrict__ epsilon_xy_pointer,
    size_t epsilon_xy_dim_c0,
    float* __restrict__ epsilon_t_pointer,
    size_t number_of_spikes,
    float forgetting_offset,
    float forgetting_offset_local,
    float* __restrict__ h_temp,
    float* __restrict__ h_subsegment
)
{

    size_t counter_spike;
    size_t counter;

    float h_temp_sum;
    float temp_value;

    float epsilon_subsegment;
    float epsilon_scale = 1.0;

    int64_t* spike;
    float* w_ptr;

    // float* h_temp = new float[h_dim];
    // float* h_subsegment = new float[h_dim];

    // Initialize the sub-segment
    for (counter = 0; counter < h_dim; counter++)
    {
        h_subsegment[counter] = h_init_ptr[counter];
    }

    for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++)
    {
        if (epsilon_scale > 1E10)
        {
            temp_value = 1.0 / epsilon_scale;

            for (counter = 0; counter < h_dim; counter++)
            {
                h_subsegment[counter] *= temp_value;
            }

            epsilon_scale = 1.0;
        }

        spike = input_pointer + counter_spike * input_dim_c1;

        if (*spike >= 0)
        {
            epsilon_subsegment =
                epsilon_xy_pointer[*spike * epsilon_xy_dim_c0] * epsilon_t_pointer[counter_spike];

            w_ptr = weights_pointer + *spike * weights_dim_c0;

            for (counter = 0; counter < h_dim; counter++)
            {
                h_temp[counter] = h_subsegment[counter] * w_ptr[counter];
            }

            h_temp_sum = 0.0;

            for (counter = 0; counter < h_dim; counter++)
            {
                h_temp_sum += h_temp[counter];
            }

            if (h_temp_sum > 1E-10)
            {
                temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum;

                for (counter = 0; counter < h_dim; counter++)
                {
                    h_temp[counter] *= temp_value;
                }

                for (counter = 0; counter < h_dim; counter++)
                {
                    h_subsegment[counter] += h_temp[counter];
                }

                if (forgetting_offset_local > 0.0)
                {
                    temp_value =
                        epsilon_scale * epsilon_subsegment * forgetting_offset_local;

                    for (counter = 0; counter < h_dim; counter++)
                    {
                        h_subsegment[counter] += temp_value;
                    }

                    epsilon_scale *=
                        1.0 + epsilon_subsegment * (1.0 + forgetting_offset);
                }
                else
                {
                    epsilon_scale *= 1.0 + epsilon_subsegment * 1.0;
                }
            }
        }
    }

    temp_value = 1.0 / epsilon_scale;

    for (counter = 0; counter < h_dim; counter++)
    {
        h_pointer[counter * h_dim_c1] =
            h_subsegment[counter] * temp_value;
    }

    // delete[] h_temp;
    // delete[] h_subsegment;

    return;
};

__global__ void kernel_spike_generation(
    float* __restrict__ h_init_ptr,
    float* __restrict__ h_pointer,
    size_t h_dim_c0,
    size_t h_dim_c1,
    size_t h_dim_c2,
    size_t h_dim,
    float* __restrict__ weights_pointer,
    size_t weights_dim_c0,
    int64_t* __restrict__ input_pointer,
    size_t input_dim_c0,
    size_t input_dim_c1,
    size_t input_dim_c2,
    float* __restrict__ epsilon_xy_pointer,
    size_t epsilon_xy_dim_c0,
    size_t epsilon_xy_dim_c1,
    float* __restrict__ epsilon_t_pointer,
    size_t number_of_spikes,
    float forgetting_offset,
    float forgetting_offset_local,
    size_t dim_x,
    size_t dim_y,
    size_t dim_xy,
    size_t max_threadable_tasks,
    float* __restrict__ temp_memory_a,
    float* __restrict__ temp_memory_b
)
{

    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx < max_threadable_tasks)
    {
        float* h_ptr;
        float* epsilon_xy_ptr;
        int64_t* input_ptr;

        float* temp_memory_ptr_a = temp_memory_a + idx * h_dim;
        float* temp_memory_ptr_b = temp_memory_b + idx * h_dim;

        // int pattern_id = idx;
        int pattern_id = idx / dim_xy;
        int position_xy = idx - (pattern_id * dim_xy);

        // size_t position_x = blockIdx.y;
        // size_t position_y = blockIdx.z;
        size_t position_x = position_xy / dim_y;
        size_t position_y = position_xy - (position_x * dim_y);

        epsilon_xy_ptr = epsilon_xy_pointer +
            position_x * epsilon_xy_dim_c1 + position_y;

        h_ptr = h_pointer +
            pattern_id * h_dim_c0 + position_x * h_dim_c2 + position_y;

        input_ptr = input_pointer +
            pattern_id * input_dim_c0 + position_x * input_dim_c2 + position_y;

        gpu_update_one_ip(
            h_init_ptr,
            h_ptr,
            h_dim_c1,
            h_dim,
            weights_pointer,
            weights_dim_c0,
            input_ptr,
            input_dim_c1,
            epsilon_xy_ptr,
            epsilon_xy_dim_c0,
            epsilon_t_pointer,
            number_of_spikes,
            forgetting_offset,
            forgetting_offset_local,
            temp_memory_ptr_a,
            temp_memory_ptr_b
        );

    }

};

// Let's face it... We need a better way to parallelize it...
bool HDynamicCNNManyIP::gpu_update(
    float* h_init_ptr,
    float* h_pointer,
    size_t h_dim_c0,
    size_t h_dim_c1,
    size_t h_dim_c2,
    size_t h_dim,
    float* epsilon_xy_pointer,
    size_t epsilon_xy_dim_c0,
    size_t epsilon_xy_dim_c1,
    float* epsilon_t_pointer,
    float* weights_pointer,
    size_t weights_dim_c0,
    int64_t* input_pointer,
    size_t input_dim_c0,
    size_t input_dim_c1,
    size_t input_dim_c2,
    size_t number_of_spikes,
    size_t dim_x,
    size_t dim_y,
    float forgetting_offset,
    float forgetting_offset_local,
    size_t number_of_pattern,
    size_t gpu_tuning_factor)
{

    cudaError_t status;
    assert((dim_x < 65535));
    assert((dim_y < 65535));

    // // //////////////////////////////////////
    // // Get infos about the device
    // // //////////////////////////////////////

    // int device;
    // cudaDeviceProp prop;

    // status = cudaGetDevice(&device);
    // assert((status == cudaSuccess));
    // // std::cout << "Device ID: " << device << std::endl;

    // status = cudaGetDeviceProperties(&prop, device);
    // assert((status == cudaSuccess));
    // // std::cout << "Device name: " << prop.name << std::endl;

    // int _cuda_heap_size_in_mb = 16;
    // status = cudaDeviceSetLimit(cudaLimitMallocHeapSize, _cuda_heap_size_in_mb * (1 << 20));
    // assert((status == cudaSuccess));

    // size_t pValue;
    // cudaDeviceGetLimit(&pValue, cudaLimitMallocHeapSize);
    // std::cout << pValue << " " << (pValue/(2*4*h_dim)) << std::endl;
    // exit(1);


    // //////////////////////////////////////
    // Calculate the distribution on the GPU
    // //////////////////////////////////////

    int min_grid_size;
    int block_size;
    int grid_size;

    size_t dynamic_s_mem_size = 0;
    size_t max_threadable_tasks = number_of_pattern * dim_x * dim_y;

    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=blocksize#occupancy-calculator
    status = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
        (void*)kernel_spike_generation,
        dynamic_s_mem_size, max_threadable_tasks);
    assert((status == cudaSuccess));

    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
    // Maximum dimensionality of grid of thread blocks: 3
    // Maximum x-dimension of a grid of thread blocks: (2^31)-1
    // Maximum y- or z-dimension of a grid of thread blocks: 65535

    // Reduce the automatic block size with our guess
    if ((gpu_tuning_factor > 0) && (gpu_tuning_factor < block_size))
    {
        block_size = int(gpu_tuning_factor);
    }
    // Round up according to array size
    // (I will separate x and y into other grid dimensions soon)
    // grid_size = (number_of_pattern + block_size - 1) / block_size;
    grid_size = (max_threadable_tasks + block_size - 1) / block_size;

    // std::cout << min_grid_size << std::endl;
    // std::cout << grid_size << std::endl;
    // std::cout << block_size << std::endl;
    // std::cout << max_threadable_tasks << std::endl;

    //dim3 grid(grid_size, dim_x, dim_y);

    float* temp_memory_a = nullptr;
    status = cudaMalloc((void**)&temp_memory_a, h_dim * max_threadable_tasks * sizeof(float));
    assert((status == cudaSuccess));

    float* temp_memory_b = nullptr;
    status = cudaMalloc((void**)&temp_memory_b, h_dim * max_threadable_tasks * sizeof(float));
    assert((status == cudaSuccess));


    //kernel_spike_generation<<<grid, block_size >>>(
    kernel_spike_generation<<<grid_size, block_size >>>(
        h_init_ptr,
        h_pointer,
        h_dim_c0,
        h_dim_c1,
        h_dim_c2,
        h_dim,
        weights_pointer,
        weights_dim_c0,
        input_pointer,
        input_dim_c0,
        input_dim_c1,
        input_dim_c2,
        epsilon_xy_pointer,
        epsilon_xy_dim_c0,
        epsilon_xy_dim_c1,
        epsilon_t_pointer,
        number_of_spikes,
        forgetting_offset,
        forgetting_offset_local,
        dim_x,
        dim_y,
        (dim_x * dim_y),
        //number_of_pattern
        max_threadable_tasks,
        temp_memory_a,
        temp_memory_b
    );

    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    status = cudaFree(temp_memory_a);
    assert((status == cudaSuccess));

    status = cudaFree(temp_memory_b);
    assert((status == cudaSuccess));


    return true;
};
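
For readability, here is a numpy transcription of the math in update_one_ip for a single input position, under the reading that epsilon_scale merely defers the per-step division (the final division by epsilon_scale makes the two forms equivalent). A sketch for understanding, not the repository's API:

import numpy as np

def h_dynamics(h_init, weights, spikes, eps_xy, eps_t, forgetting_offset=0.0):
    # h_init: (h_dim,), weights: (n_inputs, h_dim), spikes: spike indices.
    h = h_init.copy()
    off = forgetting_offset / h.shape[0]  # forgetting_offset_local
    for t, s in enumerate(spikes):
        if s < 0:  # negative entries mark unused spike slots
            continue
        eps = eps_xy[s] * eps_t[t]
        drive = h * weights[s]
        total = drive.sum()
        if total <= 1e-10:  # spike carries no evidence; skip it
            continue
        if off > 0.0:
            h = (h + eps * drive / total + eps * off) / (
                1.0 + eps * (1.0 + forgetting_offset)
            )
        else:
            h = (h + eps * drive / total) / (1.0 + eps)
    return h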

111  network/CPP_Cuda/HDynamicCNNManyIP.h  Normal file
@@ -0,0 +1,111 @@
#ifndef SRC_HDYNAMICCNNMANYIP_H_
#define SRC_HDYNAMICCNNMANYIP_H_

#include <unistd.h>

#include <cctype>
#include <iostream>

class HDynamicCNNManyIP
{
public:
    HDynamicCNNManyIP();
    ~HDynamicCNNManyIP();

    bool update_entrypoint(
        int64_t h_pointer_addr,
        int64_t h_dim_0,
        int64_t h_dim_1,
        int64_t h_dim_2,
        int64_t h_dim_3,
        int64_t epsilon_xy_pointer_addr,
        int64_t epsilon_xy_dim_0,
        int64_t epsilon_xy_dim_1,
        int64_t epsilon_xy_dim_2,
        int64_t epsilon_t_pointer_addr,
        int64_t epsilon_t_dim_0,
        int64_t weights_pointer_addr,
        int64_t weights_dim_0,
        int64_t weights_dim_1,
        int64_t input_pointer_addr,
        int64_t input_dim_0,
        int64_t input_dim_1,
        int64_t input_dim_2,
        int64_t input_dim_3,
        int64_t init_vector_pointer_addr,
        int64_t init_vector_dim_0,
        int64_t number_of_processes,
        float forgetting_offset,
        int64_t gpu_tuning_factor);

private:

    bool update(
        float* h_init_ptr,
        float* h_pointer,
        size_t h_dim_c0,
        size_t h_dim_c1,
        size_t h_dim_c2,
        size_t h_dim,
        float* epsilon_xy_pointer,
        size_t epsilon_xy_dim_c0,
        size_t epsilon_xy_dim_c1,
        float* epsilon_t_pointer,
        float* weights_pointer,
        size_t weights_dim_c0,
        int64_t* input_pointer,
        size_t input_dim_c0,
        size_t input_dim_c1,
        size_t input_dim_c2,
        size_t number_of_spikes,
        size_t dim_x,
        size_t dim_y,
        float forgetting_offset,
        float forgetting_offset_local,
        size_t pattern_id);

    void update_one_ip(
        float* h_init_ptr,
        float* h_pointer,
        size_t h_dim_c1,
        size_t h_dim,
        float* weights_pointer,
        size_t weights_dim_c0,
        int64_t* input_pointer,
        size_t input_dim_c1,
        float* epsilon_xy_pointer,
        size_t epsilon_xy_dim_c0,
        float* epsilon_t_pointer,
        size_t number_of_spikes,
        float forgetting_offset,
        float forgetting_offset_local);

    bool gpu_update(
        float* h_init_ptr,
        float* h_pointer,
        size_t h_dim_c0,
        size_t h_dim_c1,
        size_t h_dim_c2,
        size_t h_dim,
        float* epsilon_xy_pointer,
        size_t epsilon_xy_dim_c0,
        size_t epsilon_xy_dim_c1,
        float* epsilon_t_pointer,
        float* weights_pointer,
        size_t weights_dim_c0,
        int64_t* input_pointer,
        size_t input_dim_c0,
        size_t input_dim_c1,
        size_t input_dim_c2,
        size_t number_of_spikes,
        size_t dim_x,
        size_t dim_y,
        float forgetting_offset,
        float forgetting_offset_local,
        size_t number_of_pattern,
        size_t gpu_tuning_factor);


};

#endif /* SRC_HDYNAMICCNNMANYIP_H_ */
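
The pybind11 wrapper (PyHDynamicCNNManyIP.cpp, built by the Makefile below) is not part of this upload, so the following call sketch is speculative: it assumes the module exposes the class one-to-one and that the int64 arguments are raw data_ptr() addresses, as the _addr names in the header suggest. Tensor shapes are illustrative only:

import torch
from PyHDynamicCNNManyIP import HDynamicCNNManyIP  # assumed binding name

h = torch.full((24, 32, 24, 24), 1.0 / 32)         # [pattern, h, x, y]
eps_xy = torch.ones(25, 24, 24)                    # [input feature, x, y]
eps_t = torch.ones(1600)                           # [spike]
weights = torch.rand(25, 32)                       # [input feature, h]
spikes = torch.randint(0, 25, (24, 1600, 24, 24))  # int64 spike indices
h_init = torch.full((32,), 1.0 / 32)

HDynamicCNNManyIP().update_entrypoint(
    h.data_ptr(), *h.shape,
    eps_xy.data_ptr(), *eps_xy.shape,
    eps_t.data_ptr(), eps_t.shape[0],
    weights.data_ptr(), *weights.shape,
    spikes.data_ptr(), *spikes.shape,
    h_init.data_ptr(), h_init.shape[0],
    4,    # number_of_processes > 0 selects the OpenMP CPU path
    0.0,  # forgetting_offset
    0,    # gpu_tuning_factor
)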

67  network/CPP_Cuda/Makefile  Normal file
@@ -0,0 +1,67 @@
# Change to your python bin directory (tested with Python 3.10.4)
PYBIN=~/P3.10GPU/bin/
NVCC=/usr/local/cuda-12/bin/nvcc -allow-unsupported-compiler
CC=/usr/lib64/ccache/clang++

PYBIND11INCLUDE=`$(PYBIN)python3 -m pybind11 --includes`
PARAMETERS_O= -O3 -std=c++14 $(PYBIND11INCLUDE) -ccbin=$(CC) \
	-Xcompiler "-fPIC -Wall -fopenmp=libomp"

PARAMETERS_Linker=-Xcompiler "-shared -lm -lomp -lstdc++ -Wall"

PYPOSTFIX=`$(PYBIN)python3-config --extension-suffix`


all: PyHDynamicCNNManyIP \
	PySpikeGeneration2DManyIP \
	PyMultiApp

#######################

HDynamicCNNManyIP.o: HDynamicCNNManyIP.h HDynamicCNNManyIP.cu
	$(NVCC) $(PARAMETERS_O) -c HDynamicCNNManyIP.cu -o HDynamicCNNManyIP.o

PyHDynamicCNNManyIP.o: HDynamicCNNManyIP.h PyHDynamicCNNManyIP.cpp
	$(NVCC) $(PARAMETERS_O) -c PyHDynamicCNNManyIP.cpp -o PyHDynamicCNNManyIP.o

PyHDynamicCNNManyIP: HDynamicCNNManyIP.o PyHDynamicCNNManyIP.o
	$(NVCC) $(PARAMETERS_Linker) -o PyHDynamicCNNManyIP HDynamicCNNManyIP.o PyHDynamicCNNManyIP.o
	cp PyHDynamicCNNManyIP PyHDynamicCNNManyIP$(PYPOSTFIX)
	$(PYBIN)python3 pybind11_auto_pyi.py

#######################

SpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h SpikeGeneration2DManyIP.cu
	$(NVCC) $(PARAMETERS_O) -c SpikeGeneration2DManyIP.cu -o SpikeGeneration2DManyIP.o

PySpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h PySpikeGeneration2DManyIP.cpp
	$(NVCC) $(PARAMETERS_O) -c PySpikeGeneration2DManyIP.cpp -o PySpikeGeneration2DManyIP.o

PySpikeGeneration2DManyIP: SpikeGeneration2DManyIP.o PySpikeGeneration2DManyIP.o
	$(NVCC) $(PARAMETERS_Linker) -o PySpikeGeneration2DManyIP SpikeGeneration2DManyIP.o PySpikeGeneration2DManyIP.o
	cp PySpikeGeneration2DManyIP PySpikeGeneration2DManyIP$(PYPOSTFIX)
	$(PYBIN)python3 pybind11_auto_pyi.py


#######################

MultiApp.o: MultiApp.h MultiApp.cu approximation_multiplication_function.cpp \
	gpu_approximation_multiplication_function.cu error_term.cpp gpu_error_term.cu
	$(NVCC) $(PARAMETERS_O) -c MultiApp.cu -o MultiApp.o

PyMultiApp.o: MultiApp.h PyMultiApp.cpp
	$(NVCC) $(PARAMETERS_O) -c PyMultiApp.cpp -o PyMultiApp.o

PyMultiApp: MultiApp.o PyMultiApp.o
	$(NVCC) $(PARAMETERS_Linker) -o PyMultiApp MultiApp.o PyMultiApp.o
	cp PyMultiApp PyMultiApp$(PYPOSTFIX)
	$(PYBIN)python3 pybind11_auto_pyi.py

#######################
clean:
	rm -f PyHDynamicCNNManyIP
	rm -f PySpikeGeneration2DManyIP
	rm -f PyMultiApp
	rm -f *.o
	rm -f *.so
313
network/CPP_Cuda/MultiApp.cu
Normal file
@@ -0,0 +1,313 @@
#include "MultiApp.h"

#include <omp.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <vector>

#include "approximation_multiplication_function.cpp"
#include "gpu_approximation_multiplication_function.cu"

MultiApp::MultiApp()
{

};

MultiApp::~MultiApp()
{

};

bool MultiApp::update(float* np_input_pointer,
                      float* np_weight_pointer,
                      float* np_output_pointer, int64_t pattern_dim,
                      int64_t feature_dim, int64_t x_dim, int64_t y_dim,
                      int64_t input_channel_dim, int64_t id_pattern,
                      bool approximation_enable, int64_t number_of_trunc_bits,
                      int64_t number_of_frac_bits)
{

    assert((id_pattern >= 0));
    assert((id_pattern < pattern_dim));

    float* np_input_pointer_pattern;
    float* np_output_pointer_pattern;

    float* input_ptr;
    float* output_ptr;
    float* w_ptr;

    uint64_t pattern_size = input_channel_dim;

    std::vector<float> ap_h_vector;
    ap_h_vector.resize(pattern_size);
    float* ap_h_ptr = ap_h_vector.data();

    std::vector<uint32_t> ap_x_vector;
    ap_x_vector.resize(pattern_size);
    uint32_t* ap_x_ptr = ap_x_vector.data();

    std::vector<uint32_t> ap_y_vector;
    ap_y_vector.resize(pattern_size);
    uint32_t* ap_y_ptr = ap_y_vector.data();

    std::vector<uint32_t> ap_x_exponent_vector;
    ap_x_exponent_vector.resize(pattern_size);
    uint32_t* ap_x_exponent_ptr = ap_x_exponent_vector.data();

    std::vector<uint32_t> ap_y_exponent_vector;
    ap_y_exponent_vector.resize(pattern_size);
    uint32_t* ap_y_exponent_ptr = ap_y_exponent_vector.data();

    std::vector<uint32_t> ap_h_exponent_vector;
    ap_h_exponent_vector.resize(pattern_size);
    uint32_t* ap_h_exponent_ptr = ap_h_exponent_vector.data();

    std::vector<uint64_t> ap_res_vector;
    ap_res_vector.resize(pattern_size);
    uint64_t* ap_res_ptr = ap_res_vector.data();

    uint32_t ap_mask = static_cast<uint64_t>(pow(2, number_of_trunc_bits)) - 1;

    std::vector<uint32_t> sign_temp_vector;
    sign_temp_vector.resize(pattern_size);
    uint32_t* sign_temp_ptr = sign_temp_vector.data();

    uint64_t input_pattern_size = input_channel_dim * x_dim * y_dim;
    uint64_t output_pattern_size = feature_dim * x_dim * y_dim;

    np_input_pointer_pattern = np_input_pointer + id_pattern * input_pattern_size;
    np_output_pointer_pattern =
        np_output_pointer + id_pattern * output_pattern_size;

    uint64_t counter;

    uint64_t counter_x;
    uint64_t counter_y;
    uint64_t counter_feature;
    uint64_t pos_xy;
    uint64_t pos_xy_if;

    float temp_sum;

    uint64_t pattern_c_2 = x_dim * y_dim;

    for (counter_x = 0; counter_x < x_dim; counter_x++)
    {
        for (counter_y = 0; counter_y < y_dim; counter_y++)
        {
            pos_xy = counter_y + counter_x * y_dim;
            for (counter_feature = 0; counter_feature < feature_dim;
                 counter_feature++)
            {
                pos_xy_if = counter_feature * pattern_c_2 + pos_xy;

                input_ptr = np_input_pointer_pattern + pos_xy;
                output_ptr = np_output_pointer_pattern + pos_xy_if;
                w_ptr = np_weight_pointer + counter_feature * input_channel_dim;

#pragma omp simd
                for (counter = 0; counter < pattern_size; counter++)
                {
                    ap_h_ptr[counter] = input_ptr[counter * pattern_c_2];
                }

                approximation_multiplication_function(
                    ap_h_ptr, w_ptr, pattern_size, number_of_trunc_bits,
                    number_of_frac_bits, ap_x_ptr, ap_y_ptr, ap_x_exponent_ptr,
                    ap_y_exponent_ptr, ap_h_exponent_ptr, ap_mask, ap_res_ptr,
                    sign_temp_ptr, approximation_enable);

                temp_sum = 0.0;
#pragma omp simd reduction(+ : temp_sum)
                for (counter = 0; counter < pattern_size; counter++)
                {
                    temp_sum += ap_h_ptr[counter];
                }

                output_ptr[0] = temp_sum;
            }
        }
    }

    return true;
};

bool MultiApp::update_with_init_vector_multi_pattern(
    int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr,
    int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim,
    int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
    int64_t number_of_processes, bool approximation_enable,
    int64_t number_of_trunc_bits, int64_t number_of_frac)
{

    int64_t number_of_pattern = pattern_dim;
    int64_t pattern_id;

    float* np_input_pointer = (float*)np_input_pointer_addr;
    float* np_weight_pointer = (float*)np_weight_pointer_addr;
    float* np_output_pointer = (float*)np_output_pointer_addr;

    assert((np_input_pointer != nullptr));
    assert((np_output_pointer != nullptr));
    assert((np_weight_pointer != nullptr));

    assert((pattern_dim > 0));
    assert((feature_dim > 0));
    assert((x_dim > 0));
    assert((y_dim > 0));
    assert((input_channel_dim > 0));

    if (number_of_processes > 0)
    {
        omp_set_num_threads(number_of_processes);
        // For debugging: only one thread
        // omp_set_num_threads(1);

#pragma omp parallel for
        for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
        {
            update(np_input_pointer, np_weight_pointer,
                   np_output_pointer, pattern_dim, feature_dim, x_dim, y_dim,
                   input_channel_dim, pattern_id, approximation_enable,
                   number_of_trunc_bits, number_of_frac);
        }
    }
    else
    {
        update_gpu(np_input_pointer, np_weight_pointer,
                   np_output_pointer, pattern_dim, feature_dim, x_dim, y_dim,
                   input_channel_dim, approximation_enable,
                   number_of_trunc_bits, number_of_frac);
    }
    return true;
};

__global__ void kernel_approx_multiplication(float* __restrict__ input_pointer,
                                             float* __restrict__ weight_pointer,
                                             float* __restrict__ output_pointer, uint64_t pattern_dim,
                                             uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
                                             uint64_t input_channel_dim, size_t max_threadable_tasks,
                                             uint64_t input_index_scale, uint64_t number_of_frac_bits,
                                             bool approximation_enable, uint64_t number_of_trunc_bits,
                                             uint32_t ap_mask)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx < max_threadable_tasks)
    {
        int pattern_id = idx / feature_dim;
        int feature_id = idx - (pattern_id * feature_dim);
        int x_id = blockIdx.y;
        int y_id = blockIdx.z;

        float* weight_pointer_sub = weight_pointer + feature_id * input_channel_dim;
        float* input_pointer_sub = input_pointer + pattern_id * input_channel_dim * x_dim * y_dim + x_id * y_dim + y_id;
        float* output_pointer_sub = output_pointer +
                                    pattern_id * feature_dim * x_dim * y_dim +
                                    feature_id * x_dim * y_dim + x_id * y_dim + y_id;
        *output_pointer_sub = 0.0;

        size_t counter;
        for (counter = 0; counter < input_channel_dim; counter++)
        {
            *output_pointer_sub += gpu_approximation_multiplication_function(
                weight_pointer_sub[counter],
                input_pointer_sub[counter * input_index_scale],
                number_of_frac_bits, approximation_enable,
                number_of_trunc_bits, ap_mask);
        }
    }
};

bool MultiApp::update_gpu(float* np_input_pointer,
                          float* np_weight_pointer,
                          float* np_output_pointer, uint64_t pattern_dim,
                          uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
                          uint64_t input_channel_dim,
                          bool approximation_enable, uint64_t number_of_trunc_bits,
                          uint64_t number_of_frac_bits)
{

    uint32_t ap_mask = static_cast<uint64_t>(pow(2, number_of_trunc_bits)) - 1;
    // std::cout << approximation_enable << std::endl;
    // std::cout << number_of_trunc_bits << std::endl;
    // std::cout << number_of_frac_bits << std::endl;

    cudaError_t status;
    assert((x_dim < 65535));
    assert((y_dim < 65535));

    // //////////////////////////////////////
    // Get infos about the device
    // //////////////////////////////////////

    int device;
    cudaDeviceProp prop;

    status = cudaGetDevice(&device);
    assert((status == cudaSuccess));
    // std::cout << "Device ID: " << device << std::endl;

    status = cudaGetDeviceProperties(&prop, device);
    assert((status == cudaSuccess));
    // std::cout << "Device name: " << prop.name << std::endl;

    // //////////////////////////////////////
    // Calculate the distribution on the GPU
    // //////////////////////////////////////

    int min_grid_size;
    int block_size;
    int grid_size;

    size_t dynamic_s_mem_size = 0;
    size_t max_threadable_tasks = pattern_dim * feature_dim * x_dim * y_dim;

    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=blocksize#occupancy-calculator
    status = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
                                                (void*)kernel_approx_multiplication,
                                                dynamic_s_mem_size, max_threadable_tasks);
    assert((status == cudaSuccess));

    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
    // Maximum dimensionality of grid of thread blocks: 3
    // Maximum x-dimension of a grid of thread blocks: (2^31)-1
    // Maximum y- or z-dimension of a grid of thread blocks: 65535

    // Round up according to array size
    grid_size = ((pattern_dim * feature_dim) + block_size - 1) / block_size;

    // std::cout << min_grid_size << std::endl;
    // std::cout << grid_size << std::endl;
    // std::cout << block_size << std::endl;
    // std::cout << max_threadable_tasks << std::endl;

    dim3 grid(grid_size, x_dim, y_dim);

    kernel_approx_multiplication<<<grid, block_size>>>(np_input_pointer,
                                                       np_weight_pointer,
                                                       np_output_pointer,
                                                       pattern_dim,
                                                       feature_dim,
                                                       x_dim,
                                                       y_dim,
                                                       input_channel_dim,
                                                       (pattern_dim * feature_dim),
                                                       (x_dim * y_dim),
                                                       number_of_frac_bits,
                                                       approximation_enable,
                                                       number_of_trunc_bits,
                                                       ap_mask);

    cudaDeviceSynchronize();
    return true;
};
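For orientation, here is a minimal sketch of how this entry point is presumably driven from Python once the Makefile above has produced the PyMultiApp module. The array shapes (pattern x channel x x x y input, feature x channel weights, pattern x feature x x x y output) are inferred from the index arithmetic in update(), not documented in this commit, and all parameter values below are made-up examples.

import numpy as np
from PyMultiApp import MultiApp

pattern_dim, feature_dim, x_dim, y_dim, input_channel_dim = 10, 32, 28, 28, 5

# Contiguous float32 buffers are required: the C++ side casts the raw int64
# addresses straight to float* and assumes this memory layout.
input_np = np.random.rand(pattern_dim, input_channel_dim, x_dim, y_dim).astype(np.float32)
weights_np = np.random.rand(feature_dim, input_channel_dim).astype(np.float32)
output_np = np.zeros((pattern_dim, feature_dim, x_dim, y_dim), dtype=np.float32)

app = MultiApp()
ok = app.update_with_init_vector_multi_pattern(
    input_np.__array_interface__["data"][0],
    weights_np.__array_interface__["data"][0],
    output_np.__array_interface__["data"][0],
    pattern_dim, feature_dim, x_dim, y_dim, input_channel_dim,
    4,     # number_of_processes > 0: OpenMP path; <= 0: CUDA path
    True,  # approximation_enable
    8,     # number_of_trunc_bits
    16)    # number_of_frac_bits
assert ok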
39
network/CPP_Cuda/MultiApp.h
Normal file
@@ -0,0 +1,39 @@
#ifndef SRC_MultiApp_H_
#define SRC_MultiApp_H_

#include <unistd.h>

#include <cctype>
#include <iostream>

class MultiApp
{
public:
    MultiApp();
    ~MultiApp();

    bool update(float *np_input_pointer, float *np_weight_pointer,
                float *np_output_pointer, int64_t pattern_dim,
                int64_t feature_dim, int64_t x_dim, int64_t y_dim,
                int64_t input_channel_dim, int64_t id_pattern,
                bool approximation_enable, int64_t number_of_trunc_bits,
                int64_t number_of_frac);

    bool update_gpu(float *input_pointer, float *weight_pointer,
                    float *output_pointer, uint64_t pattern_dim,
                    uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
                    uint64_t input_channel_dim,
                    bool approximation_enable, uint64_t number_of_trunc_bits,
                    uint64_t number_of_frac);

    bool update_with_init_vector_multi_pattern(
        int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr,
        int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim,
        int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
        int64_t number_of_processes, bool approximation_enable,
        int64_t number_of_trunc_bits, int64_t number_of_frac);

private:
};

#endif /* SRC_MultiApp_H_ */
14
network/CPP_Cuda/PyHDynamicCNNManyIP.cpp
Normal file
@@ -0,0 +1,14 @@
#include <pybind11/pybind11.h>

#include "HDynamicCNNManyIP.h"

namespace py = pybind11;

PYBIND11_MODULE(PyHDynamicCNNManyIP, m)
{
    m.doc() = "HDynamicCNNManyIP Module";
    py::class_<HDynamicCNNManyIP>(m, "HDynamicCNNManyIP")
        .def(py::init<>())
        .def("update",
             &HDynamicCNNManyIP::update_entrypoint);
}
18
network/CPP_Cuda/PyHDynamicCNNManyIP.pyi
Normal file
@@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#

"""HDynamicCNNManyIP Module"""
from __future__ import annotations
import PyHDynamicCNNManyIP
import typing

__all__ = [
    "HDynamicCNNManyIP"
]


class HDynamicCNNManyIP():
    def __init__(self) -> None: ...
    def update(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int, arg16: int, arg17: int, arg18: int, arg19: int, arg20: int, arg21: int, arg22: float, arg23: int) -> bool: ...
    pass
14
network/CPP_Cuda/PyMultiApp.cpp
Normal file
@@ -0,0 +1,14 @@
#include <pybind11/pybind11.h>

#include "MultiApp.h"

namespace py = pybind11;

PYBIND11_MODULE(PyMultiApp, m) {
    m.doc() = "MultiApp Module";
    py::class_<MultiApp>(m, "MultiApp")
        .def(py::init<>())
        .def("update_with_init_vector_multi_pattern",
             &MultiApp::update_with_init_vector_multi_pattern);
}
18
network/CPP_Cuda/PyMultiApp.pyi
Normal file
@@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#

"""MultiApp Module"""
from __future__ import annotations
import PyMultiApp
import typing

__all__ = [
    "MultiApp"
]


class MultiApp():
    def __init__(self) -> None: ...
    def update_with_init_vector_multi_pattern(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: bool, arg10: int, arg11: int) -> bool: ...
    pass
15
network/CPP_Cuda/PySpikeGeneration2DManyIP.cpp
Normal file
@@ -0,0 +1,15 @@
#include <pybind11/pybind11.h>

#include "SpikeGeneration2DManyIP.h"

namespace py = pybind11;

PYBIND11_MODULE(PySpikeGeneration2DManyIP, m)
{
    m.doc() = "SpikeGeneration2DManyIP Module";
    py::class_<SpikeGeneration2DManyIP>(m, "SpikeGeneration2DManyIP")
        .def(py::init<>())
        .def("spike_generation",
             &SpikeGeneration2DManyIP::spike_generation_entrypoint);
}
18
network/CPP_Cuda/PySpikeGeneration2DManyIP.pyi
Normal file
@@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#

"""SpikeGeneration2DManyIP Module"""
from __future__ import annotations
import PySpikeGeneration2DManyIP
import typing

__all__ = [
    "SpikeGeneration2DManyIP"
]


class SpikeGeneration2DManyIP():
    def __init__(self) -> None: ...
    def spike_generation(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int) -> bool: ...
    pass
390
network/CPP_Cuda/SpikeGeneration2DManyIP.cu
Normal file
@@ -0,0 +1,390 @@
#include "SpikeGeneration2DManyIP.h"

#include <omp.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <cassert>
#include <iostream>


SpikeGeneration2DManyIP::SpikeGeneration2DManyIP()
{

};

SpikeGeneration2DManyIP::~SpikeGeneration2DManyIP()
{

};

bool SpikeGeneration2DManyIP::spike_generation_entrypoint(
    int64_t input_pointer_addr, int64_t input_dim_0,
    int64_t input_dim_1, int64_t input_dim_2, int64_t input_dim_3,
    int64_t random_values_pointer_addr, int64_t random_values_dim_0,
    int64_t random_values_dim_1, int64_t random_values_dim_2,
    int64_t random_values_dim_3, int64_t output_pointer_addr,
    int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2,
    int64_t output_dim_3, int64_t number_of_cpu_processes)
{

    float* input_pointer = (float*)input_pointer_addr;
    float* random_values_pointer = (float*)random_values_pointer_addr;
    int64_t* output_pointer = (int64_t*)output_pointer_addr;

    // Input
    assert((input_pointer != nullptr));
    assert((input_dim_0 > 0));
    assert((input_dim_1 > 0));
    assert((input_dim_2 > 0));
    assert((input_dim_3 > 0));

    // Random
    assert((random_values_pointer != nullptr));
    assert((random_values_dim_0 > 0));
    assert((random_values_dim_1 > 0));
    assert((random_values_dim_2 > 0));
    assert((random_values_dim_3 > 0));

    // Output
    assert((output_pointer != nullptr));
    assert((output_dim_0 > 0));
    assert((output_dim_1 > 0));
    assert((output_dim_2 > 0));
    assert((output_dim_3 > 0));

    // Input strides
    size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
    size_t input_dim_c1 = input_dim_2 * input_dim_3;
    size_t input_dim_c2 = input_dim_3;

    // Random strides
    size_t random_values_dim_c0 =
        random_values_dim_1 * random_values_dim_2 * random_values_dim_3;
    size_t random_values_dim_c1 =
        random_values_dim_2 * random_values_dim_3;
    size_t random_values_dim_c2 = random_values_dim_3;

    // Output strides
    size_t output_dim_c0 =
        output_dim_1 * output_dim_2 * output_dim_3;
    size_t output_dim_c1 = output_dim_2 * output_dim_3;
    size_t output_dim_c2 = output_dim_3;

    size_t number_of_pattern = input_dim_0;
    size_t h_dim = input_dim_1;
    size_t spike_dim = output_dim_1;
    size_t x_dim = output_dim_2;
    size_t y_dim = output_dim_3;

    if (number_of_cpu_processes > 0)
    {
        omp_set_num_threads(number_of_cpu_processes);
        // DEBUG:
        // omp_set_num_threads(1);

        size_t pattern_id;

#pragma omp parallel for
        for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
        {
            spike_generation(
                input_pointer,
                input_dim_c0,
                input_dim_c1,
                input_dim_c2,
                random_values_pointer,
                random_values_dim_c0,
                random_values_dim_c1,
                random_values_dim_c2,
                output_pointer,
                output_dim_c0,
                output_dim_c1,
                output_dim_c2,
                x_dim,
                y_dim,
                spike_dim,
                h_dim,
                pattern_id);
        }
    }
    else
    {
        gpu_spike_generation(
            input_pointer,
            input_dim_c0,
            input_dim_c1,
            input_dim_c2,
            random_values_pointer,
            random_values_dim_c0,
            random_values_dim_c1,
            random_values_dim_c2,
            output_pointer,
            output_dim_c0,
            output_dim_c1,
            output_dim_c2,
            x_dim,
            y_dim,
            spike_dim,
            h_dim,
            number_of_pattern);
    }

    return true;
};

bool SpikeGeneration2DManyIP::spike_generation(
    float* input_pointer,
    size_t input_dim_c0,
    size_t input_dim_c1,
    size_t input_dim_c2,
    float* random_values_pointer,
    size_t random_values_dim_c0,
    size_t random_values_dim_c1,
    size_t random_values_dim_c2,
    int64_t* output_pointer,
    size_t output_dim_c0,
    size_t output_dim_c1,
    size_t output_dim_c2,
    size_t x_dim,
    size_t y_dim,
    size_t spike_dim,
    size_t h_dim,
    size_t pattern_id)
{

    size_t counter;
    size_t counter_x = 0;
    size_t counter_y = 0;

    float* p_ptr = nullptr;
    int64_t* out_ptr = nullptr;
    float* rand_ptr = nullptr;

    for (counter_x = 0; counter_x < x_dim; counter_x++)
    {
        for (counter_y = 0; counter_y < y_dim; counter_y++)
        {
            p_ptr = input_pointer + pattern_id * input_dim_c0 +
                    counter_x * input_dim_c2 + counter_y;
            // + counter * input_dim_c1

            out_ptr = output_pointer + pattern_id * output_dim_c0 +
                      counter_x * output_dim_c2 + counter_y;
            // + counter * output_dim_c1

            rand_ptr = random_values_pointer +
                       pattern_id * random_values_dim_c0 +
                       counter_x * random_values_dim_c2 + counter_y;
            // + counter * random_values_dim_c1

            for (counter = 0; counter < spike_dim; counter++)
            {
                out_ptr[counter * output_dim_c1] = lower_bound(p_ptr,
                                                               h_dim,
                                                               input_dim_c1,
                                                               rand_ptr[counter * random_values_dim_c1]);
            }
        }
    }

    return true;
};

// algorithmic idea stolen from libc++
size_t SpikeGeneration2DManyIP::lower_bound(float* data_ptr,
                                            size_t data_length,
                                            size_t data_ptr_stride,
                                            float compare_to_value)
{

    size_t start_of_range = 0;
    size_t length_of_range = data_length;

    while (length_of_range != 0)
    {
        size_t half_length = length_of_range >> 1;
        size_t actual_position = start_of_range + half_length;

        if (data_ptr[actual_position * data_ptr_stride] < compare_to_value)
        {
            start_of_range = ++actual_position;
            length_of_range -= half_length + 1;
        }
        else
            length_of_range = half_length;
    }
    return start_of_range;
};

__device__ size_t gpu_lower_bound(float* __restrict__ data_ptr,
                                  size_t data_length,
                                  size_t data_ptr_stride,
                                  float compare_to_value)
{

    size_t start_of_range = 0;
    size_t length_of_range = data_length;

    while (length_of_range != 0)
    {
        size_t half_length = length_of_range >> 1;
        size_t actual_position = start_of_range + half_length;

        if (data_ptr[actual_position * data_ptr_stride] < compare_to_value)
        {
            start_of_range = ++actual_position;
            length_of_range -= half_length + 1;
        }
        else
            length_of_range = half_length;
    }
    return start_of_range;
};

__global__ void kernel_spike_generation(
    float* __restrict__ input_pointer,
    size_t input_dim_c0,
    size_t input_dim_c1,
    size_t input_dim_c2,
    float* __restrict__ random_values_pointer,
    size_t random_values_dim_c0,
    size_t random_values_dim_c1,
    size_t random_values_dim_c2,
    int64_t* __restrict__ output_pointer,
    size_t output_dim_c0,
    size_t output_dim_c1,
    size_t output_dim_c2,
    size_t x_dim,
    size_t y_dim,
    size_t spike_dim,
    size_t h_dim,
    size_t max_threadable_tasks)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx < max_threadable_tasks)
    {

        size_t pattern_id = idx / spike_dim;
        size_t position_spike = idx - (pattern_id * spike_dim);

        size_t position_x = blockIdx.y;
        size_t position_y = blockIdx.z;

        float* p_ptr = input_pointer + pattern_id * input_dim_c0 +
                       position_x * input_dim_c2 + position_y;

        int64_t* out_ptr = output_pointer + pattern_id * output_dim_c0 +
                           position_x * output_dim_c2 + position_y
                           + position_spike * output_dim_c1;

        float* rand_ptr = random_values_pointer +
                          pattern_id * random_values_dim_c0 +
                          position_x * random_values_dim_c2 + position_y
                          + position_spike * random_values_dim_c1;

        *out_ptr = gpu_lower_bound(p_ptr,
                                   h_dim,
                                   input_dim_c1,
                                   *rand_ptr);
    }
};

bool SpikeGeneration2DManyIP::gpu_spike_generation(
    float* input_pointer,
    size_t input_dim_c0,
    size_t input_dim_c1,
    size_t input_dim_c2,
    float* random_values_pointer,
    size_t random_values_dim_c0,
    size_t random_values_dim_c1,
    size_t random_values_dim_c2,
    int64_t* output_pointer,
    size_t output_dim_c0,
    size_t output_dim_c1,
    size_t output_dim_c2,
    size_t x_dim,
    size_t y_dim,
    size_t spike_dim,
    size_t h_dim,
    size_t number_of_pattern)
{
    cudaError_t status;
    assert((x_dim < 65535));
    assert((y_dim < 65535));

    // // //////////////////////////////////////
    // // Get infos about the device
    // // //////////////////////////////////////

    // int device;
    // cudaDeviceProp prop;

    // status = cudaGetDevice(&device);
    // assert((status == cudaSuccess));
    // // std::cout << "Device ID: " << device << std::endl;

    // status = cudaGetDeviceProperties(&prop, device);
    // assert((status == cudaSuccess));
    // // std::cout << "Device name: " << prop.name << std::endl;

    // //////////////////////////////////////
    // Calculate the distribution on the GPU
    // //////////////////////////////////////

    int min_grid_size;
    int block_size;
    int grid_size;

    size_t dynamic_s_mem_size = 0;
    size_t max_threadable_tasks = number_of_pattern * spike_dim * x_dim * y_dim;

    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=blocksize#occupancy-calculator
    status = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
                                                (void*)kernel_spike_generation,
                                                dynamic_s_mem_size, max_threadable_tasks);
    assert((status == cudaSuccess));

    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
    // Maximum dimensionality of grid of thread blocks: 3
    // Maximum x-dimension of a grid of thread blocks: (2^31)-1
    // Maximum y- or z-dimension of a grid of thread blocks: 65535

    // Round up according to array size
    // (I will separate x and y into other grid dimensions soon)
    grid_size = ((number_of_pattern * spike_dim) + block_size - 1) / block_size;

    // std::cout << min_grid_size << std::endl;
    // std::cout << grid_size << std::endl;
    // std::cout << block_size << std::endl;
    // std::cout << max_threadable_tasks << std::endl;

    dim3 grid(grid_size, x_dim, y_dim);

    kernel_spike_generation<<<grid, block_size>>>(
        input_pointer,
        input_dim_c0,
        input_dim_c1,
        input_dim_c2,
        random_values_pointer,
        random_values_dim_c0,
        random_values_dim_c1,
        random_values_dim_c2,
        output_pointer,
        output_dim_c0,
        output_dim_c1,
        output_dim_c2,
        x_dim,
        y_dim,
        spike_dim,
        h_dim,
        (number_of_pattern * spike_dim));

    cudaDeviceSynchronize();

    return true;
};
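A numpy model of what one (pattern, x, y) position computes above: each of the spike_dim random values is mapped to an index via a strided binary search, which is exactly np.searchsorted with side="left". This sketch assumes the h vector along input_dim_c1 holds sorted values, e.g. a cumulative sum of non-negative activations; lower_bound() requires sorted data, but nothing in the C++ file checks it.

import numpy as np

h = np.cumsum(np.random.rand(10).astype(np.float32))  # sorted "CDF", length h_dim
rand = np.random.rand(4).astype(np.float32) * h[-1]   # one value per spike slot
spikes = np.searchsorted(h, rand, side="left")        # == lower_bound() per value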
68
network/CPP_Cuda/SpikeGeneration2DManyIP.h
Normal file
@@ -0,0 +1,68 @@
#ifndef SRC_SPIKEGENERATION2DMANYIP_H_
#define SRC_SPIKEGENERATION2DMANYIP_H_

#include <unistd.h>

#include <cctype>
#include <iostream>

class SpikeGeneration2DManyIP
{
public:
    SpikeGeneration2DManyIP();
    ~SpikeGeneration2DManyIP();

    bool spike_generation_entrypoint(
        int64_t input_pointer_addr, int64_t input_dim_0,
        int64_t input_dim_1, int64_t input_dim_2, int64_t input_dim_3,
        int64_t random_values_pointer_addr, int64_t random_values_dim_0,
        int64_t random_values_dim_1, int64_t random_values_dim_2,
        int64_t random_values_dim_3, int64_t output_pointer_addr,
        int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2,
        int64_t output_dim_3, int64_t number_of_cpu_processes);

    bool spike_generation(
        float* input_pointer,
        size_t input_dim_c0,
        size_t input_dim_c1,
        size_t input_dim_c2,
        float* random_values_pointer,
        size_t random_values_dim_c0,
        size_t random_values_dim_c1,
        size_t random_values_dim_c2,
        int64_t* output_pointer,
        size_t output_dim_c0,
        size_t output_dim_c1,
        size_t output_dim_c2,
        size_t x_dim,
        size_t y_dim,
        size_t spike_dim,
        size_t h_dim,
        size_t pattern_id);

    bool gpu_spike_generation(
        float* input_pointer,
        size_t input_dim_c0,
        size_t input_dim_c1,
        size_t input_dim_c2,
        float* random_values_pointer,
        size_t random_values_dim_c0,
        size_t random_values_dim_c1,
        size_t random_values_dim_c2,
        int64_t* output_pointer,
        size_t output_dim_c0,
        size_t output_dim_c1,
        size_t output_dim_c2,
        size_t x_dim,
        size_t y_dim,
        size_t spike_dim,
        size_t h_dim,
        size_t number_of_pattern);

private:
    size_t lower_bound(float* data_ptr, size_t data_length,
                       size_t data_ptr_stride,
                       float compare_to_value);
};

#endif /* SRC_SPIKEGENERATION2DMANYIP_H_ */
138
network/CPP_Cuda/approximation_multiplication_function.cpp
Normal file
@@ -0,0 +1,138 @@
#include <unistd.h>

#include <bitset>
#include <cassert>
#include <cctype>

#include "error_term.cpp"

// Best way to plot the bits:
// std::cout << std::bitset<32>(ap_y_ptr[1]) << "\n";

// The result needs to be written back into h_pointer (which contains h).
// Don't write to w_pointer.
void approximation_multiplication_function(
    float *h_pointer, float *w_pointer, int64_t pattern_length,
    uint64_t number_of_trunc_bits, uint64_t number_of_frac_bits,
    uint32_t *ap_x_ptr, uint32_t *ap_y_ptr, uint32_t *ap_x_exponent_ptr,
    uint32_t *ap_y_exponent_ptr, uint32_t *ap_h_exponent_ptr, uint32_t ap_mask,
    uint64_t *ap_res_ptr, uint32_t *sign_temp_ptr, bool approximation_enable) {
  uint64_t counter;

  uint32_t *w_pointer_mod = (uint32_t *)w_pointer;
  uint32_t *h_pointer_mod = (uint32_t *)h_pointer;

// Calculate the new sign
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    sign_temp_ptr[counter] = (w_pointer_mod[counter] & 0x80000000) ^
                             (h_pointer_mod[counter] & 0x80000000);
  }

// Extract the exponent
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_x_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
  }
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_y_exponent_ptr[counter] = (w_pointer_mod[counter] << 1) >> 24;
  }

  // Cast and "normalize"
  uint64_t shift_value = 32 - number_of_frac_bits;
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_x_ptr[counter] =
        ((h_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_y_ptr[counter] =
        ((w_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
  }

// Make the zero -g-r-e-a-t- correct again
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    if (h_pointer[counter] == 0) {
      ap_x_ptr[counter] = 0;
    }
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    if (w_pointer[counter] == 0) {
      ap_y_ptr[counter] = 0;
    }
  }

// res = x*y
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_res_ptr[counter] = static_cast<uint64_t>(ap_x_ptr[counter]) * static_cast<uint64_t>(ap_y_ptr[counter]);
  }

  uint32_t temp;
  if (approximation_enable == true) {
    // Go through the vector values
    for (counter = 0; counter < pattern_length; counter++) {
      temp = error_term(ap_y_ptr[counter], ap_x_ptr[counter], ap_mask,
                        number_of_trunc_bits);
      if (temp > ap_res_ptr[counter]) {
        ap_res_ptr[counter] = 0;
      } else {
        ap_res_ptr[counter] -= temp;
      }
    }
  }
// Cast from int to float
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    h_pointer[counter] = static_cast<float>(ap_res_ptr[counter]);
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_h_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
  }

// Divide by 2^number_of_frac_bits
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_h_exponent_ptr[counter] -= 2 * number_of_frac_bits;
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    temp = ap_x_exponent_ptr[counter] + ap_y_exponent_ptr[counter] +
           ap_h_exponent_ptr[counter];
    if (temp > 252) {
      ap_h_exponent_ptr[counter] = temp - 252;
    } else {
      // Here I try to catch the case that the new exponent is too small
      ap_h_exponent_ptr[counter] = 0;
    }
  }

// Remove the old exponent
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    h_pointer_mod[counter] = (h_pointer_mod[counter] << 9) >> 9;
  }

// Install the new exponent
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    h_pointer_mod[counter] += ap_h_exponent_ptr[counter] << 23;
  }

// Add the sign back
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    h_pointer_mod[counter] += sign_temp_ptr[counter];
  }

  return;
}
28
network/CPP_Cuda/error_term.cpp
Normal file
@@ -0,0 +1,28 @@
#include <unistd.h>

#include <cassert>
#include <cctype>

uint32_t error_term(uint32_t a, uint32_t b, uint32_t ap_mask,
                    uint32_t number_of_trunc_bits) {
  uint32_t error_value = 0;

  uint32_t temp_shift_a = a;
  uint32_t temp_shift_b = b & ap_mask;

  uint32_t counter_trunc;
  uint32_t temp;

  // Go through the bits
  for (counter_trunc = 0; counter_trunc < number_of_trunc_bits;
       counter_trunc++) {
    temp = temp_shift_a & 1;
    if (temp == 1) {
      error_value += temp_shift_b & ap_mask;
    }
    temp_shift_a >>= 1;
    temp_shift_b <<= 1;
  }

  return error_value;
}
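A line-for-line Python port of error_term(), useful for experimenting with the truncation correction on small operands. The 32-bit wrap of temp_shift_b is made explicit because Python integers do not overflow; everything else mirrors the C++ above.

def error_term(a: int, b: int, ap_mask: int, number_of_trunc_bits: int) -> int:
    error_value = 0
    temp_shift_a = a
    temp_shift_b = b & ap_mask
    # Go through the low bits of a; accumulate the masked partial products
    for _ in range(number_of_trunc_bits):
        if temp_shift_a & 1:
            error_value += temp_shift_b & ap_mask
        temp_shift_a >>= 1
        temp_shift_b = (temp_shift_b << 1) & 0xFFFFFFFF  # uint32 wrap
    return error_value

The callers subtract this value from the exact mantissa product a * b, clamping at zero when the correction would exceed the product.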
102
network/CPP_Cuda/gpu_approximation_multiplication_function.cu
Normal file
@@ -0,0 +1,102 @@
#include "gpu_error_term.cu"

__device__ float gpu_approximation_multiplication_function(
    float weight,
    float input,
    uint64_t number_of_frac_bits,
    bool approximation_enable,
    uint64_t number_of_trunc_bits,
    uint32_t ap_mask)
{

    float weight_copy = weight;
    float input_copy = input;

    uint32_t *weight_pointer_mod = (uint32_t *)&weight_copy;
    uint32_t *input_pointer_mod = (uint32_t *)&input_copy;

    // Calculate the new sign
    uint32_t sign_temp = (*weight_pointer_mod & 0x80000000) ^
                         (*input_pointer_mod & 0x80000000);

    // Extract the exponent
    uint32_t ap_input_exponent = (*input_pointer_mod << 1) >> 24;
    uint32_t ap_weight_exponent = (*weight_pointer_mod << 1) >> 24;

    // Cast and "normalize"
    uint64_t shift_value = 32 - number_of_frac_bits;

    uint32_t ap_input_mantissa =
        ((*input_pointer_mod << 8) | 0x80000000) >> shift_value;

    uint32_t ap_weight_mantissa =
        ((*weight_pointer_mod << 8) | 0x80000000) >> shift_value;

    // Make the zero -g-r-e-a-t- correct again
    if (input == 0)
    {
        ap_input_mantissa = 0;
    }

    if (weight == 0)
    {
        ap_weight_mantissa = 0;
    }

    // res = x*y
    uint64_t ap_result = static_cast<uint64_t>(ap_input_mantissa) * static_cast<uint64_t>(ap_weight_mantissa);

    uint32_t temp;
    // --------------------------------------------
    // Approx
    // --------------------------------------------

    if (approximation_enable == true)
    {
        // Go through the vector values
        temp = gpu_error_term(ap_weight_mantissa, ap_input_mantissa, ap_mask,
                              number_of_trunc_bits);
        if (temp > ap_result)
        {
            ap_result = 0;
        }
        else
        {
            ap_result -= temp;
        }
    }

    // Cast from int to float
    float output = static_cast<float>(ap_result);
    if (ap_result == 0)
    {
        output = 0.0;
    }
    else
    {
        uint32_t *output_pointer_mod = (uint32_t *)&output;

        uint32_t ap_output_exponent = (*output_pointer_mod << 1) >> 24;
        ap_output_exponent -= 2 * number_of_frac_bits;
        temp = ap_input_exponent + ap_weight_exponent + ap_output_exponent;
        if (temp > 252)
        {
            ap_output_exponent = temp - 252;
        }
        else
        {
            // Here I try to catch the case that the new exponent is too small
            ap_output_exponent = 0;
        }

        // Remove the old exponent
        *output_pointer_mod = (*output_pointer_mod << 9) >> 9;

        // Install the new exponent
        *output_pointer_mod += ap_output_exponent << 23;

        // Add the sign back
        *output_pointer_mod += sign_temp;
    }
    return output;
};
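To make the bit-level recipe above easier to test on a host machine, here is a pure-Python model of the same approximate float32 multiply. It is a sketch for verification only: Python integers do not wrap, so every 32-bit operation is masked explicitly, and like the device function it assumes normalized inputs (subnormals, NaN and Inf are not handled).

import struct

def float_to_bits(x: float) -> int:
    return struct.unpack("<I", struct.pack("<f", x))[0]

def bits_to_float(u: int) -> float:
    return struct.unpack("<f", struct.pack("<I", u & 0xFFFFFFFF))[0]

def approx_multiply(weight: float, inp: float, number_of_frac_bits: int) -> float:
    w, h = float_to_bits(weight), float_to_bits(inp)
    sign = (w & 0x80000000) ^ (h & 0x80000000)          # new sign
    exp_h = ((h << 1) & 0xFFFFFFFF) >> 24               # extract exponents
    exp_w = ((w << 1) & 0xFFFFFFFF) >> 24
    shift = 32 - number_of_frac_bits
    # Mantissas with the implicit leading one restored, then truncated
    mant_h = (((h << 8) & 0xFFFFFFFF) | 0x80000000) >> shift
    mant_w = (((w << 8) & 0xFFFFFFFF) | 0x80000000) >> shift
    if inp == 0.0:
        mant_h = 0
    if weight == 0.0:
        mant_w = 0
    res = mant_h * mant_w                               # exact mantissa product
    if res == 0:
        return 0.0
    out = float_to_bits(float(res))                     # int -> float32 bits
    exp_out = ((out << 1) & 0xFFFFFFFF) >> 24
    exp_out = (exp_out - 2 * number_of_frac_bits) & 0xFFFFFFFF
    temp = (exp_h + exp_w + exp_out) & 0xFFFFFFFF
    exp_out = temp - 252 if temp > 252 else 0           # re-bias, clamp at zero
    out = ((out << 9) & 0xFFFFFFFF) >> 9                # strip old sign/exponent
    out = (out + ((exp_out << 23) & 0xFFFFFFFF)) & 0xFFFFFFFF
    out = (out + sign) & 0xFFFFFFFF
    return bits_to_float(out)

# Without truncation the model reproduces exact products that fit the
# mantissa budget, e.g. approx_multiply(1.5, 2.0, 16) == 3.0.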
28
network/CPP_Cuda/gpu_error_term.cu
Normal file
@@ -0,0 +1,28 @@
__device__ uint32_t gpu_error_term(uint32_t ap_weight_mantissa,
                                   uint32_t ap_input_mantissa,
                                   uint32_t ap_mask,
                                   uint32_t number_of_trunc_bits)
{
    uint32_t error_value = 0;

    uint32_t temp_shift_a = ap_weight_mantissa;
    uint32_t temp_shift_b = ap_input_mantissa & ap_mask;

    uint32_t counter_trunc;
    uint32_t temp;

    // Go through the bits
    for (counter_trunc = 0; counter_trunc < number_of_trunc_bits;
         counter_trunc++)
    {
        temp = temp_shift_a & 1;
        if (temp == 1)
        {
            error_value += temp_shift_b & ap_mask;
        }
        temp_shift_a >>= 1;
        temp_shift_b <<= 1;
    }

    return error_value;
}
23
network/CPP_Cuda/pybind11_auto_pyi.py
Normal file
@@ -0,0 +1,23 @@
# %%
# pip install pybind11-stubgen
from pybind11_stubgen import ModuleStubsGenerator  # type: ignore
import glob


def process(module_name: str) -> None:
    module = ModuleStubsGenerator(module_name)
    module.parse()
    module.write_setup_py = False

    with open(module_name + ".pyi", "w") as fp:
        fp.write("#\n# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!\n#\n\n")
        fp.write("\n".join(module.to_lines()))


Files = glob.glob("*.so")

for fid in Files:
    Idx: int = fid.find(".")
    module_name: str = fid[:Idx]
    print("Processing: " + module_name)
    process(module_name)
339
network/CPP_NoCuda/HDynamicCNNManyIP.cpp
Normal file
@@ -0,0 +1,339 @@
#include "HDynamicCNNManyIP.h"

#include <omp.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <cassert>
#include <iostream>


HDynamicCNNManyIP::HDynamicCNNManyIP()
{

};

HDynamicCNNManyIP::~HDynamicCNNManyIP()
{

};

bool HDynamicCNNManyIP::update_entrypoint(
    int64_t h_pointer_addr,
    int64_t h_dim_0,
    int64_t h_dim_1,
    int64_t h_dim_2,
    int64_t h_dim_3,
    int64_t epsilon_xy_pointer_addr,
    int64_t epsilon_xy_dim_0,
    int64_t epsilon_xy_dim_1,
    int64_t epsilon_xy_dim_2,
    int64_t epsilon_t_pointer_addr,
    int64_t epsilon_t_dim_0,
    int64_t weights_pointer_addr,
    int64_t weights_dim_0,
    int64_t weights_dim_1,
    int64_t input_pointer_addr,
    int64_t input_dim_0,
    int64_t input_dim_1,
    int64_t input_dim_2,
    int64_t input_dim_3,
    int64_t init_vector_pointer_addr,
    int64_t init_vector_dim_0,
    int64_t number_of_processes,
    float forgetting_offset,
    int64_t gpu_tuning_factor)
{

    size_t number_of_pattern = input_dim_0;

    size_t h_dim = init_vector_dim_0;
    float* h_init_ptr = (float*)init_vector_pointer_addr;
    assert((h_init_ptr != nullptr));
    assert((h_dim > 0));

    float* h_pointer = (float*)h_pointer_addr;
    assert((h_pointer != nullptr));
    assert((h_dim_0 > 0));
    assert((h_dim_1 > 0));
    assert((h_dim_2 > 0));
    assert((h_dim_3 > 0));

    size_t h_dim_c0 = h_dim_1 * h_dim_2 * h_dim_3;
    size_t h_dim_c1 = h_dim_2 * h_dim_3;
    size_t h_dim_c2 = h_dim_3;

    float* epsilon_xy_pointer = (float*)epsilon_xy_pointer_addr;
    assert((epsilon_xy_pointer != nullptr));
    assert((epsilon_xy_dim_0 > 0));
    assert((epsilon_xy_dim_1 > 0));

    size_t epsilon_xy_dim_c0 = epsilon_xy_dim_2 * epsilon_xy_dim_1;
    size_t epsilon_xy_dim_c1 = epsilon_xy_dim_2;

    float* epsilon_t_pointer = (float*)epsilon_t_pointer_addr;
    assert((epsilon_t_pointer != nullptr));
    assert((epsilon_t_dim_0 > 0));

    float* weights_pointer = (float*)weights_pointer_addr;
    assert((weights_pointer != nullptr));
    assert((weights_dim_0 > 0));
    assert((weights_dim_1 > 0));

    size_t weights_dim_c0 = weights_dim_1;

    int64_t* input_pointer = (int64_t*)input_pointer_addr;
    assert((input_pointer != nullptr));
    assert((input_dim_0 > 0));
    assert((input_dim_1 > 0));
    assert((input_dim_2 > 0));
    assert((input_dim_3 > 0));

    size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
    size_t input_dim_c1 = input_dim_2 * input_dim_3;
    size_t input_dim_c2 = input_dim_3;

    assert((h_dim == weights_dim_1));
    size_t number_of_spikes = input_dim_1;
    size_t dim_x = input_dim_2;
    size_t dim_y = input_dim_3;

    float forgetting_offset_local = forgetting_offset / static_cast<float>(h_dim);


    // --------------------
    if (number_of_processes > 0)
    {
        omp_set_num_threads(number_of_processes);

        size_t pattern_id;
#pragma omp parallel for
        for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
        {
            update(
                h_init_ptr,
                h_pointer,
                h_dim_c0,
                h_dim_c1,
                h_dim_c2,
                h_dim,
                epsilon_xy_pointer,
                epsilon_xy_dim_c0,
                epsilon_xy_dim_c1,
                epsilon_t_pointer,
                weights_pointer,
                weights_dim_c0,
                input_pointer,
                input_dim_c0,
                input_dim_c1,
                input_dim_c2,
                number_of_spikes,
                dim_x,
                dim_y,
                forgetting_offset,
                forgetting_offset_local,
                pattern_id);
        }
    }
    else
    {
        std::cout << "Error: number_of_processes <= 0" << std::endl;
        return false;
    }
    return true;
};


bool HDynamicCNNManyIP::update(
    float* h_init_ptr,
    float* h_pointer,
    size_t h_dim_c0,
    size_t h_dim_c1,
    size_t h_dim_c2,
    size_t h_dim,
    float* epsilon_xy_pointer,
    size_t epsilon_xy_dim_c0,
    size_t epsilon_xy_dim_c1,
    float* epsilon_t_pointer,
    float* weights_pointer,
    size_t weights_dim_c0,
    int64_t* input_pointer,
    size_t input_dim_c0,
    size_t input_dim_c1,
    size_t input_dim_c2,
    size_t number_of_spikes,
    size_t dim_x,
    size_t dim_y,
    float forgetting_offset,
    float forgetting_offset_local,
    size_t pattern_id)
{

    float* h_ptr;
    float* epsilon_xy_ptr;
    int64_t* input_ptr;

    size_t counter_x;
    size_t counter_y;

    for (counter_x = 0; counter_x < dim_x; counter_x++)
    {
        for (counter_y = 0; counter_y < dim_y; counter_y++)
        {
            epsilon_xy_ptr = epsilon_xy_pointer +
                             counter_x * epsilon_xy_dim_c1 + counter_y;

            h_ptr = h_pointer +
                    pattern_id * h_dim_c0 + counter_x * h_dim_c2 + counter_y;

            input_ptr = input_pointer +
                        pattern_id * input_dim_c0 + counter_x * input_dim_c2 + counter_y;

            update_one_ip(
                h_init_ptr,
                h_ptr,
                h_dim_c1,
                h_dim,
                weights_pointer,
                weights_dim_c0,
                input_ptr,
                input_dim_c1,
                epsilon_xy_ptr,
                epsilon_xy_dim_c0,
                epsilon_t_pointer,
                number_of_spikes,
                forgetting_offset,
                forgetting_offset_local);

        }
    }

    return true;
};

void HDynamicCNNManyIP::update_one_ip(
    float* h_init_ptr,
    float* h_pointer,
    size_t h_dim_c1,
    size_t h_dim,
    float* weights_pointer,
    size_t weights_dim_c0,
    int64_t* input_pointer,
    size_t input_dim_c1,
    float* epsilon_xy_pointer,
    size_t epsilon_xy_dim_c0,
    float* epsilon_t_pointer,
    size_t number_of_spikes,
    float forgetting_offset,
    float forgetting_offset_local)
{

    float* h_temp = new float[h_dim];
    float* h_subsegment = new float[h_dim];

    memcpy(h_subsegment, h_init_ptr, sizeof(float) * h_dim);

    size_t counter_spike;
    size_t counter;

    float h_temp_sum;
    float temp_value;

    float epsilon_subsegment;
    float epsilon_scale = 1.0;

    int64_t* spike;
    float* w_ptr;

    for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++)
    {
        if (epsilon_scale > 1E10)
        {
            temp_value = 1.0 / epsilon_scale;

#pragma omp simd
            for (counter = 0; counter < h_dim; counter++)
            {
                h_subsegment[counter] *= temp_value;
            }

            epsilon_scale = 1.0;
        }

        spike = input_pointer + counter_spike * input_dim_c1;

        if (*spike >= 0)
        {
            epsilon_subsegment =
                epsilon_xy_pointer[*spike * epsilon_xy_dim_c0] * epsilon_t_pointer[counter_spike];

            w_ptr = weights_pointer + *spike * weights_dim_c0;

            memcpy(h_temp, h_subsegment, sizeof(float) * h_dim);

#pragma omp simd
            for (counter = 0; counter < h_dim; counter++)
            {
                h_temp[counter] *= w_ptr[counter];
            }

            h_temp_sum = 0.0;
#pragma omp simd reduction(+ : h_temp_sum)
            for (counter = 0; counter < h_dim; counter++)
            {
                h_temp_sum += h_temp[counter];
            }

            if (h_temp_sum > 1E-10)
            {
                temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum;

#pragma omp simd
                for (counter = 0; counter < h_dim; counter++)
                {
                    h_temp[counter] *= temp_value;
                }

#pragma omp simd
                for (counter = 0; counter < h_dim; counter++)
                {
                    h_subsegment[counter] += h_temp[counter];
                }

                if (forgetting_offset_local > 0.0)
                {
                    temp_value =
                        epsilon_scale * epsilon_subsegment * forgetting_offset_local;

#pragma omp simd
                    for (counter = 0; counter < h_dim; counter++)
                    {
                        h_subsegment[counter] += temp_value;
                    }

                    epsilon_scale *=
                        1.0 + epsilon_subsegment * (1.0 + forgetting_offset);
                }
                else
                {
                    epsilon_scale *= 1.0 + epsilon_subsegment;
                }
            }
        }
    }

    temp_value = 1.0 / epsilon_scale;
#pragma omp simd
    for (counter = 0; counter < h_dim; counter++)
    {
        h_pointer[counter * h_dim_c1] =
            h_subsegment[counter] * temp_value;
    }

    delete[] h_temp;
    delete[] h_subsegment;

    return;
};
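Stated compactly, update_one_ip() runs the following per-position dynamics over the incoming spike sequence. This numpy restatement is a sketch: the C++ code defers the division by accumulating epsilon_scale (rescaling whenever it exceeds 1e10) purely as a floating-point overflow guard, which is mathematically equivalent to dividing at every step as done here; the only behavioral difference is that the 1e-10 threshold is applied to the unnormalized sum in the original. The argument shapes are inferred from the stride arithmetic above.

import numpy as np

def update_one_ip(h_init, weights, spikes, eps_xy, eps_t, forgetting_offset):
    h_dim = h_init.shape[0]
    forgetting_offset_local = forgetting_offset / h_dim
    h = h_init.copy()
    for t, s in enumerate(spikes):
        if s < 0:
            continue                      # negative spike ids are skipped
        eps = eps_xy[s] * eps_t[t]
        h_w = h * weights[s]              # weight row selected by the spike id
        total = h_w.sum()
        if total <= 1e-10:
            continue
        h = h + eps * h_w / total         # multiplicative evidence update
        if forgetting_offset_local > 0.0:
            h = (h + eps * forgetting_offset_local) / (1.0 + eps * (1.0 + forgetting_offset))
        else:
            h = h / (1.0 + eps)           # renormalize
    return h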
85
network/CPP_NoCuda/HDynamicCNNManyIP.h
Normal file
@@ -0,0 +1,85 @@
#ifndef SRC_HDYNAMICCNNMANYIP_H_
#define SRC_HDYNAMICCNNMANYIP_H_

#include <unistd.h>

#include <cctype>
#include <iostream>

class HDynamicCNNManyIP
{
public:
    HDynamicCNNManyIP();
    ~HDynamicCNNManyIP();

    bool update_entrypoint(
        int64_t h_pointer_addr,
        int64_t h_dim_0,
        int64_t h_dim_1,
        int64_t h_dim_2,
        int64_t h_dim_3,
        int64_t epsilon_xy_pointer_addr,
        int64_t epsilon_xy_dim_0,
        int64_t epsilon_xy_dim_1,
        int64_t epsilon_xy_dim_2,
        int64_t epsilon_t_pointer_addr,
        int64_t epsilon_t_dim_0,
        int64_t weights_pointer_addr,
        int64_t weights_dim_0,
        int64_t weights_dim_1,
        int64_t input_pointer_addr,
        int64_t input_dim_0,
        int64_t input_dim_1,
        int64_t input_dim_2,
        int64_t input_dim_3,
        int64_t init_vector_pointer_addr,
        int64_t init_vector_dim_0,
        int64_t number_of_processes,
        float forgetting_offset,
        int64_t gpu_tuning_factor);

private:

    bool update(
        float* h_init_ptr,
        float* h_pointer,
        size_t h_dim_c0,
        size_t h_dim_c1,
        size_t h_dim_c2,
        size_t h_dim,
        float* epsilon_xy_pointer,
        size_t epsilon_xy_dim_c0,
        size_t epsilon_xy_dim_c1,
        float* epsilon_t_pointer,
        float* weights_pointer,
        size_t weights_dim_c0,
        int64_t* input_pointer,
        size_t input_dim_c0,
        size_t input_dim_c1,
        size_t input_dim_c2,
        size_t number_of_spikes,
        size_t dim_x,
        size_t dim_y,
        float forgetting_offset,
        float forgetting_offset_local,
        size_t pattern_id);

    void update_one_ip(
        float* h_init_ptr,
        float* h_pointer,
        size_t h_dim_c1,
        size_t h_dim,
        float* weights_pointer,
        size_t weights_dim_c0,
        int64_t* input_pointer,
        size_t input_dim_c1,
        float* epsilon_xy_pointer,
        size_t epsilon_xy_dim_c0,
        float* epsilon_t_pointer,
        size_t number_of_spikes,
        float forgetting_offset,
        float forgetting_offset_local);

};

#endif /* SRC_HDYNAMICCNNMANYIP_H_ */
64
network/CPP_NoCuda/Makefile
Normal file
@@ -0,0 +1,64 @@
# Change to your python bin directory (tested with Python 3.10.4)
PYBIN=~/P3.10GPU/bin/
CC=/usr/lib64/ccache/clang++

PYBIND11INCLUDE=`$(PYBIN)python3 -m pybind11 --includes`
PARAMETERS_O= -O3 -std=c++14 $(PYBIND11INCLUDE) -fPIC -Wall -fopenmp=libomp

PARAMETERS_Linker=-shared -lm -lomp -lstdc++ -Wall

PYPOSTFIX=`$(PYBIN)python3-config --extension-suffix`


all: PyHDynamicCNNManyIP \
	PySpikeGeneration2DManyIP \
	PyMultiApp

#######################

HDynamicCNNManyIP.o: HDynamicCNNManyIP.h HDynamicCNNManyIP.cpp
	$(CC) $(PARAMETERS_O) -c HDynamicCNNManyIP.cpp -o HDynamicCNNManyIP.o

PyHDynamicCNNManyIP.o: HDynamicCNNManyIP.h PyHDynamicCNNManyIP.cpp
	$(CC) $(PARAMETERS_O) -c PyHDynamicCNNManyIP.cpp -o PyHDynamicCNNManyIP.o

PyHDynamicCNNManyIP: HDynamicCNNManyIP.o PyHDynamicCNNManyIP.o
	$(CC) $(PARAMETERS_Linker) -o PyHDynamicCNNManyIP HDynamicCNNManyIP.o PyHDynamicCNNManyIP.o
	cp PyHDynamicCNNManyIP PyHDynamicCNNManyIP$(PYPOSTFIX)
	$(PYBIN)python3 pybind11_auto_pyi.py

#######################

SpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h SpikeGeneration2DManyIP.cpp
	$(CC) $(PARAMETERS_O) -c SpikeGeneration2DManyIP.cpp -o SpikeGeneration2DManyIP.o

PySpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h PySpikeGeneration2DManyIP.cpp
	$(CC) $(PARAMETERS_O) -c PySpikeGeneration2DManyIP.cpp -o PySpikeGeneration2DManyIP.o

PySpikeGeneration2DManyIP: SpikeGeneration2DManyIP.o PySpikeGeneration2DManyIP.o
	$(CC) $(PARAMETERS_Linker) -o PySpikeGeneration2DManyIP SpikeGeneration2DManyIP.o PySpikeGeneration2DManyIP.o
	cp PySpikeGeneration2DManyIP PySpikeGeneration2DManyIP$(PYPOSTFIX)
	$(PYBIN)python3 pybind11_auto_pyi.py

#######################

MultiApp.o: MultiApp.h MultiApp.cpp approximation_multiplication_function.cpp \
	error_term.cpp
	$(CC) $(PARAMETERS_O) -c MultiApp.cpp -o MultiApp.o

PyMultiApp.o: MultiApp.h PyMultiApp.cpp
	$(CC) $(PARAMETERS_O) -c PyMultiApp.cpp -o PyMultiApp.o

PyMultiApp: MultiApp.o PyMultiApp.o
	$(CC) $(PARAMETERS_Linker) -o PyMultiApp MultiApp.o PyMultiApp.o
	cp PyMultiApp PyMultiApp$(PYPOSTFIX)
	$(PYBIN)python3 pybind11_auto_pyi.py

#######################
clean:
	rm -f PyHDynamicCNNManyIP
	rm -f PySpikeGeneration2DManyIP
	rm -f PyMultiApp
	rm -f *.o
	rm -f *.so
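After a successful make, each module exists twice: once under its plain target name and once copied to the name with the Python extension suffix, which is what the interpreter imports. A minimal import sketch, assuming the modules were built in place and the build directory is on the module search path:

import sys

sys.path.append("network/CPP_NoCuda")  # assumption: built in place

from PyHDynamicCNNManyIP import HDynamicCNNManyIP
from PySpikeGeneration2DManyIP import SpikeGeneration2DManyIP
from PyMultiApp import MultiApp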
187
network/CPP_NoCuda/MultiApp.cpp
Normal file
@@ -0,0 +1,187 @@
#include "MultiApp.h"

#include <omp.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <vector>

#include "approximation_multiplication_function.cpp"

MultiApp::MultiApp()
{

};

MultiApp::~MultiApp()
{

};

bool MultiApp::update(float* np_input_pointer,
                      float* np_weight_pointer,
                      float* np_output_pointer, int64_t pattern_dim,
                      int64_t feature_dim, int64_t x_dim, int64_t y_dim,
                      int64_t input_channel_dim, int64_t id_pattern,
                      bool approximation_enable, int64_t number_of_trunc_bits,
                      int64_t number_of_frac_bits)
{
    assert((id_pattern >= 0));
    assert((id_pattern < pattern_dim));

    float* np_input_pointer_pattern;
    float* np_output_pointer_pattern;

    float* input_ptr;
    float* output_ptr;
    float* w_ptr;

    uint64_t pattern_size = input_channel_dim;

    std::vector<float> ap_h_vector;
    ap_h_vector.resize(pattern_size);
    float* ap_h_ptr = ap_h_vector.data();

    std::vector<uint32_t> ap_x_vector;
    ap_x_vector.resize(pattern_size);
    uint32_t* ap_x_ptr = ap_x_vector.data();

    std::vector<uint32_t> ap_y_vector;
    ap_y_vector.resize(pattern_size);
    uint32_t* ap_y_ptr = ap_y_vector.data();

    std::vector<uint32_t> ap_x_exponent_vector;
    ap_x_exponent_vector.resize(pattern_size);
    uint32_t* ap_x_exponent_ptr = ap_x_exponent_vector.data();

    std::vector<uint32_t> ap_y_exponent_vector;
    ap_y_exponent_vector.resize(pattern_size);
    uint32_t* ap_y_exponent_ptr = ap_y_exponent_vector.data();

    std::vector<uint32_t> ap_h_exponent_vector;
    ap_h_exponent_vector.resize(pattern_size);
    uint32_t* ap_h_exponent_ptr = ap_h_exponent_vector.data();

    std::vector<uint64_t> ap_res_vector;
    ap_res_vector.resize(pattern_size);
    uint64_t* ap_res_ptr = ap_res_vector.data();

    uint32_t ap_mask = static_cast<uint64_t>(pow(2, number_of_trunc_bits)) - 1;

    std::vector<uint32_t> sign_temp_vector;
    sign_temp_vector.resize(pattern_size);
    uint32_t* sign_temp_ptr = sign_temp_vector.data();

    uint64_t input_pattern_size = input_channel_dim * x_dim * y_dim;
    uint64_t output_pattern_size = feature_dim * x_dim * y_dim;

    np_input_pointer_pattern = np_input_pointer + id_pattern * input_pattern_size;
    np_output_pointer_pattern =
        np_output_pointer + id_pattern * output_pattern_size;

    uint64_t counter;

    uint64_t counter_x;
    uint64_t counter_y;
    uint64_t counter_feature;
    uint64_t pos_xy;
    uint64_t pos_xy_if;

    float temp_sum;

    uint64_t pattern_c_2 = x_dim * y_dim;

    for (counter_x = 0; counter_x < x_dim; counter_x++)
    {
        for (counter_y = 0; counter_y < y_dim; counter_y++)
        {
            pos_xy = counter_y + counter_x * y_dim;
            for (counter_feature = 0; counter_feature < feature_dim;
                 counter_feature++)
            {
                pos_xy_if = counter_feature * pattern_c_2 + pos_xy;

                input_ptr = np_input_pointer_pattern + pos_xy;
                output_ptr = np_output_pointer_pattern + pos_xy_if;
                w_ptr = np_weight_pointer + counter_feature * input_channel_dim;

#pragma omp simd
                for (counter = 0; counter < pattern_size; counter++)
                {
                    ap_h_ptr[counter] = input_ptr[counter * pattern_c_2];
                }

                approximation_multiplication_function(
                    ap_h_ptr, w_ptr, pattern_size, number_of_trunc_bits,
                    number_of_frac_bits, ap_x_ptr, ap_y_ptr, ap_x_exponent_ptr,
                    ap_y_exponent_ptr, ap_h_exponent_ptr, ap_mask, ap_res_ptr,
                    sign_temp_ptr, approximation_enable);

                temp_sum = 0.0;
#pragma omp simd reduction(+ : temp_sum)
                for (counter = 0; counter < pattern_size; counter++)
                {
                    temp_sum += ap_h_ptr[counter];
                }

                output_ptr[0] = temp_sum;
            }
        }
    }

    return true;
};

bool MultiApp::update_with_init_vector_multi_pattern(
    int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr,
    int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim,
    int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
    int64_t number_of_processes, bool approximation_enable,
    int64_t number_of_trunc_bits, int64_t number_of_frac)
{
    int64_t number_of_pattern = pattern_dim;
    int64_t pattern_id;

    float* np_input_pointer = (float*)np_input_pointer_addr;
    float* np_weight_pointer = (float*)np_weight_pointer_addr;
    float* np_output_pointer = (float*)np_output_pointer_addr;

    assert((np_input_pointer != nullptr));
    assert((np_output_pointer != nullptr));
    assert((np_weight_pointer != nullptr));

    assert((pattern_dim > 0));
    assert((feature_dim > 0));
    assert((x_dim > 0));
    assert((y_dim > 0));
    assert((input_channel_dim > 0));

    if (number_of_processes > 0)
    {
        omp_set_num_threads(number_of_processes);
        // For debugging: Only one thread
        // omp_set_num_threads(1);

#pragma omp parallel for
        for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
        {
            update(np_input_pointer, np_weight_pointer,
                   np_output_pointer, pattern_dim, feature_dim, x_dim, y_dim,
                   input_channel_dim, pattern_id, approximation_enable,
                   number_of_trunc_bits, number_of_frac);
        }
    }
    else
    {
        std::cout << "Error: number_of_processes <= 0" << std::endl;
        return false;
    }
    return true;
};
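Stripped of the approximate-multiplication machinery, MultiApp::update computes a per-pixel dot product between the input channels and one weight row, i.e. a 1x1 matrix product over the already-unfolded input. A numpy sketch of the exact (approximation disabled) reference result, under that reading of the loops; this is a checking aid, not part of the library:

import numpy as np

def multiapp_reference(inp: np.ndarray, weights: np.ndarray) -> np.ndarray:
    # inp:     (pattern, input_channel, x, y)
    # weights: (feature, input_channel)
    # returns: (pattern, feature, x, y) -- per-pixel dot product
    return np.einsum("pcxy,fc->pfxy", inp, weights)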
32
network/CPP_NoCuda/MultiApp.h
Normal file
@@ -0,0 +1,32 @@
#ifndef SRC_MultiApp_H_
#define SRC_MultiApp_H_

#include <unistd.h>

#include <cctype>
#include <iostream>

class MultiApp
{
public:
    MultiApp();
    ~MultiApp();

    bool update(float* np_input_pointer, float* np_weight_pointer,
                float* np_output_pointer, int64_t pattern_dim,
                int64_t feature_dim, int64_t x_dim, int64_t y_dim,
                int64_t input_channel_dim, int64_t id_pattern,
                bool approximation_enable, int64_t number_of_trunc_bits,
                int64_t number_of_frac);

    bool update_with_init_vector_multi_pattern(
        int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr,
        int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim,
        int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
        int64_t number_of_processes, bool approximation_enable,
        int64_t number_of_trunc_bits, int64_t number_of_frac);

private:
};

#endif /* SRC_MultiApp_H_ */
14
network/CPP_NoCuda/PyHDynamicCNNManyIP.cpp
Normal file
@@ -0,0 +1,14 @@
#include <pybind11/pybind11.h>

#include "HDynamicCNNManyIP.h"

namespace py = pybind11;

PYBIND11_MODULE(PyHDynamicCNNManyIP, m)
{
    m.doc() = "HDynamicCNNManyIP Module";
    py::class_<HDynamicCNNManyIP>(m, "HDynamicCNNManyIP")
        .def(py::init<>())
        .def("update",
             &HDynamicCNNManyIP::update_entrypoint);
}
18
network/CPP_NoCuda/PyHDynamicCNNManyIP.pyi
Normal file
@@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#

"""HDynamicCNNManyIP Module"""
from __future__ import annotations
import PyHDynamicCNNManyIP
import typing

__all__ = [
    "HDynamicCNNManyIP"
]


class HDynamicCNNManyIP():
    def __init__(self) -> None: ...
    def update(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int, arg16: int, arg17: int, arg18: int, arg19: int, arg20: int, arg21: int, arg22: float, arg23: int) -> bool: ...
    pass
14
network/CPP_NoCuda/PyMultiApp.cpp
Normal file
@@ -0,0 +1,14 @@

#include <pybind11/pybind11.h>

#include "MultiApp.h"

namespace py = pybind11;

PYBIND11_MODULE(PyMultiApp, m) {
    m.doc() = "MultiApp Module";
    py::class_<MultiApp>(m, "MultiApp")
        .def(py::init<>())
        .def("update_with_init_vector_multi_pattern",
             &MultiApp::update_with_init_vector_multi_pattern);
}
18
network/CPP_NoCuda/PyMultiApp.pyi
Normal file
@@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#

"""MultiApp Module"""
from __future__ import annotations
import PyMultiApp
import typing

__all__ = [
    "MultiApp"
]


class MultiApp():
    def __init__(self) -> None: ...
    def update_with_init_vector_multi_pattern(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: bool, arg10: int, arg11: int) -> bool: ...
    pass
15
network/CPP_NoCuda/PySpikeGeneration2DManyIP.cpp
Normal file
@@ -0,0 +1,15 @@

#include <pybind11/pybind11.h>

#include "SpikeGeneration2DManyIP.h"

namespace py = pybind11;

PYBIND11_MODULE(PySpikeGeneration2DManyIP, m)
{
    m.doc() = "SpikeGeneration2DManyIP Module";
    py::class_<SpikeGeneration2DManyIP>(m, "SpikeGeneration2DManyIP")
        .def(py::init<>())
        .def("spike_generation",
             &SpikeGeneration2DManyIP::spike_generation_entrypoint);
}
18
network/CPP_NoCuda/PySpikeGeneration2DManyIP.pyi
Normal file
@@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#

"""SpikeGeneration2DManyIP Module"""
from __future__ import annotations
import PySpikeGeneration2DManyIP
import typing

__all__ = [
    "SpikeGeneration2DManyIP"
]


class SpikeGeneration2DManyIP():
    def __init__(self) -> None: ...
    def spike_generation(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int) -> bool: ...
    pass
205
network/CPP_NoCuda/SpikeGeneration2DManyIP.cpp
Normal file
@@ -0,0 +1,205 @@
#include "SpikeGeneration2DManyIP.h"

#include <omp.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <cassert>
#include <iostream>


SpikeGeneration2DManyIP::SpikeGeneration2DManyIP()
{

};

SpikeGeneration2DManyIP::~SpikeGeneration2DManyIP()
{

};

bool SpikeGeneration2DManyIP::spike_generation_entrypoint(
    int64_t input_pointer_addr, int64_t input_dim_0,
    int64_t input_dim_1, int64_t input_dim_2, int64_t input_dim_3,
    int64_t random_values_pointer_addr, int64_t random_values_dim_0,
    int64_t random_values_dim_1, int64_t random_values_dim_2,
    int64_t random_values_dim_3, int64_t output_pointer_addr,
    int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2,
    int64_t output_dim_3, int64_t number_of_cpu_processes)
{
    float* input_pointer = (float*)input_pointer_addr;
    float* random_values_pointer = (float*)random_values_pointer_addr;
    int64_t* output_pointer = (int64_t*)output_pointer_addr;

    // Input
    assert((input_pointer != nullptr));
    assert((input_dim_0 > 0));
    assert((input_dim_1 > 0));
    assert((input_dim_2 > 0));
    assert((input_dim_3 > 0));

    // Random
    assert((random_values_pointer != nullptr));
    assert((random_values_dim_0 > 0));
    assert((random_values_dim_1 > 0));
    assert((random_values_dim_2 > 0));
    assert((random_values_dim_3 > 0));

    // Output
    assert((output_pointer != nullptr));
    assert((output_dim_0 > 0));
    assert((output_dim_1 > 0));
    assert((output_dim_2 > 0));
    assert((output_dim_3 > 0));

    // Input
    size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
    size_t input_dim_c1 = input_dim_2 * input_dim_3;
    size_t input_dim_c2 = input_dim_3;

    // Random
    size_t random_values_dim_c0 =
        random_values_dim_1 * random_values_dim_2 * random_values_dim_3;
    size_t random_values_dim_c1 =
        random_values_dim_2 * random_values_dim_3;
    size_t random_values_dim_c2 = random_values_dim_3;

    // Output
    size_t output_dim_c0 =
        output_dim_1 * output_dim_2 * output_dim_3;
    size_t output_dim_c1 = output_dim_2 * output_dim_3;
    size_t output_dim_c2 = output_dim_3;

    size_t number_of_pattern = input_dim_0;
    size_t h_dim = input_dim_1;
    size_t spike_dim = output_dim_1;
    size_t x_dim = output_dim_2;
    size_t y_dim = output_dim_3;

    if (number_of_cpu_processes > 0)
    {
        omp_set_num_threads(number_of_cpu_processes);
        // DEBUG:
        // omp_set_num_threads(1);

        size_t pattern_id;

#pragma omp parallel for
        for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
        {
            spike_generation(
                input_pointer,
                input_dim_c0,
                input_dim_c1,
                input_dim_c2,
                random_values_pointer,
                random_values_dim_c0,
                random_values_dim_c1,
                random_values_dim_c2,
                output_pointer,
                output_dim_c0,
                output_dim_c1,
                output_dim_c2,
                x_dim,
                y_dim,
                spike_dim,
                h_dim,
                pattern_id);
        }
    }
    else
    {
        std::cout << "Error: number_of_cpu_processes <= 0" << std::endl;
        return false;
    }

    return true;
};

bool SpikeGeneration2DManyIP::spike_generation(
    float* input_pointer,
    size_t input_dim_c0,
    size_t input_dim_c1,
    size_t input_dim_c2,
    float* random_values_pointer,
    size_t random_values_dim_c0,
    size_t random_values_dim_c1,
    size_t random_values_dim_c2,
    int64_t* output_pointer,
    size_t output_dim_c0,
    size_t output_dim_c1,
    size_t output_dim_c2,
    size_t x_dim,
    size_t y_dim,
    size_t spike_dim,
    size_t h_dim,
    size_t pattern_id)
{
    size_t counter;
    size_t counter_x = 0;
    size_t counter_y = 0;

    float* p_ptr = nullptr;
    int64_t* out_ptr = nullptr;
    float* rand_ptr = nullptr;

    for (counter_x = 0; counter_x < x_dim; counter_x++)
    {
        for (counter_y = 0; counter_y < y_dim; counter_y++)
        {
            p_ptr = input_pointer + pattern_id * input_dim_c0 +
                    counter_x * input_dim_c2 + counter_y;
            // + counter * input_dim_c1

            out_ptr = output_pointer + pattern_id * output_dim_c0 +
                      counter_x * output_dim_c2 + counter_y;
            // + counter * output_dim_c1

            rand_ptr = random_values_pointer +
                       pattern_id * random_values_dim_c0 +
                       counter_x * random_values_dim_c2 + counter_y;
            // + counter * random_values_dim_c1

            for (counter = 0; counter < spike_dim; counter++)
            {
                out_ptr[counter * output_dim_c1] = lower_bound(p_ptr,
                                                               h_dim,
                                                               input_dim_c1,
                                                               rand_ptr[counter * random_values_dim_c1]);
            }
        }
    }

    return true;
};

// algorithmic idea stolen from libc++
size_t SpikeGeneration2DManyIP::lower_bound(float* data_ptr,
                                            size_t data_length,
                                            size_t data_ptr_stride,
                                            float compare_to_value)
{
    size_t start_of_range = 0;
    size_t length_of_range = data_length;

    while (length_of_range != 0)
    {
        size_t half_length = length_of_range >> 1;
        size_t actual_position = start_of_range + half_length;

        if (data_ptr[actual_position * data_ptr_stride] < compare_to_value)
        {
            start_of_range = ++actual_position;
            length_of_range -= half_length + 1;
        }
        else
            length_of_range = half_length;
    }
    return start_of_range;
};
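lower_bound performs a strided binary search over the (cumulated) h-values of one pixel, so each uniform random number picks the index of the neuron whose cumulative-probability bucket it falls into. A numpy sketch of the same sampling step for a single pixel, assuming the cumulative sum has already been formed along the neuron axis (a checking aid, not the library's API):

import numpy as np

rng = np.random.default_rng(0)

h = rng.random(32).astype(np.float32)
h /= h.sum()             # probability vector of one pixel
h_cumsum = np.cumsum(h)  # what lower_bound searches through

random_values = rng.random(100).astype(np.float32)
# side="left" is the same "first element not less than the value"
# rule that lower_bound implements.
spikes = np.searchsorted(h_cumsum, random_values, side="left")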
49
network/CPP_NoCuda/SpikeGeneration2DManyIP.h
Normal file
@@ -0,0 +1,49 @@
#ifndef SRC_SPIKEGENERATION2DMANYIP_H_
#define SRC_SPIKEGENERATION2DMANYIP_H_

#include <unistd.h>

#include <cctype>
#include <iostream>

class SpikeGeneration2DManyIP
{
public:
    SpikeGeneration2DManyIP();
    ~SpikeGeneration2DManyIP();

    bool spike_generation_entrypoint(
        int64_t input_pointer_addr, int64_t input_dim_0,
        int64_t input_dim_1, int64_t input_dim_2, int64_t input_dim_3,
        int64_t random_values_pointer_addr, int64_t random_values_dim_0,
        int64_t random_values_dim_1, int64_t random_values_dim_2,
        int64_t random_values_dim_3, int64_t output_pointer_addr,
        int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2,
        int64_t output_dim_3, int64_t number_of_cpu_processes);

    bool spike_generation(
        float* input_pointer,
        size_t input_dim_c0,
        size_t input_dim_c1,
        size_t input_dim_c2,
        float* random_values_pointer,
        size_t random_values_dim_c0,
        size_t random_values_dim_c1,
        size_t random_values_dim_c2,
        int64_t* output_pointer,
        size_t output_dim_c0,
        size_t output_dim_c1,
        size_t output_dim_c2,
        size_t x_dim,
        size_t y_dim,
        size_t spike_dim,
        size_t h_dim,
        size_t pattern_id);

private:
    size_t lower_bound(float* data_ptr, size_t data_length,
                       size_t data_ptr_stride,
                       float compare_to_value);
};

#endif /* SRC_SPIKEGENERATION2DMANYIP_H_ */
138
network/CPP_NoCuda/approximation_multiplication_function.cpp
Normal file
@@ -0,0 +1,138 @@
#include <unistd.h>

#include <bitset>
#include <cassert>
#include <cctype>

#include "error_term.cpp"

// Best way to plot the bits
// std::cout << std::bitset<32>(ap_y_ptr[1]) << "\n";

// The result needs to be written back into h_pointer (which contains h)
// Don't write to w_pointer.
void approximation_multiplication_function(
    float *h_pointer, float *w_pointer, int64_t pattern_length,
    uint64_t number_of_trunc_bits, uint64_t number_of_frac_bits,
    uint32_t *ap_x_ptr, uint32_t *ap_y_ptr, uint32_t *ap_x_exponent_ptr,
    uint32_t *ap_y_exponent_ptr, uint32_t *ap_h_exponent_ptr, uint32_t ap_mask,
    uint64_t *ap_res_ptr, uint32_t *sign_temp_ptr, bool approximation_enable) {
  uint64_t counter;

  uint32_t *w_pointer_mod = (uint32_t *)w_pointer;
  uint32_t *h_pointer_mod = (uint32_t *)h_pointer;

  // Calculate the new sign
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    sign_temp_ptr[counter] = (w_pointer_mod[counter] & 0x80000000) ^
                             (h_pointer_mod[counter] & 0x80000000);
  }

  // Extract the exponent
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_x_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
  }
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_y_exponent_ptr[counter] = (w_pointer_mod[counter] << 1) >> 24;
  }

  // Cast and "normalize"
  uint64_t shift_value = 32 - number_of_frac_bits;
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_x_ptr[counter] =
        ((h_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_y_ptr[counter] =
        ((w_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
  }

  // Make the zero -g-r-e-a-t- correct again
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    if (h_pointer[counter] == 0) {
      ap_x_ptr[counter] = 0;
    }
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    if (w_pointer[counter] == 0) {
      ap_y_ptr[counter] = 0;
    }
  }

  // res = x*y
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_res_ptr[counter] = static_cast<uint64_t>(ap_x_ptr[counter]) *
                          static_cast<uint64_t>(ap_y_ptr[counter]);
  }

  uint32_t temp;
  if (approximation_enable == true) {
    // Go through the vector values
    for (counter = 0; counter < pattern_length; counter++) {
      temp = error_term(ap_y_ptr[counter], ap_x_ptr[counter], ap_mask,
                        number_of_trunc_bits);
      if (temp > ap_res_ptr[counter]) {
        ap_res_ptr[counter] = 0;
      } else {
        ap_res_ptr[counter] -= temp;
      }
    }
  }
  // Cast from int to float
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    h_pointer[counter] = static_cast<float>(ap_res_ptr[counter]);
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_h_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
  }

  // divide by the 2^number_of_frac_bits
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    ap_h_exponent_ptr[counter] -= 2 * number_of_frac_bits;
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    temp = ap_x_exponent_ptr[counter] + ap_y_exponent_ptr[counter] +
           ap_h_exponent_ptr[counter];
    if (temp > 252) {
      ap_h_exponent_ptr[counter] = temp - 252;
    } else {
      // Here I try to catch the case that the new exponent is too small
      ap_h_exponent_ptr[counter] = 0;
    }
  }

  // Remove the old exponent
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    h_pointer_mod[counter] = (h_pointer_mod[counter] << 9) >> 9;
  }

  // Install the new exponent
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    h_pointer_mod[counter] += ap_h_exponent_ptr[counter] << 23;
  }

  // Add the sign back
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++) {
    h_pointer_mod[counter] += sign_temp_ptr[counter];
  }

  return;
}
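The function above operates directly on the IEEE-754 bit pattern of each float32: the sign bit, the 8 biased exponent bits, and the mantissa with its implicit leading 1 made explicit. A small numpy sketch of that decomposition, useful for checking the shift arithmetic (it mirrors the masks and shifts above; the frac-bit count is an assumed example value, and this is not part of the library):

import numpy as np

number_of_frac_bits = 16  # assumed example value

h = np.array([0.75, -1.5, 2.0], dtype=np.float32)
bits = h.view(np.uint32)

sign = bits & np.uint32(0x80000000)                 # sign bit
exponent = (bits << np.uint32(1)) >> np.uint32(24)  # biased 8-bit exponent
# Mantissa with the implicit leading one made explicit, then cut down
# to number_of_frac_bits, exactly like the "Cast and normalize" loops:
mantissa = ((bits << np.uint32(8)) | np.uint32(0x80000000)) >> np.uint32(
    32 - number_of_frac_bits
)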
28
network/CPP_NoCuda/error_term.cpp
Normal file
@@ -0,0 +1,28 @@
#include <unistd.h>

#include <cassert>
#include <cctype>

uint32_t error_term(uint32_t a, uint32_t b, uint32_t ap_mask,
                    uint32_t number_of_trunc_bits) {
  uint32_t error_value = 0;

  uint32_t temp_shift_a = a;
  uint32_t temp_shift_b = b & ap_mask;

  uint32_t counter_trunc;
  uint32_t temp;

  // Go through the bits
  for (counter_trunc = 0; counter_trunc < number_of_trunc_bits;
       counter_trunc++) {
    temp = temp_shift_a & 1;
    if (temp == 1) {
      error_value += temp_shift_b & ap_mask;
    }
    temp_shift_a >>= 1;
    temp_shift_b <<= 1;
  }

  return error_value;
}
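For experimenting with truncation settings it can help to have the same computation available in Python. The following is a line-for-line transcription of error_term (a checking aid, not part of the build):

def error_term(a: int, b: int, ap_mask: int, number_of_trunc_bits: int) -> int:
    """Sum of the partial products that the truncated multiplier drops."""
    error_value = 0
    shift_a = a
    shift_b = b & ap_mask
    for _ in range(number_of_trunc_bits):
        if shift_a & 1:
            error_value += shift_b & ap_mask
        shift_a >>= 1
        shift_b <<= 1
    return error_value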
23
network/CPP_NoCuda/pybind11_auto_pyi.py
Normal file
@@ -0,0 +1,23 @@
# %%
# pip install pybind11-stubgen
from pybind11_stubgen import ModuleStubsGenerator  # type: ignore
import glob


def process(module_name: str) -> None:
    module = ModuleStubsGenerator(module_name)
    module.parse()
    module.write_setup_py = False

    with open(module_name + ".pyi", "w") as fp:
        fp.write("#\n# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!\n#\n\n")
        fp.write("\n".join(module.to_lines()))


Files = glob.glob("*.so")

for fid in Files:
    Idx: int = fid.find(".")
    module_name: str = fid[:Idx]
    print("Processing: " + module_name)
    process(module_name)
318
network/Conv2dApproximation.py
Normal file
@@ -0,0 +1,318 @@
import torch
import math

from network.CPP.PyMultiApp import MultiApp


class Conv2dApproximation(torch.nn.Module):

    in_channels: int | None = None
    out_channels: int | None = None
    kernel_size: list[int] | None = None
    stride: list[int] = [1, 1]
    padding: list[int] = [0, 0]
    dilation: list[int] = [1, 1]
    use_bias: bool = False

    approximation_enable: bool = False
    number_of_trunc_bits: int = -1
    number_of_frac: int = -1

    number_of_processes: int = 1

    weights: torch.nn.parameter.Parameter
    bias: torch.nn.parameter.Parameter | None

    device: torch.device
    dtype: torch.dtype

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: list[int],
        stride: list[int] = [1, 1],
        padding: list[int] = [0, 0],
        dilation: list[int] = [1, 1],
        bias: bool = True,
        approximation_enable: bool = False,
        number_of_trunc_bits: int = -1,
        number_of_frac: int = -1,
        number_of_processes: int = 1,
        device: torch.device | None = None,
        dtype: torch.dtype | None = None,
    ) -> None:
        super().__init__()

        assert device is not None
        self.device = device

        assert dtype is not None
        self.dtype = dtype

        assert len(kernel_size) == 2
        assert len(stride) == 2
        assert len(padding) == 2
        assert len(dilation) == 2

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.use_bias = bias
        self.number_of_processes = number_of_processes

        self.approximation_enable = approximation_enable
        self.number_of_trunc_bits = number_of_trunc_bits
        self.number_of_frac = number_of_frac

        if self.use_bias is True:
            self.bias: torch.nn.parameter.Parameter | None = (
                torch.nn.parameter.Parameter(
                    torch.empty(
                        (out_channels),
                        dtype=self.dtype,
                        device=self.device,
                    )
                )
            )
        else:
            self.bias = None

        self.weights: torch.nn.parameter.Parameter = torch.nn.parameter.Parameter(
            torch.empty(
                (out_channels, in_channels, *kernel_size),
                dtype=self.dtype,
                device=self.device,
            )
        )

        self.functional_multi = FunctionalMultiConv2d.apply

        self.reset_parameters()

    def reset_parameters(self) -> None:
        # Stolen from original torch conv2d code
        torch.nn.init.kaiming_uniform_(self.weights, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weights)
            if fan_in != 0:
                bound = 1 / math.sqrt(fan_in)
                torch.nn.init.uniform_(self.bias, -bound, bound)

    def calculate_output_size(self, value: torch.Tensor) -> None:

        coordinates_0, coordinates_1 = self._get_coordinates(value)

        self.output_size: torch.Tensor = torch.tensor(
            [
                coordinates_0.shape[1],
                coordinates_1.shape[1],
            ],
            dtype=torch.int64,
        )
        self.output_size.requires_grad_(False)

    def _get_coordinates(
        self, value: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Function converts parameter in coordinates
        for the convolution window"""

        assert value is not None
        assert torch.is_tensor(value) is True
        assert value.dim() == 1
        assert torch.numel(value) == 2
        assert value.dtype == torch.int64
        assert value[0] > 0
        assert value[1] > 0

        assert self.kernel_size is not None
        assert len(self.kernel_size) == 2
        assert len(self.stride) == 2
        assert len(self.dilation) == 2
        assert len(self.padding) == 2

        unfold_0: torch.nn.Unfold = torch.nn.Unfold(
            kernel_size=(int(self.kernel_size[0]), 1),
            dilation=int(self.dilation[0]),
            padding=int(self.padding[0]),
            stride=int(self.stride[0]),
        )

        unfold_1: torch.nn.Unfold = torch.nn.Unfold(
            kernel_size=(1, int(self.kernel_size[1])),
            dilation=int(self.dilation[1]),
            padding=int(self.padding[1]),
            stride=int(self.stride[1]),
        )

        coordinates_0: torch.Tensor = (
            unfold_0(
                torch.unsqueeze(
                    torch.unsqueeze(
                        torch.unsqueeze(
                            torch.arange(0, int(value[0]), dtype=torch.float32),
                            1,
                        ),
                        0,
                    ),
                    0,
                )
            )
            .squeeze(0)
            .type(torch.int64)
        )

        coordinates_1: torch.Tensor = (
            unfold_1(
                torch.unsqueeze(
                    torch.unsqueeze(
                        torch.unsqueeze(
                            torch.arange(0, int(value[1]), dtype=torch.float32),
                            0,
                        ),
                        0,
                    ),
                    0,
                )
            )
            .squeeze(0)
            .type(torch.int64)
        )

        return coordinates_0, coordinates_1

    def forward(self, input: torch.Tensor) -> torch.Tensor:

        assert input.dim() == 4

        assert self.kernel_size is not None

        input_size = torch.Tensor([int(input.shape[-2]), int(input.shape[-1])]).type(
            dtype=torch.int64
        )

        self.calculate_output_size(input_size)

        input_fold = torch.nn.functional.fold(
            torch.nn.functional.unfold(
                input.requires_grad_(True),
                tuple(self.kernel_size),
                tuple(self.dilation),
                tuple(self.padding),
                tuple(self.stride),
            ),
            output_size=(int(self.output_size[0]), int(self.output_size[1])),
            kernel_size=(1, 1),
            dilation=(1, 1),
            padding=(0, 0),
            stride=(1, 1),
        )

        weights_fold = torch.nn.functional.unfold(
            self.weights.requires_grad_(True),
            tuple(self.kernel_size),
            tuple(self.dilation),
            tuple(self.padding),
            tuple(self.stride),
        ).squeeze(-1)

        if input.device == torch.device("cpu"):
            number_of_cpu_processes: int = int(self.number_of_processes)
        else:
            number_of_cpu_processes = -1

        # Pack the scalar settings into a tensor for the autograd function
        parameter_list = torch.tensor(
            [
                int(self.approximation_enable),  # 0
                int(self.number_of_trunc_bits),  # 1
                int(self.number_of_frac),  # 2
                int(number_of_cpu_processes),  # 3
            ],
            dtype=torch.int64,
        )

        output = self.functional_multi(input_fold, weights_fold, parameter_list)

        if self.bias is not None:
            output += self.bias.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)

        return output


class FunctionalMultiConv2d(torch.autograd.Function):
    @staticmethod
    def forward(  # type: ignore
        ctx,
        input: torch.Tensor,
        weights: torch.Tensor,
        parameter_list: torch.Tensor,
    ) -> torch.Tensor:

        assert input.ndim == 4
        assert input.dtype is torch.float32
        assert input.is_contiguous() is True

        assert weights.ndim == 2
        assert weights.dtype is torch.float32
        assert weights.is_contiguous() is True

        assert input.shape[1] == weights.shape[1]

        approximation_enable = bool(parameter_list[0])
        number_of_trunc_bits = int(parameter_list[1])
        number_of_frac = int(parameter_list[2])
        number_of_processes = int(parameter_list[3])

        assert input.device == weights.device

        output = torch.empty(
            (input.shape[0], weights.shape[0], input.shape[2], input.shape[3]),
            dtype=weights.dtype,
            device=weights.device,
            requires_grad=True,
        )
        assert output.is_contiguous() is True

        multiplier: MultiApp = MultiApp()

        multiplier.update_with_init_vector_multi_pattern(
            input.data_ptr(),
            weights.data_ptr(),
            output.data_ptr(),
            int(output.shape[0]),  # pattern
            int(output.shape[1]),  # feature channel
            int(output.shape[2]),  # x
            int(output.shape[3]),  # y
            int(input.shape[1]),  # input channel
            int(number_of_processes),
            bool(approximation_enable),
            int(number_of_trunc_bits),
            int(number_of_frac),
        )

        ctx.save_for_backward(
            input.detach(),
            weights.detach(),
        )

        return output

    @staticmethod
    def backward(ctx, grad_output):

        (input, weights) = ctx.saved_tensors

        grad_input = (
            grad_output.unsqueeze(2) * weights.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
        ).sum(1)
        grad_weights = (
            (grad_output.unsqueeze(2) * input.unsqueeze(1)).sum(0).sum(-1).sum(-1)
        )
        grad_parameter_list = None

        return (grad_input, grad_weights, grad_parameter_list)
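A minimal usage sketch of the layer above, assuming the compiled PyMultiApp module is importable under network.CPP; all parameter values here are made up for illustration. The device must be the CPU, because forward() disables the CPU process count on any other device:

import torch
from network.Conv2dApproximation import Conv2dApproximation

layer = Conv2dApproximation(
    in_channels=1,
    out_channels=8,
    kernel_size=[5, 5],
    approximation_enable=True,
    number_of_trunc_bits=8,
    number_of_frac=16,
    number_of_processes=4,
    device=torch.device("cpu"),
    dtype=torch.float32,
)

output = layer(torch.rand((10, 1, 28, 28)))  # -> shape [10, 8, 24, 24]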
317
network/Dataset.py
Normal file
@@ -0,0 +1,317 @@
from abc import ABC, abstractmethod
import torch
import numpy as np
import torchvision as tv  # type: ignore
from network.Parameter import Config


class DatasetMaster(torch.utils.data.Dataset, ABC):

    path_label: str
    label_storage: np.ndarray
    pattern_storage: np.ndarray
    number_of_pattern: int
    mean: list[float]

    # Initialize
    def __init__(
        self,
        train: bool = False,
        path_pattern: str = "./",
        path_label: str = "./",
    ) -> None:
        super().__init__()

        if train is True:
            self.label_storage = np.load(path_label + "/TrainLabelStorage.npy")
        else:
            self.label_storage = np.load(path_label + "/TestLabelStorage.npy")

        if train is True:
            self.pattern_storage = np.load(path_pattern + "/TrainPatternStorage.npy")
        else:
            self.pattern_storage = np.load(path_pattern + "/TestPatternStorage.npy")

        self.number_of_pattern = self.label_storage.shape[0]

        self.mean = []

    def __len__(self) -> int:
        return self.number_of_pattern

    # Get one pattern at position index
    @abstractmethod
    def __getitem__(self, index: int) -> tuple[torch.Tensor, int]:
        pass

    @abstractmethod
    def pattern_filter_test(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
        pass

    @abstractmethod
    def pattern_filter_train(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
        pass


class DatasetMNIST(DatasetMaster):
    """Constructor"""

    # Initialize
    def __init__(
        self,
        train: bool = False,
        path_pattern: str = "./",
        path_label: str = "./",
    ) -> None:
        super().__init__(train, path_pattern, path_label)

        self.pattern_storage = np.ascontiguousarray(
            self.pattern_storage[:, np.newaxis, :, :].astype(dtype=np.float32)
        )

        self.pattern_storage /= np.max(self.pattern_storage)

        mean = self.pattern_storage.mean(3).mean(2).mean(0)
        self.mean = [*mean]

    def __getitem__(self, index: int) -> tuple[torch.Tensor, int]:

        image = self.pattern_storage[index, 0:1, :, :]
        target = int(self.label_storage[index])
        return torch.tensor(image), target

    def pattern_filter_test(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
        """0. The test image comes in
        1. is center cropped
        2. returned.

        This is a 1 channel version (e.g. one gray channel).
        """

        assert len(cfg.image_statistics.mean) == 1
        assert len(cfg.image_statistics.the_size) == 2
        assert cfg.image_statistics.the_size[0] > 0
        assert cfg.image_statistics.the_size[1] > 0

        # Transformation chain
        my_transforms: torch.nn.Sequential = torch.nn.Sequential(
            tv.transforms.CenterCrop(size=cfg.image_statistics.the_size),
        )
        scripted_transforms = torch.jit.script(my_transforms)

        # Preprocess the input data
        pattern = scripted_transforms(pattern)

        gray = pattern[:, 0:1, :, :] + 1e-20

        return gray

    def pattern_filter_train(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
        """0. The training image comes in
        1. is cropped from a random position
        2. returned.

        This is a 1 channel version (e.g. one gray channel).
        """

        assert len(cfg.image_statistics.mean) == 1
        assert len(cfg.image_statistics.the_size) == 2
        assert cfg.image_statistics.the_size[0] > 0
        assert cfg.image_statistics.the_size[1] > 0

        # Transformation chain
        my_transforms: torch.nn.Sequential = torch.nn.Sequential(
            tv.transforms.RandomCrop(size=cfg.image_statistics.the_size),
        )
        scripted_transforms = torch.jit.script(my_transforms)

        # Preprocess the input data
        pattern = scripted_transforms(pattern)

        gray = pattern[:, 0:1, :, :] + 1e-20

        return gray


class DatasetFashionMNIST(DatasetMaster):
    """Constructor"""

    # Initialize
    def __init__(
        self,
        train: bool = False,
        path_pattern: str = "./",
        path_label: str = "./",
    ) -> None:
        super().__init__(train, path_pattern, path_label)

        self.pattern_storage = np.ascontiguousarray(
            self.pattern_storage[:, np.newaxis, :, :].astype(dtype=np.float32)
        )

        self.pattern_storage /= np.max(self.pattern_storage)

        mean = self.pattern_storage.mean(3).mean(2).mean(0)
        self.mean = [*mean]

    def __getitem__(self, index: int) -> tuple[torch.Tensor, int]:

        image = self.pattern_storage[index, 0:1, :, :]
        target = int(self.label_storage[index])
        return torch.tensor(image), target

    def pattern_filter_test(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
        """0. The test image comes in
        1. is center cropped
        2. returned.

        This is a 1 channel version (e.g. one gray channel).
        """

        assert len(cfg.image_statistics.mean) == 1
        assert len(cfg.image_statistics.the_size) == 2
        assert cfg.image_statistics.the_size[0] > 0
        assert cfg.image_statistics.the_size[1] > 0

        # Transformation chain
        my_transforms: torch.nn.Sequential = torch.nn.Sequential(
            tv.transforms.CenterCrop(size=cfg.image_statistics.the_size),
        )
        scripted_transforms = torch.jit.script(my_transforms)

        # Preprocess the input data
        pattern = scripted_transforms(pattern)

        gray = pattern[:, 0:1, :, :] + 1e-20

        return gray

    def pattern_filter_train(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
        """0. The training image comes in
        1. is cropped from a random position
        2. is randomly horizontally flipped
        3. is randomly color jittered
        4. returned.

        This is a 1 channel version (e.g. one gray channel).
        """

        assert len(cfg.image_statistics.mean) == 1
        assert len(cfg.image_statistics.the_size) == 2
        assert cfg.image_statistics.the_size[0] > 0
        assert cfg.image_statistics.the_size[1] > 0

        # Transformation chain
        my_transforms: torch.nn.Sequential = torch.nn.Sequential(
            tv.transforms.RandomCrop(size=cfg.image_statistics.the_size),
            tv.transforms.RandomHorizontalFlip(p=cfg.augmentation.flip_p),
            tv.transforms.ColorJitter(
                brightness=cfg.augmentation.jitter_brightness,
                contrast=cfg.augmentation.jitter_contrast,
                saturation=cfg.augmentation.jitter_saturation,
                hue=cfg.augmentation.jitter_hue,
            ),
        )
        scripted_transforms = torch.jit.script(my_transforms)

        # Preprocess the input data
        pattern = scripted_transforms(pattern)

        gray = pattern[:, 0:1, :, :] + 1e-20

        return gray


class DatasetCIFAR(DatasetMaster):
    """Constructor"""

    # Initialize
    def __init__(
        self,
        train: bool = False,
        path_pattern: str = "./",
        path_label: str = "./",
    ) -> None:
        super().__init__(train, path_pattern, path_label)

        self.pattern_storage = np.ascontiguousarray(
            np.moveaxis(self.pattern_storage.astype(dtype=np.float32), 3, 1)
        )
        self.pattern_storage /= np.max(self.pattern_storage)

        mean = self.pattern_storage.mean(3).mean(2).mean(0)
        self.mean = [*mean]

    def __getitem__(self, index: int) -> tuple[torch.Tensor, int]:

        image = self.pattern_storage[index, :, :, :]
        target = int(self.label_storage[index])
        return torch.tensor(image), target

    def pattern_filter_test(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
        """0. The test image comes in
        1. is center cropped
        2. returned.

        This is a 3 channel version (e.g. r,g,b channels).
        """

        assert len(cfg.image_statistics.mean) == 3
        assert len(cfg.image_statistics.the_size) == 2
        assert cfg.image_statistics.the_size[0] > 0
        assert cfg.image_statistics.the_size[1] > 0

        # Transformation chain
        my_transforms: torch.nn.Sequential = torch.nn.Sequential(
            tv.transforms.CenterCrop(size=cfg.image_statistics.the_size),
        )
        scripted_transforms = torch.jit.script(my_transforms)

        # Preprocess the input data
        pattern = scripted_transforms(pattern)

        r = pattern[:, 0:1, :, :] + 1e-20
        g = pattern[:, 1:2, :, :] + 1e-20
        b = pattern[:, 2:3, :, :] + 1e-20

        new_tensor: torch.Tensor = torch.cat((r, g, b), dim=1)
        return new_tensor

    def pattern_filter_train(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
        """0. The training image comes in
        1. is cropped from a random position
        2. is randomly horizontally flipped
        3. is randomly color jittered
        4. returned.

        This is a 3 channel version (e.g. r,g,b channels).
        """
        assert len(cfg.image_statistics.mean) == 3
        assert len(cfg.image_statistics.the_size) == 2
        assert cfg.image_statistics.the_size[0] > 0
        assert cfg.image_statistics.the_size[1] > 0

        # Transformation chain
        my_transforms: torch.nn.Sequential = torch.nn.Sequential(
            tv.transforms.RandomCrop(size=cfg.image_statistics.the_size),
            tv.transforms.RandomHorizontalFlip(p=cfg.augmentation.flip_p),
            tv.transforms.ColorJitter(
                brightness=cfg.augmentation.jitter_brightness,
                contrast=cfg.augmentation.jitter_contrast,
                saturation=cfg.augmentation.jitter_saturation,
                hue=cfg.augmentation.jitter_hue,
            ),
        )
        scripted_transforms = torch.jit.script(my_transforms)

        # Preprocess the input data
        pattern = scripted_transforms(pattern)

        r = pattern[:, 0:1, :, :] + 1e-20
        g = pattern[:, 1:2, :, :] + 1e-20
        b = pattern[:, 2:3, :, :] + 1e-20

        new_tensor: torch.Tensor = torch.cat((r, g, b), dim=1)
        return new_tensor


if __name__ == "__main__":
    pass
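A minimal sketch of wiring one of these datasets into a DataLoader, assuming the *.npy storage files produced by the converter scripts (e.g. DATA_FASHION_MNIST/convert.py) already exist in the given directory:

import torch
from network.Dataset import DatasetFashionMNIST

dataset = DatasetFashionMNIST(
    train=True,
    path_pattern="./DATA_FASHION_MNIST",
    path_label="./DATA_FASHION_MNIST",
)
loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

images, targets = next(iter(loader))  # images: [64, 1, 28, 28]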
185
network/Parameter.py
Normal file
185
network/Parameter.py
Normal file
|
@ -0,0 +1,185 @@
# %%
from dataclasses import dataclass, field
import numpy as np
import torch
import os


@dataclass
class Network:
    """Parameters of the network. The details about
    its layers and the number of output neurons."""

    layer_type: list[str] = field(default_factory=list)
    forward_neuron_numbers: list[list[int]] = field(default_factory=list)
    forward_kernel_size: list[list[int]] = field(default_factory=list)
    strides: list[list[int]] = field(default_factory=list)
    dilation: list[list[int]] = field(default_factory=list)
    padding: list[list[int]] = field(default_factory=list)

    number_of_output_neurons: int = field(default=0)


@dataclass
class LearningParameters:
    """Parameters required for training"""

    learning_active: bool = field(default=True)

    loss_mode: int = field(default=0)
    loss_coeffs_mse: float = field(default=0.5)
    loss_coeffs_kldiv: float = field(default=1.0)

    optimizer_name: str = field(default="Adam")

    learning_rate_gamma_w: float = field(default=-1.0)
    learning_rate_threshold_w: float = field(default=0.00001)

    lr_schedule_name: str = field(default="ReduceLROnPlateau")
    lr_scheduler_use_performance: bool = field(default=False)
    lr_scheduler_factor_w: float = field(default=0.75)
    lr_scheduler_patience_w: int = field(default=-1)
    lr_scheduler_tau_w: int = field(default=10)

    number_of_batches_for_one_update: int = field(default=1)
    overload_path: str = field(default="Previous")

    weight_noise_range: list[float] = field(default_factory=list)
    eps_xy_intitial: float = field(default=0.1)

    # disable_scale_grade: bool = field(default=False)
    # kepp_last_grad_scale: bool = field(default=True)

    sbs_skip_gradient_calculation: list[bool] = field(default_factory=list)

    adapt_learning_rate_after_minibatch: bool = field(default=True)

    w_trainable: list[bool] = field(default_factory=list)


@dataclass
class Augmentation:
    """Parameters used for data augmentation."""

    crop_width_in_pixel: int = field(default=2)

    flip_p: float = field(default=0.5)

    jitter_brightness: float = field(default=0.5)
    jitter_contrast: float = field(default=0.1)
    jitter_saturation: float = field(default=0.1)
    jitter_hue: float = field(default=0.15)


@dataclass
class ImageStatistics:
    """(Statistical) information about the input, i.e.
    mean values and the x and y size of the input."""

    mean: list[float] = field(default_factory=list)
    the_size: list[int] = field(default_factory=list)


@dataclass
class ApproximationSetting:
    # Approximation CONV2D Layer
    approximation_enable: list[bool] = field(default_factory=list)
    number_of_trunc_bits: list[int] = field(default_factory=list)
    number_of_frac_bits: list[int] = field(default_factory=list)


@dataclass
class Config:
    """Master config class."""

    # Sub classes
    network_structure: Network = field(default_factory=Network)
    learning_parameters: LearningParameters = field(default_factory=LearningParameters)
    augmentation: Augmentation = field(default_factory=Augmentation)
    image_statistics: ImageStatistics = field(default_factory=ImageStatistics)
    approximation_setting: ApproximationSetting = field(
        default_factory=ApproximationSetting
    )

    # For labeling simulations
    # (not actively used)
    simulation_id: int = field(default=0)
    stage_id: int = field(default=-1)

    # Size of one sub-mini-batch
    # (the number of patterns processed at the same time)
    batch_size: int = field(default=500)

    # The data set
    # Identifier for Dataset.py
    data_mode: str = field(default="")
    # The path to the data set
    data_path: str = field(default="")

    # The epoch identifier
    epoch_id: int = field(default=0)
    # Maximum number of epochs
    epoch_id_max: int = field(default=10000)

    # Number of CPU threads
    number_of_cpu_processes: int = field(default=-1)
    # Adjust the number of patterns processed in
    # one step to the number of cores (or, with HT,
    # threads) of the CPU
    enable_cpu_thread_balacing: bool = field(default=True)

    # Paths for storing information
    weight_path: str = field(default="Parameters")
    log_path: str = field(default="Log")

    # Other SbS settings

    number_of_spikes: list[int] = field(default_factory=list)
    cooldown_after_number_of_spikes: int = field(default=-1)
    reduction_cooldown: float = field(default=25.0)

    epsilon_0: float = field(default=1.0)
    forgetting_offset: float = field(default=-1.0)

    def __post_init__(self) -> None:
        """Post init determines the number of cores.
        Creates the required directory and gives us an optimized
        (for the number of cores) batch size."""
        number_of_cpu_processes_temp = os.cpu_count()

        if self.number_of_cpu_processes < 1:
            if number_of_cpu_processes_temp is None:
                self.number_of_cpu_processes = 1
            else:
                self.number_of_cpu_processes = number_of_cpu_processes_temp

        os.makedirs(self.weight_path, exist_ok=True)

        if self.enable_cpu_thread_balacing is True:
            self.batch_size = (
                self.batch_size // self.number_of_cpu_processes
            ) * self.number_of_cpu_processes

        self.batch_size = np.max((self.batch_size, self.number_of_cpu_processes))
        self.batch_size = int(self.batch_size)

    def get_epsilon_t(self, number_of_spikes: int):
        """Generates the time series of the basic epsilon."""
        t = np.arange(0, number_of_spikes, dtype=np.float32) + 1
        np_epsilon_t: np.ndarray = t ** (
            -1.0 / 2.0
        )  # np.ones((number_of_spikes), dtype=np.float32)

        if (self.cooldown_after_number_of_spikes < number_of_spikes) and (
            self.cooldown_after_number_of_spikes >= 0
        ):
            np_epsilon_t[
                self.cooldown_after_number_of_spikes : number_of_spikes
            ] /= self.reduction_cooldown
        return torch.tensor(np_epsilon_t)

    def get_update_after_x_pattern(self):
        """Tells us after how many patterns we need to update the weights."""
        return (
            self.batch_size * self.learning_parameters.number_of_batches_for_one_update
        )
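A minimal usage sketch for Config (values are illustrative): get_epsilon_t produces the 1/sqrt(t) series and divides everything from the cooldown spike count onwards by reduction_cooldown.

from network.Parameter import Config

cfg = Config(data_mode="MNIST", data_path="./DATA_MNIST/")
# Note: __post_init__ creates the weight_path directory as a side effect.
cfg.cooldown_after_number_of_spikes = 4
eps_t = cfg.get_epsilon_t(number_of_spikes=8)
# eps_t[t] = (t + 1) ** -0.5 for t < 4, then divided by 25.0 afterwards
print(eps_t)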
714
network/SbS.py
Normal file
@@ -0,0 +1,714 @@
import torch

from network.CPP.PySpikeGeneration2DManyIP import SpikeGeneration2DManyIP
from network.CPP.PyHDynamicCNNManyIP import HDynamicCNNManyIP
from network.calculate_output_size import calculate_output_size


class SbS(torch.nn.Module):

    _epsilon_xy: torch.Tensor | None = None
    _epsilon_0: float
    _epsilon_t: torch.Tensor | None = None
    _weights: torch.nn.parameter.Parameter
    _weights_exists: bool = False
    _kernel_size: list[int]
    _stride: list[int]
    _dilation: list[int]
    _padding: list[int]
    _output_size: torch.Tensor
    _number_of_spikes: int
    _number_of_cpu_processes: int
    _number_of_neurons: int
    _number_of_input_neurons: int
    _epsilon_xy_intitial: float
    _h_initial: torch.Tensor | None = None
    _w_trainable: bool
    # _last_grad_scale: torch.nn.parameter.Parameter
    # _keep_last_grad_scale: bool
    # _disable_scale_grade: bool
    _forgetting_offset: torch.Tensor | None = None
    _weight_noise_range: list[float]
    _skip_gradient_calculation: bool
    _is_pooling_layer: bool
    _input_size: list[int]
    _output_layer: bool = False
    _local_learning: bool = False

    device: torch.device
    default_dtype: torch.dtype
    _gpu_tuning_factor: int

    _max_grad_weights: torch.Tensor | None = None

    _number_of_grad_weight_contributions: float = 0.0

    def __init__(
        self,
        number_of_input_neurons: int,
        number_of_neurons: int,
        input_size: list[int],
        forward_kernel_size: list[int],
        number_of_spikes: int,
        epsilon_t: torch.Tensor,
        epsilon_xy_intitial: float = 0.1,
        epsilon_0: float = 1.0,
        weight_noise_range: list[float] = [0.0, 1.0],
        is_pooling_layer: bool = False,
        strides: list[int] = [1, 1],
        dilation: list[int] = [0, 0],
        padding: list[int] = [0, 0],
        number_of_cpu_processes: int = 1,
        w_trainable: bool = False,
        # keep_last_grad_scale: bool = False,
        # disable_scale_grade: bool = True,
        forgetting_offset: float = -1.0,
        skip_gradient_calculation: bool = False,
        device: torch.device | None = None,
        default_dtype: torch.dtype | None = None,
        gpu_tuning_factor: int = 5,
    ) -> None:
        super().__init__()

        assert device is not None
        assert default_dtype is not None
        self.device = device
        self.default_dtype = default_dtype

        self._w_trainable = bool(w_trainable)
        # self._keep_last_grad_scale = bool(keep_last_grad_scale)
        self._skip_gradient_calculation = bool(skip_gradient_calculation)
        # self._disable_scale_grade = bool(disable_scale_grade)
        self._epsilon_xy_intitial = float(epsilon_xy_intitial)
        self._stride = strides
        self._dilation = dilation
        self._padding = padding
        self._kernel_size = forward_kernel_size
        self._number_of_input_neurons = int(number_of_input_neurons)
        self._number_of_neurons = int(number_of_neurons)
        self._epsilon_0 = float(epsilon_0)
        self._number_of_cpu_processes = int(number_of_cpu_processes)
        self._number_of_spikes = int(number_of_spikes)
        self._weight_noise_range = weight_noise_range
        self._is_pooling_layer = bool(is_pooling_layer)

        assert len(input_size) == 2
        self._input_size = input_size

        # The GPU hates me...
        # Too many SbS threads == bad
        # Thus I need to limit them...
        # (Reminder: We cannot access the mini-batch size here,
        # which is part of the GPU thread size calculation...)
        if (self._input_size[0] * self._input_size[1]) > gpu_tuning_factor:
            self._gpu_tuning_factor = gpu_tuning_factor
        else:
            self._gpu_tuning_factor = 0

        # self._last_grad_scale = torch.nn.parameter.Parameter(
        #     torch.tensor(-1.0, dtype=self.default_dtype),
        #     requires_grad=True,
        # )

        self._forgetting_offset = torch.tensor(
            forgetting_offset, dtype=self.default_dtype, device=self.device
        )

        self.epsilon_t = epsilon_t.type(dtype=self.default_dtype).to(device=self.device)

        self._output_size = calculate_output_size(
            value=input_size,
            kernel_size=self._kernel_size,
            stride=self._stride,
            dilation=self._dilation,
            padding=self._padding,
        )

        self.set_h_init_to_uniform()

        self.functional_sbs = FunctionalSbS.apply

        # ###############################################################
        # Initialize the weights
        # ###############################################################

        if self._is_pooling_layer is True:
            self.weights = self._make_pooling_weights()

        else:
            assert len(self._weight_noise_range) == 2
            weights = torch.empty(
                (
                    int(self._kernel_size[0])
                    * int(self._kernel_size[1])
                    * int(self._number_of_input_neurons),
                    int(self._number_of_neurons),
                ),
                dtype=self.default_dtype,
                device=self.device,
            )

            torch.nn.init.uniform_(
                weights,
                a=float(self._weight_noise_range[0]),
                b=float(self._weight_noise_range[1]),
            )
            self.weights = weights

    ####################################################################
    # Variables in and out                                             #
    ####################################################################

    @property
    def epsilon_t(self) -> torch.Tensor | None:
        return self._epsilon_t

    @epsilon_t.setter
    def epsilon_t(self, value: torch.Tensor):
        assert value is not None
        assert torch.is_tensor(value) is True
        assert value.dim() == 1
        assert value.dtype == self.default_dtype
        self._epsilon_t = (
            value.detach()
            .clone(memory_format=torch.contiguous_format)
            .type(dtype=self.default_dtype)
            .to(device=self.device)
            .requires_grad_(False)
        )

    @property
    def weights(self) -> torch.Tensor | None:
        if self._weights_exists is False:
            return None
        else:
            return self._weights

    @weights.setter
    def weights(self, value: torch.Tensor):
        assert value is not None
        assert torch.is_tensor(value) is True
        assert value.dim() == 2
        temp: torch.Tensor = (
            value.detach()
            .clone(memory_format=torch.contiguous_format)
            .type(dtype=self.default_dtype)
            .to(device=self.device)
        )
        temp /= temp.sum(dim=0, keepdim=True, dtype=self.default_dtype)
        if self._weights_exists is False:
            self._weights = torch.nn.parameter.Parameter(temp, requires_grad=True)
            self._weights_exists = True
        else:
            self._weights.data = temp

    @property
    def h_initial(self) -> torch.Tensor | None:
        return self._h_initial

    @h_initial.setter
    def h_initial(self, value: torch.Tensor):
        assert value is not None
        assert torch.is_tensor(value) is True
        assert value.dim() == 1
        assert value.dtype == self.default_dtype
        self._h_initial = (
            value.detach()
            .clone(memory_format=torch.contiguous_format)
            .type(dtype=self.default_dtype)
            .to(device=self.device)
            .requires_grad_(False)
        )

    def update_pre_care(self):

        if self._weights.grad is not None:
            assert self._number_of_grad_weight_contributions > 0
            self._weights.grad /= self._number_of_grad_weight_contributions
            self._number_of_grad_weight_contributions = 0.0

    def update_after_care(self, threshold_weight: float):

        if self._w_trainable is True:
            self.norm_weights()
            self.threshold_weights(threshold_weight)
            self.norm_weights()

    # def after_batch(self, new_state: bool = False):
    #     if self._keep_last_grad_scale is True:
    #         self._last_grad_scale.data = self._last_grad_scale.grad
    #     self._keep_last_grad_scale = new_state
    #
    #     self._last_grad_scale.grad = torch.zeros_like(self._last_grad_scale.grad)

    ####################################################################
    # Helper functions                                                 #
    ####################################################################

    def _make_pooling_weights(self) -> torch.Tensor:
        """For generating the pooling weights."""

        assert self._number_of_neurons is not None
        assert self._kernel_size is not None

        weights: torch.Tensor = torch.zeros(
            (
                int(self._kernel_size[0]),
                int(self._kernel_size[1]),
                int(self._number_of_neurons),
                int(self._number_of_neurons),
            ),
            dtype=self.default_dtype,
            device=self.device,
        )

        for i in range(0, int(self._number_of_neurons)):
            weights[:, :, i, i] = 1.0

        weights = weights.moveaxis(-1, 0).moveaxis(-1, 1)

        weights = torch.nn.functional.unfold(
            input=weights,
            kernel_size=(int(self._kernel_size[0]), int(self._kernel_size[1])),
            dilation=(1, 1),
            padding=(0, 0),
            stride=(1, 1),
        ).squeeze()

        weights = torch.moveaxis(weights, 0, 1)

        return weights

    def set_h_init_to_uniform(self) -> None:

        assert self._number_of_neurons > 2

        self.h_initial: torch.Tensor = torch.full(
            (self._number_of_neurons,),
            (1.0 / float(self._number_of_neurons)),
            dtype=self.default_dtype,
            device=self.device,
        )

    def norm_weights(self) -> None:
        assert self._weights_exists is True
        temp: torch.Tensor = (
            self._weights.data.detach()
            .clone(memory_format=torch.contiguous_format)
            .type(dtype=self.default_dtype)
            .to(device=self.device)
        )
        temp /= temp.sum(dim=0, keepdim=True, dtype=self.default_dtype)
        self._weights.data = temp

    def threshold_weights(self, threshold: float) -> None:
        assert self._weights_exists is True
        assert threshold >= 0

        torch.clamp(
            self._weights.data,
            min=float(threshold),
            max=None,
            out=self._weights.data,
        )

    ####################################################################
    # Forward                                                          #
    ####################################################################

    def forward(
        self, input: torch.Tensor, labels: torch.Tensor | None = None
    ) -> torch.Tensor:

        # Are we happy with the input?
        assert input is not None
        assert torch.is_tensor(input) is True
        assert input.dim() == 4
        assert input.dtype == self.default_dtype
        assert input.shape[1] == self._number_of_input_neurons
        assert input.shape[2] == self._input_size[0]
        assert input.shape[3] == self._input_size[1]

        # Are we happy with the rest of the network?
        assert self._epsilon_0 is not None
        assert self._epsilon_t is not None

        assert self._h_initial is not None
        assert self._forgetting_offset is not None

        assert self._weights_exists is True
        assert self._weights is not None

        input_convolved = torch.nn.functional.fold(
            torch.nn.functional.unfold(
                input.requires_grad_(True),
                kernel_size=(int(self._kernel_size[0]), int(self._kernel_size[1])),
                dilation=(int(self._dilation[0]), int(self._dilation[1])),
                padding=(int(self._padding[0]), int(self._padding[1])),
                stride=(int(self._stride[0]), int(self._stride[1])),
            ),
            output_size=tuple(self._output_size.tolist()),
            kernel_size=(1, 1),
            dilation=(1, 1),
            padding=(0, 0),
            stride=(1, 1),
        )

        epsilon_t_0: torch.Tensor = (
            (self._epsilon_t * self._epsilon_0).type(input.dtype).to(input.device)
        )

        parameter_list = torch.tensor(
            [
                int(self._w_trainable),  # 0
                int(0),  # int(self._disable_scale_grade), # 1
                int(0),  # int(self._keep_last_grad_scale), # 2
                int(self._skip_gradient_calculation),  # 3
                int(self._number_of_spikes),  # 4
                int(self._number_of_cpu_processes),  # 5
                int(self._output_size[0]),  # 6
                int(self._output_size[1]),  # 7
                int(self._gpu_tuning_factor),  # 8
                int(self._output_layer),  # 9
                int(self._local_learning),  # 10
            ],
            dtype=torch.int64,
        )

        if self._epsilon_xy is None:
            self._epsilon_xy = torch.full(
                (
                    input_convolved.shape[1],
                    input_convolved.shape[2],
                    input_convolved.shape[3],
                ),
                float(self._epsilon_xy_intitial),
                dtype=self.default_dtype,
                device=self.device,
            )

        assert self._epsilon_xy is not None
        # In the case somebody tried to replace the matrix with wrong dimensions
        assert self._epsilon_xy.shape[0] == input_convolved.shape[1]
        assert self._epsilon_xy.shape[1] == input_convolved.shape[2]
        assert self._epsilon_xy.shape[2] == input_convolved.shape[3]

        # SbS forward functional
        output = self.functional_sbs(
            input_convolved,
            self._epsilon_xy,
            epsilon_t_0,
            self._weights,
            self._h_initial,
            parameter_list,
            # self._last_grad_scale,
            self._forgetting_offset,
        )

        self._number_of_grad_weight_contributions += (
            output.shape[0] * output.shape[-2] * output.shape[-1]
        )

        return output


class FunctionalSbS(torch.autograd.Function):
    @staticmethod
    def forward(  # type: ignore
        ctx,
        input: torch.Tensor,
        epsilon_xy: torch.Tensor,
        epsilon_t_0: torch.Tensor,
        weights: torch.Tensor,
        h_initial: torch.Tensor,
        parameter_list: torch.Tensor,
        # grad_output_scale: torch.Tensor,
        forgetting_offset: torch.Tensor,
    ) -> torch.Tensor:

        assert input.dim() == 4

        number_of_spikes: int = int(parameter_list[4])

        if input.device == torch.device("cpu"):
            spike_number_of_cpu_processes: int = int(parameter_list[5])
        else:
            spike_number_of_cpu_processes = -1

        if input.device == torch.device("cpu"):
            hdyn_number_of_cpu_processes: int = int(parameter_list[5])
        else:
            hdyn_number_of_cpu_processes = -1

        output_size_0: int = int(parameter_list[6])
        output_size_1: int = int(parameter_list[7])
        gpu_tuning_factor: int = int(parameter_list[8])

        # ###########################################################
        # Spike generation
        # ###########################################################

        # ############################################
        # Normalized cumsum
        # (beware of the pytorch bug! Thus .clone()!)
        # ############################################
        input_cumsum: torch.Tensor = torch.cumsum(input, dim=1, dtype=input.dtype)
        input_cumsum_last: torch.Tensor = input_cumsum[:, -1, :, :].unsqueeze(1).clone()
        input_cumsum /= input_cumsum_last

        # ############################################
        # Get the required random numbers
        # ############################################
        random_values = torch.rand(
            size=[
                input_cumsum.shape[0],
                number_of_spikes,
                input_cumsum.shape[2],
                input_cumsum.shape[3],
            ],
            dtype=input.dtype,
            device=input.device,
        )

        # ############################################
        # Make space for the results
        # ############################################
        spikes = torch.empty_like(random_values, dtype=torch.int64, device=input.device)

        assert input_cumsum.is_contiguous() is True
        assert random_values.is_contiguous() is True
        assert spikes.is_contiguous() is True

        # time_start: float = time.perf_counter()
        spike_generation: SpikeGeneration2DManyIP = SpikeGeneration2DManyIP()

        spike_generation.spike_generation(
            input_cumsum.data_ptr(),
            int(input_cumsum.shape[0]),
            int(input_cumsum.shape[1]),
            int(input_cumsum.shape[2]),
            int(input_cumsum.shape[3]),
            random_values.data_ptr(),
            int(random_values.shape[0]),
            int(random_values.shape[1]),
            int(random_values.shape[2]),
            int(random_values.shape[3]),
            spikes.data_ptr(),
            int(spikes.shape[0]),
            int(spikes.shape[1]),
            int(spikes.shape[2]),
            int(spikes.shape[3]),
            int(spike_number_of_cpu_processes),
        )
        del random_values
        del input_cumsum

        # ###########################################################
        # H dynamic
        # ###########################################################

        assert epsilon_t_0.ndim == 1
        assert epsilon_t_0.shape[0] >= number_of_spikes

        # ############################################
        # Make space for the results
        # ############################################

        output = torch.empty(
            (
                int(input.shape[0]),
                int(weights.shape[1]),
                output_size_0,
                output_size_1,
            ),
            dtype=input.dtype,
            device=input.device,
        )

        assert output.is_contiguous() is True
        assert epsilon_xy.is_contiguous() is True
        assert epsilon_t_0.is_contiguous() is True
        assert weights.is_contiguous() is True
        assert spikes.is_contiguous() is True
        assert h_initial.is_contiguous() is True

        assert epsilon_xy.ndim == 3
        assert weights.ndim == 2
        assert h_initial.ndim == 1

        h_dynamic: HDynamicCNNManyIP = HDynamicCNNManyIP()

        h_dynamic.update(
            output.data_ptr(),
            int(output.shape[0]),
            int(output.shape[1]),
            int(output.shape[2]),
            int(output.shape[3]),
            epsilon_xy.data_ptr(),
            int(epsilon_xy.shape[0]),
            int(epsilon_xy.shape[1]),
            int(epsilon_xy.shape[2]),
            epsilon_t_0.data_ptr(),
            int(epsilon_t_0.shape[0]),
            weights.data_ptr(),
            int(weights.shape[0]),
            int(weights.shape[1]),
            spikes.data_ptr(),
            int(spikes.shape[0]),
            int(spikes.shape[1]),
            int(spikes.shape[2]),
            int(spikes.shape[3]),
            h_initial.data_ptr(),
            int(h_initial.shape[0]),
            hdyn_number_of_cpu_processes,
            float(forgetting_offset.item()),
            int(gpu_tuning_factor),
        )
        del spikes

        # ###########################################################
        # Save the necessary data for the backward pass
        # ###########################################################

        ctx.save_for_backward(
            input,
            weights,
            output,
            parameter_list,
            # grad_output_scale,
        )

        return output

    @staticmethod
    def backward(ctx, grad_output):
        # ##############################################
        # Get the variables back
        # ##############################################
        (
            input,
            weights,
            output,
            parameter_list,
            # last_grad_scale,
        ) = ctx.saved_tensors

        # ##############################################
        # Default output
        # ##############################################
        grad_input = None
        grad_eps_xy = None
        grad_epsilon_t_0 = None
        grad_weights = None
        grad_h_initial = None
        grad_parameter_list = None
        grad_forgetting_offset = None

        # ##############################################
        # Parameters
        # ##############################################
        parameter_w_trainable: bool = bool(parameter_list[0])
        # parameter_disable_scale_grade: bool = bool(parameter_list[1])
        # parameter_keep_last_grad_scale: bool = bool(parameter_list[2])
        parameter_skip_gradient_calculation: bool = bool(parameter_list[3])
        parameter_output_layer: bool = bool(parameter_list[9])
        parameter_local_learning: bool = bool(parameter_list[10])

        # ##############################################
        # Dealing with overall scale of the gradient
        # ##############################################
        # if parameter_disable_scale_grade is False:
        #     if parameter_keep_last_grad_scale is True:
        #         last_grad_scale = torch.tensor(
        #             [torch.abs(grad_output).max(), last_grad_scale]
        #         ).max()
        #     grad_output /= last_grad_scale
        # grad_output_scale = last_grad_scale.clone()

        input /= input.sum(dim=1, keepdim=True, dtype=weights.dtype)

        # #################################################
        # User doesn't want us to calculate the gradients
        # #################################################

        if parameter_skip_gradient_calculation is True:

            return (
                grad_input,
                grad_eps_xy,
                grad_epsilon_t_0,
                grad_weights,
                grad_h_initial,
                grad_parameter_list,
                # grad_output_scale,
                grad_forgetting_offset,
            )

        # #################################################
        # Calculate backprop error (grad_input)
        # #################################################

        backprop_r: torch.Tensor = weights.unsqueeze(0).unsqueeze(-1).unsqueeze(
            -1
        ) * output.unsqueeze(1)

        backprop_bigr: torch.Tensor = backprop_r.sum(dim=2)

        backprop_z: torch.Tensor = backprop_r * (
            1.0 / (backprop_bigr + 1e-20)
        ).unsqueeze(2)
        grad_input: torch.Tensor = (backprop_z * grad_output.unsqueeze(1)).sum(2)
        del backprop_z

        # #################################################
        # Calculate weight gradient (grad_weights)
        # #################################################

        if parameter_w_trainable is False:

            # #################################################
            # We don't train this weight
            # #################################################
            grad_weights = None

        elif (parameter_output_layer is False) and (parameter_local_learning is True):
            # #################################################
            # Local learning
            # #################################################
            grad_weights = (
                (-2 * (input - backprop_bigr).unsqueeze(2) * output.unsqueeze(1))
                .sum(0)
                .sum(-1)
                .sum(-1)
            )

        else:
            # #################################################
            # Backprop
            # #################################################
            backprop_f: torch.Tensor = output.unsqueeze(1) * (
                input / (backprop_bigr**2 + 1e-20)
            ).unsqueeze(2)

            result_omega: torch.Tensor = backprop_bigr.unsqueeze(
                2
            ) * grad_output.unsqueeze(1)
            result_omega -= (backprop_r * grad_output.unsqueeze(1)).sum(2).unsqueeze(2)
            result_omega *= backprop_f
            del backprop_f
            grad_weights = result_omega.sum(0).sum(-1).sum(-1)
            del result_omega

        del backprop_bigr
        del backprop_r

        return (
            grad_input,
            grad_eps_xy,
            grad_epsilon_t_0,
            grad_weights,
            grad_h_initial,
            grad_parameter_list,
            # grad_output_scale,
            grad_forgetting_offset,
        )
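The C++ spike generator above does inverse-transform sampling: for every position and spike it draws an input-channel index with probability proportional to the input activity, consuming the normalized cumulative sum and the uniform random numbers that FunctionalSbS.forward prepares. A pure-PyTorch sketch of the same idea (for illustration only; the extension exists because this is the hot loop, and its exact internals may differ):

import torch

input = torch.rand((2, 5, 4, 4))  # (batch, channel, x, y)
number_of_spikes = 10

input_cumsum = torch.cumsum(input, dim=1)
input_cumsum /= input_cumsum[:, -1:, :, :].clone()

random_values = torch.rand((2, number_of_spikes, 4, 4))

# torch.searchsorted needs the searched dimension last:
cdf = input_cumsum.movedim(1, -1).contiguous()  # (batch, x, y, channel)
u = random_values.movedim(1, -1).contiguous()   # (batch, x, y, spike)
spikes = torch.searchsorted(cdf, u).movedim(-1, 1)  # channel index per spike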
106
network/SbSLRScheduler.py
Normal file
@@ -0,0 +1,106 @@
import torch


class SbSLRScheduler(torch.optim.lr_scheduler.ReduceLROnPlateau):
    def __init__(
        self,
        optimizer,
        mode: str = "min",
        factor: float = 0.1,
        patience: int = 10,
        threshold: float = 1e-4,
        threshold_mode: str = "rel",
        cooldown: int = 0,
        min_lr: float = 0,
        eps: float = 1e-8,
        verbose: bool = False,
        tau: float = 10,
    ) -> None:

        super().__init__(
            optimizer=optimizer,
            mode=mode,
            factor=factor,
            patience=patience,
            threshold=threshold,
            threshold_mode=threshold_mode,
            cooldown=cooldown,
            min_lr=min_lr,
            eps=eps,
            verbose=verbose,
        )
        self.lowpass_tau: float = tau
        self.lowpass_decay_value: float = 1.0 - (1.0 / self.lowpass_tau)

        self.lowpass_number_of_steps: int = 0
        self.loss_maximum_over_time: float | None = None
        self.lowpass_memory: float = 0.0
        self.lowpass_learning_rate_minimum_over_time: float | None = None
        self.lowpass_learning_rate_minimum_over_time_past_step: float | None = None

        self.previous_learning_rate: float | None = None
        self.loss_normalized_past_step: float | None = None

    def step(self, metrics, epoch=None) -> None:

        loss = float(metrics)

        if self.loss_maximum_over_time is None:
            self.loss_maximum_over_time = loss

        if self.loss_normalized_past_step is None:
            self.loss_normalized_past_step = loss / self.loss_maximum_over_time

        if self.previous_learning_rate is None:
            self.previous_learning_rate = self.optimizer.param_groups[-1]["lr"]  # type: ignore

        # The parent lr scheduler controls the basic learning rate
        self.previous_learning_rate = self.optimizer.param_groups[-1]["lr"]  # type: ignore
        super().step(metrics=self.loss_normalized_past_step, epoch=epoch)

        # If the parent changes the base learning rate,
        # then we reset the adaptive part
        if self.optimizer.param_groups[-1]["lr"] != self.previous_learning_rate:  # type: ignore
            self.previous_learning_rate = self.optimizer.param_groups[-1]["lr"]  # type: ignore

            self.lowpass_number_of_steps = 0
            self.loss_maximum_over_time = None
            self.lowpass_memory = 0.0
            self.lowpass_learning_rate_minimum_over_time = None
            self.lowpass_learning_rate_minimum_over_time_past_step = None

        if self.loss_maximum_over_time is None:
            self.loss_maximum_over_time = loss
        else:
            self.loss_maximum_over_time = max(self.loss_maximum_over_time, loss)

        self.lowpass_number_of_steps += 1

        self.lowpass_memory = self.lowpass_memory * self.lowpass_decay_value + (
            loss / self.loss_maximum_over_time
        ) * (1.0 / self.lowpass_tau)

        loss_normalized: float = self.lowpass_memory / (
            1.0 - self.lowpass_decay_value ** float(self.lowpass_number_of_steps)
        )

        if self.lowpass_learning_rate_minimum_over_time is None:
            self.lowpass_learning_rate_minimum_over_time = loss_normalized
        else:
            self.lowpass_learning_rate_minimum_over_time = min(
                self.lowpass_learning_rate_minimum_over_time, loss_normalized
            )

        if self.lowpass_learning_rate_minimum_over_time_past_step is None:
            self.lowpass_learning_rate_minimum_over_time_past_step = (
                self.lowpass_learning_rate_minimum_over_time
            )

        self.optimizer.param_groups[-1]["lr"] *= (  # type: ignore
            self.lowpass_learning_rate_minimum_over_time
            / self.lowpass_learning_rate_minimum_over_time_past_step
        )
        self.lowpass_learning_rate_minimum_over_time_past_step = (
            self.lowpass_learning_rate_minimum_over_time
        )
        self.loss_normalized_past_step = loss_normalized
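The lowpass above is an exponential moving average with bias correction, the same warm-up correction Adam applies to its moment estimates: with decay d = 1 - 1/tau, the raw average of a constant signal c reaches only c * (1 - d**n) after n steps, so dividing by (1 - d**n) removes the startup bias. A minimal sketch of the arithmetic:

tau = 10.0
d = 1.0 - 1.0 / tau  # lowpass_decay_value

memory = 0.0
for n, loss in enumerate([1.0, 1.0, 1.0], start=1):
    memory = memory * d + loss * (1.0 / tau)
    corrected = memory / (1.0 - d**n)
    print(corrected)  # 1.0 from the very first step on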
54
network/SplitOnOffLayer.py
Normal file
@@ -0,0 +1,54 @@
import torch


class SplitOnOffLayer(torch.nn.Module):

    device: torch.device
    default_dtype: torch.dtype

    mean: torch.Tensor | None = None
    epsilon: float = 0.01

    def __init__(
        self,
        device: torch.device | None = None,
        default_dtype: torch.dtype | None = None,
    ) -> None:
        super().__init__()

        assert device is not None
        assert default_dtype is not None
        self.device = device
        self.default_dtype = default_dtype

    ####################################################################
    # Forward                                                          #
    ####################################################################

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        assert input.ndim == 4

        # self.training is switched by network.eval() and network.train()
        if self.training is True:
            mean_temp = (
                input.mean(dim=0, keepdim=True)
                .mean(dim=1, keepdim=True)
                .detach()
                .clone()
            )

            if self.mean is None:
                self.mean = mean_temp
            else:
                self.mean = (1.0 - self.epsilon) * self.mean + self.epsilon * mean_temp

        assert self.mean is not None

        temp = input - self.mean.detach().clone()
        temp_a = torch.nn.functional.relu(temp)
        temp_b = torch.nn.functional.relu(-temp)
        output = torch.cat((temp_a, temp_b), dim=1)

        output /= output.sum(dim=1, keepdim=True) + 1e-20

        return output
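A worked example for the on/off split (shapes are illustrative): each input channel becomes two output channels, one carrying the positive and one the negative deviation from the slowly updated running mean, normalized to sum to one over the channel dimension.

import torch

layer = SplitOnOffLayer(device=torch.device("cpu"), default_dtype=torch.float32)
layer.train()  # enables the running-mean update

x = torch.rand((8, 3, 28, 28))  # (batch, channel, x, y)
y = layer(x)  # -> (8, 6, 28, 28); y.sum(dim=1) is ~1 everywhere
print(y.shape, y.sum(dim=1).mean())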
68
network/build_datasets.py
Normal file
@@ -0,0 +1,68 @@
# %%
import torch
from network.Dataset import (
    DatasetMaster,
    DatasetCIFAR,
    DatasetMNIST,
    DatasetFashionMNIST,
)
from network.Parameter import Config


def build_datasets(
    cfg: Config,
) -> tuple[
    DatasetMaster,
    DatasetMaster,
    torch.utils.data.DataLoader,
    torch.utils.data.DataLoader,
]:

    # Load the input data
    the_dataset_train: DatasetMaster
    the_dataset_test: DatasetMaster
    if cfg.data_mode == "CIFAR10":
        the_dataset_train = DatasetCIFAR(
            train=True, path_pattern=cfg.data_path, path_label=cfg.data_path
        )
        the_dataset_test = DatasetCIFAR(
            train=False, path_pattern=cfg.data_path, path_label=cfg.data_path
        )
    elif cfg.data_mode == "MNIST":
        the_dataset_train = DatasetMNIST(
            train=True, path_pattern=cfg.data_path, path_label=cfg.data_path
        )
        the_dataset_test = DatasetMNIST(
            train=False, path_pattern=cfg.data_path, path_label=cfg.data_path
        )
    elif cfg.data_mode == "MNIST_FASHION":
        the_dataset_train = DatasetFashionMNIST(
            train=True, path_pattern=cfg.data_path, path_label=cfg.data_path
        )
        the_dataset_test = DatasetFashionMNIST(
            train=False, path_pattern=cfg.data_path, path_label=cfg.data_path
        )
    else:
        raise Exception("data_mode unknown")

    if len(cfg.image_statistics.mean) == 0:
        cfg.image_statistics.mean = the_dataset_train.mean

    # The basic size
    cfg.image_statistics.the_size = [
        the_dataset_train.pattern_storage.shape[2],
        the_dataset_train.pattern_storage.shape[3],
    ]

    # Minus the stuff we cut away in the pattern filter
    cfg.image_statistics.the_size[0] -= 2 * cfg.augmentation.crop_width_in_pixel
    cfg.image_statistics.the_size[1] -= 2 * cfg.augmentation.crop_width_in_pixel

    my_loader_test: torch.utils.data.DataLoader = torch.utils.data.DataLoader(
        the_dataset_test, batch_size=cfg.batch_size, shuffle=False
    )
    my_loader_train: torch.utils.data.DataLoader = torch.utils.data.DataLoader(
        the_dataset_train, batch_size=cfg.batch_size, shuffle=True
    )

    return the_dataset_train, the_dataset_test, my_loader_test, my_loader_train
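A minimal usage sketch (the path is illustrative): data_mode selects one of the three dataset classes, and note the return order, with the test loader coming before the train loader.

from network.Parameter import Config
from network.build_datasets import build_datasets

cfg = Config(data_mode="MNIST_FASHION", data_path="./DATA_FASHION_MNIST/")
dataset_train, dataset_test, loader_test, loader_train = build_datasets(cfg)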
86
network/build_lr_scheduler.py
Normal file
@@ -0,0 +1,86 @@
# %%
# Defer annotation evaluation: the return annotation below names
# SbSLRScheduler even when its import failed.
from __future__ import annotations

import torch
from network.Parameter import Config

try:
    from network.SbSLRScheduler import SbSLRScheduler

    sbs_lr_scheduler: bool = True
except Exception:
    sbs_lr_scheduler = False


def build_lr_scheduler(
    optimizer, cfg: Config, logging
) -> list[torch.optim.lr_scheduler.ReduceLROnPlateau | SbSLRScheduler | None]:

    assert len(optimizer) > 0

    lr_scheduler_list: list[
        torch.optim.lr_scheduler.ReduceLROnPlateau | SbSLRScheduler | None
    ] = []

    for id_optimizer in range(0, len(optimizer)):

        if cfg.learning_parameters.lr_schedule_name == "None":
            logging.info(f"Using lr scheduler for optimizer {id_optimizer}: None")

            lr_scheduler_list.append(None)

        elif cfg.learning_parameters.lr_schedule_name == "ReduceLROnPlateau":
            logging.info(
                f"Using lr scheduler for optimizer {id_optimizer}: ReduceLROnPlateau"
            )

            if optimizer[id_optimizer] is None:
                lr_scheduler_list.append(None)
            elif (cfg.learning_parameters.lr_scheduler_factor_w <= 0) or (
                cfg.learning_parameters.lr_scheduler_patience_w <= 0
            ):
                lr_scheduler_list.append(
                    torch.optim.lr_scheduler.ReduceLROnPlateau(
                        optimizer[id_optimizer],
                        eps=1e-14,
                    )
                )
            else:
                lr_scheduler_list.append(
                    torch.optim.lr_scheduler.ReduceLROnPlateau(
                        optimizer[id_optimizer],
                        factor=cfg.learning_parameters.lr_scheduler_factor_w,
                        patience=cfg.learning_parameters.lr_scheduler_patience_w,
                        eps=1e-14,
                    )
                )

        elif cfg.learning_parameters.lr_schedule_name == "SbSLRScheduler":
            logging.info(
                f"Using lr scheduler for optimizer {id_optimizer}: SbSLRScheduler"
            )

            if sbs_lr_scheduler is False:
                raise Exception(
                    f"lr_scheduler for optimizer {id_optimizer}: SbSLRScheduler.py missing"
                )

            if optimizer[id_optimizer] is None:
                lr_scheduler_list.append(None)
            elif (
                (cfg.learning_parameters.lr_scheduler_factor_w <= 0)
                or (cfg.learning_parameters.lr_scheduler_patience_w <= 0)
                or (cfg.learning_parameters.lr_scheduler_tau_w <= 0)
            ):
                lr_scheduler_list.append(None)
            else:
                lr_scheduler_list.append(
                    SbSLRScheduler(
                        optimizer[id_optimizer],
                        factor=cfg.learning_parameters.lr_scheduler_factor_w,
                        patience=cfg.learning_parameters.lr_scheduler_patience_w,
                        tau=cfg.learning_parameters.lr_scheduler_tau_w,
                    )
                )

        else:
            raise Exception("lr_scheduler not implemented")

    return lr_scheduler_list
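A minimal usage sketch, assuming a single already-built optimizer (the dummy parameter below is only for illustration): each optimizer in the list gets its own scheduler entry, or None where scheduling is disabled.

import logging
import torch
from network.Parameter import Config
from network.build_lr_scheduler import build_lr_scheduler

cfg = Config()
cfg.learning_parameters.lr_scheduler_patience_w = 10  # default -1 falls back to default ReduceLROnPlateau arguments
optimizer = [torch.optim.Adam([torch.nn.Parameter(torch.zeros(10))])]
lr_scheduler = build_lr_scheduler(optimizer, cfg, logging)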
354
network/build_network.py
Normal file
@@ -0,0 +1,354 @@
|
||||||
|
# %%
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from network.calculate_output_size import calculate_output_size
|
||||||
|
from network.Parameter import Config
|
||||||
|
from network.SbS import SbS
|
||||||
|
from network.SplitOnOffLayer import SplitOnOffLayer
|
||||||
|
from network.Conv2dApproximation import Conv2dApproximation
|
||||||
|
|
||||||
|
|
||||||
|
def build_network(
|
||||||
|
cfg: Config, device: torch.device, default_dtype: torch.dtype, logging
|
||||||
|
) -> torch.nn.Sequential:
|
||||||
|
network = torch.nn.Sequential()
|
||||||
|
input_size: list[list[int]] = []
|
||||||
|
input_size.append(cfg.image_statistics.the_size)
|
||||||
|
|
||||||
|
for layer_id in range(0, len(cfg.network_structure.layer_type)):
|
||||||
|
# #############################################################
|
||||||
|
# Show infos about the layer:
|
||||||
|
# #############################################################
|
||||||
|
logging.info("")
|
||||||
|
logging.info(f"Layer ID: {layer_id}")
|
||||||
|
logging.info(f"Layer type: {cfg.network_structure.layer_type[layer_id]}")
|
||||||
|
|
||||||
|
# #############################################################
|
||||||
|
# Fill in the default values
|
||||||
|
# #############################################################
|
||||||
|
|
||||||
|
kernel_size: list[int] = [1, 1]
|
||||||
|
if len(cfg.network_structure.forward_kernel_size) > layer_id:
|
||||||
|
kernel_size = cfg.network_structure.forward_kernel_size[layer_id]
|
||||||
|
|
||||||
|
padding: list[int] = [0, 0]
|
||||||
|
if len(cfg.network_structure.padding) > layer_id:
|
||||||
|
padding = cfg.network_structure.padding[layer_id]
|
||||||
|
|
||||||
|
dilation: list[int] = [1, 1]
|
||||||
|
if len(cfg.network_structure.dilation) > layer_id:
|
||||||
|
dilation = cfg.network_structure.dilation[layer_id]
|
||||||
|
|
||||||
|
strides: list[int] = [1, 1]
|
||||||
|
if len(cfg.network_structure.strides) > layer_id:
|
||||||
|
if len(cfg.network_structure.strides[layer_id]) == 2:
|
||||||
|
strides = cfg.network_structure.strides[layer_id]
|
||||||
|
|
||||||
|
in_channels: int = -1
|
||||||
|
out_channels: int = -1
|
||||||
|
if len(cfg.network_structure.forward_neuron_numbers) > layer_id:
|
||||||
|
if len(cfg.network_structure.forward_neuron_numbers[layer_id]) == 2:
|
||||||
|
in_channels = cfg.network_structure.forward_neuron_numbers[layer_id][0]
|
||||||
|
out_channels = cfg.network_structure.forward_neuron_numbers[layer_id][1]
|
||||||
|
|
||||||
|
weight_noise_range: list[float] = [1.0, 1.1]
|
||||||
|
if len(cfg.learning_parameters.weight_noise_range) == 2:
|
||||||
|
weight_noise_range = [
|
||||||
|
float(cfg.learning_parameters.weight_noise_range[0]),
|
||||||
|
float(cfg.learning_parameters.weight_noise_range[1]),
|
||||||
|
]
|
||||||
|
|
||||||
|
logging.info(f"Input channels: {in_channels}")
|
||||||
|
logging.info(f"Output channels: {out_channels}")
|
||||||
|
logging.info(f"Kernel size: {kernel_size}")
|
||||||
|
logging.info(f"Stride: {strides}")
|
||||||
|
logging.info(f"Dilation: {dilation}")
|
||||||
|
logging.info(f"Padding: {padding}")
|
||||||
|
|
||||||
|
# Conv2D
|
||||||
|
bias: bool = True
|
||||||
|
|
||||||
|
# Approx settings
|
||||||
|
approximation_enable: bool = False
|
||||||
|
if len(cfg.approximation_setting.approximation_enable) > layer_id:
|
||||||
|
approximation_enable = cfg.approximation_setting.approximation_enable[
|
||||||
|
layer_id
|
||||||
|
]
|
||||||
|
logging.info(f"Approximation Enable: {approximation_enable}")
|
||||||
|
elif len(cfg.approximation_setting.approximation_enable) == 1:
|
||||||
|
approximation_enable = cfg.approximation_setting.approximation_enable[0]
|
||||||
|
logging.info(f"Approximation Enable: {approximation_enable}")
|
||||||
|
|
||||||
|
number_of_trunc_bits: int = -1
|
||||||
|
if len(cfg.approximation_setting.number_of_trunc_bits) > layer_id:
|
||||||
|
number_of_trunc_bits = cfg.approximation_setting.number_of_trunc_bits[
|
||||||
|
layer_id
|
||||||
|
]
|
||||||
|
logging.info(f"Number of trunc bits: {number_of_trunc_bits}")
|
||||||
|
elif len(cfg.approximation_setting.number_of_trunc_bits) == 1:
|
||||||
|
number_of_trunc_bits = cfg.approximation_setting.number_of_trunc_bits[0]
|
||||||
|
logging.info(f"Number of trunc bits: {number_of_trunc_bits}")
|
||||||
|
|
||||||
|
number_of_frac_bits: int = -1
|
||||||
|
if len(cfg.approximation_setting.number_of_frac_bits) > layer_id:
|
||||||
|
number_of_frac_bits = cfg.approximation_setting.number_of_frac_bits[
|
||||||
|
layer_id
|
||||||
|
]
|
||||||
|
logging.info(f"Number of frac bits: {number_of_trunc_bits}")
|
||||||
|
elif len(cfg.approximation_setting.number_of_frac_bits) == 1:
|
||||||
|
number_of_frac_bits = cfg.approximation_setting.number_of_frac_bits[0]
|
||||||
|
logging.info(f"Number of frac bits: {number_of_trunc_bits}")
|
||||||
|
|
||||||
|
# Weights: Trainable?
|
||||||
|
w_trainable: bool = False
|
||||||
|
if len(cfg.learning_parameters.w_trainable) > layer_id:
|
||||||
|
w_trainable = cfg.learning_parameters.w_trainable[layer_id]
|
||||||
|
elif len(cfg.learning_parameters.w_trainable) == 1:
|
||||||
|
w_trainable = cfg.learning_parameters.w_trainable[0]
|
||||||
|
logging.info(f"W trainable?: {w_trainable}")
|
||||||
|
|
||||||
|
# SbS Setting
|
||||||
|
sbs_skip_gradient_calculation: bool = False
|
||||||
|
if len(cfg.learning_parameters.sbs_skip_gradient_calculation) > layer_id:
|
||||||
|
sbs_skip_gradient_calculation = (
|
||||||
|
cfg.learning_parameters.sbs_skip_gradient_calculation[layer_id]
|
||||||
|
)
|
||||||
|
elif len(cfg.learning_parameters.sbs_skip_gradient_calculation) == 1:
|
||||||
|
sbs_skip_gradient_calculation = (
|
||||||
|
cfg.learning_parameters.sbs_skip_gradient_calculation[0]
|
||||||
|
)
|
||||||
|
|
||||||
|
# #############################################################
|
||||||
|
# SbS layer:
|
||||||
|
# #############################################################
|
||||||
|
|
||||||
|
if cfg.network_structure.layer_type[layer_id].upper().startswith("SBS") is True:
|
||||||
|
|
||||||
|
assert in_channels > 0
|
||||||
|
assert out_channels > 0
|
||||||
|
|
||||||
|
number_of_spikes: int = -1
|
||||||
|
if len(cfg.number_of_spikes) > layer_id:
|
||||||
|
number_of_spikes = cfg.number_of_spikes[layer_id]
|
||||||
|
elif len(cfg.number_of_spikes) == 1:
|
||||||
|
number_of_spikes = cfg.number_of_spikes[0]
|
||||||
|
|
||||||
|
assert number_of_spikes > 0
|
||||||
|
|
||||||
|
logging.info(
|
||||||
|
f"Layer: {layer_id} -> SbS Layer with {number_of_spikes} spikes"
|
||||||
|
)
|
||||||
|
is_pooling_layer: bool = False
|
||||||
|
if cfg.network_structure.layer_type[layer_id].upper().find("POOLING") != -1:
|
||||||
|
is_pooling_layer = True
|
||||||
|
|
||||||
|
network.append(
|
||||||
|
SbS(
|
||||||
|
number_of_input_neurons=in_channels,
|
||||||
|
number_of_neurons=out_channels,
|
||||||
|
input_size=input_size[-1],
|
||||||
|
forward_kernel_size=kernel_size,
|
||||||
|
number_of_spikes=number_of_spikes,
|
||||||
|
epsilon_t=cfg.get_epsilon_t(number_of_spikes),
|
||||||
|
epsilon_xy_intitial=cfg.learning_parameters.eps_xy_intitial,
|
||||||
|
epsilon_0=cfg.epsilon_0,
|
||||||
|
weight_noise_range=weight_noise_range,
|
||||||
|
is_pooling_layer=is_pooling_layer,
|
||||||
|
strides=strides,
|
||||||
|
dilation=dilation,
|
||||||
|
padding=padding,
|
||||||
|
number_of_cpu_processes=cfg.number_of_cpu_processes,
|
||||||
|
w_trainable=w_trainable,
|
||||||
|
# keep_last_grad_scale=cfg.learning_parameters.kepp_last_grad_scale,
|
||||||
|
# disable_scale_grade=cfg.learning_parameters.disable_scale_grade,
|
||||||
|
forgetting_offset=cfg.forgetting_offset,
|
||||||
|
skip_gradient_calculation=sbs_skip_gradient_calculation,
|
||||||
|
device=device,
|
||||||
|
default_dtype=default_dtype,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# Adding the x,y output dimensions
|
||||||
|
input_size.append(network[-1]._output_size.tolist())
|
||||||
|
|
||||||
|
network[-1]._output_layer = False
|
||||||
|
if layer_id == len(cfg.network_structure.layer_type) - 1:
|
||||||
|
network[-1]._output_layer = True
|
||||||
|
|
||||||
|
network[-1]._local_learning = False
|
||||||
|
if cfg.network_structure.layer_type[layer_id].upper().find("LOCAL") != -1:
|
||||||
|
network[-1]._local_learning = True
|
||||||
|
|
||||||
|
# #############################################################
|
||||||
|
# Split On Off Layer:
|
||||||
|
# #############################################################
|
||||||
|
elif (
|
||||||
|
cfg.network_structure.layer_type[layer_id].upper().startswith("ONOFF")
|
||||||
|
is True
|
||||||
|
):
|
||||||
|
logging.info(f"Layer: {layer_id} -> Split On Off Layer")
|
||||||
|
network.append(
|
||||||
|
SplitOnOffLayer(
|
||||||
|
device=device,
|
||||||
|
default_dtype=default_dtype,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
input_size.append(input_size[-1])
|
||||||
|
|
||||||
|
# #############################################################
|
||||||
|
# PyTorch CONV2D layer:
|
||||||
|
# #############################################################
|
||||||
|
elif (
|
||||||
|
cfg.network_structure.layer_type[layer_id].upper().startswith("CONV2D")
|
||||||
|
is True
|
||||||
|
):
|
||||||
|
assert in_channels > 0
|
||||||
|
assert out_channels > 0
|
||||||
|
|
||||||
|
logging.info(f"Layer: {layer_id} -> CONV2D Layer")
|
||||||
            network.append(
                torch.nn.Conv2d(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=(int(kernel_size[0]), int(kernel_size[1])),
                    stride=(int(strides[0]), int(strides[1])),
                    dilation=(int(dilation[0]), int(dilation[1])),
                    bias=bias,
                    padding=(int(padding[0]), int(padding[1])),
                    device=device,
                    dtype=default_dtype,
                )
            )

            # I need this later...
            network[-1]._w_trainable = w_trainable

            # Calculate the x,y output dimensions
            input_size_temp = calculate_output_size(
                value=input_size[-1],
                kernel_size=kernel_size,
                stride=strides,
                dilation=dilation,
                padding=padding,
            ).tolist()
            input_size.append(input_size_temp)

        # #############################################################
        # PyTorch RELU layer:
        # #############################################################

        elif (
            cfg.network_structure.layer_type[layer_id].upper().startswith("RELU")
            is True
        ):
            logging.info(f"Layer: {layer_id} -> RELU Layer")
            network.append(torch.nn.ReLU())
            input_size.append(input_size[-1])

        # #############################################################
        # PyTorch MAX Pooling layer:
        # #############################################################

        elif (
            cfg.network_structure.layer_type[layer_id].upper().startswith("MAX POOLING")
            is True
        ):
            logging.info(f"Layer: {layer_id} -> MAX POOLING Layer")
            network.append(
                torch.nn.MaxPool2d(
                    kernel_size=(int(kernel_size[0]), int(kernel_size[1])),
                    stride=(int(strides[0]), int(strides[1])),
                    padding=(int(padding[0]), int(padding[1])),
                    dilation=(int(dilation[0]), int(dilation[1])),
                )
            )

            # Calculate the x,y output dimensions
            input_size_temp = calculate_output_size(
                value=input_size[-1],
                kernel_size=kernel_size,
                stride=strides,
                dilation=dilation,
                padding=padding,
            ).tolist()
            input_size.append(input_size_temp)

        # #############################################################
        # PyTorch Average Pooling layer:
        # #############################################################
        elif (
            cfg.network_structure.layer_type[layer_id]
            .upper()
            .startswith("AVERAGE POOLING")
            is True
        ):
            logging.info(f"Layer: {layer_id} -> AVERAGE POOLING Layer")
            network.append(
                torch.nn.AvgPool2d(
                    kernel_size=(int(kernel_size[0]), int(kernel_size[1])),
                    stride=(int(strides[0]), int(strides[1])),
                    padding=(int(padding[0]), int(padding[1])),
                )
            )
            # Calculate the x,y output dimensions
            input_size_temp = calculate_output_size(
                value=input_size[-1],
                kernel_size=kernel_size,
                stride=strides,
                dilation=dilation,
                padding=padding,
            ).tolist()
            input_size.append(input_size_temp)

        # #############################################################
        # Approx CONV2D layer:
        # #############################################################
        elif (
            cfg.network_structure.layer_type[layer_id]
            .upper()
            .startswith("APPROX CONV2D")
            is True
        ):
            assert in_channels > 0
            assert out_channels > 0

            logging.info(f"Layer: {layer_id} -> Approximation CONV2D Layer")
            network.append(
                Conv2dApproximation(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=(int(kernel_size[0]), int(kernel_size[1])),
                    stride=(int(strides[0]), int(strides[1])),
                    dilation=(int(dilation[0]), int(dilation[1])),
                    bias=bias,
                    padding=(int(padding[0]), int(padding[1])),
                    device=device,
                    dtype=default_dtype,
                    approximation_enable=approximation_enable,
                    number_of_trunc_bits=number_of_trunc_bits,
                    number_of_frac=number_of_frac_bits,
                    number_of_processes=cfg.number_of_cpu_processes,
                )
            )

            # I need this later...
            network[-1]._w_trainable = w_trainable

            # Calculate the x,y output dimensions
            input_size_temp = calculate_output_size(
                value=input_size[-1],
                kernel_size=kernel_size,
                stride=strides,
                dilation=dilation,
                padding=padding,
            ).tolist()
            input_size.append(input_size_temp)

        # #############################################################
        # Failure because we did not find a matching layer type
        # #############################################################
        else:
            raise Exception(
                f"Unknown layer type: {cfg.network_structure.layer_type[layer_id]}"
            )

    return network
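
Editor's note: the chain above dispatches on the upper-cased prefix of each cfg.network_structure.layer_type entry. A minimal standalone sketch of that pattern (all values illustrative; `network` is assumed to be the torch.nn.Sequential-style container used above, which requires a recent PyTorch where nn.Sequential.append exists — otherwise collect the modules in a list and unpack into Sequential):

# Minimal sketch of the prefix dispatch used above (illustrative only).
import torch

layer_type = "MAX POOLING"  # hypothetical config entry
network = torch.nn.Sequential()
if layer_type.upper().startswith("CONV2D"):
    network.append(torch.nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(5, 5)))
elif layer_type.upper().startswith("RELU"):
    network.append(torch.nn.ReLU())
elif layer_type.upper().startswith("MAX POOLING"):
    network.append(torch.nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)))
else:
    raise Exception(f"Unknown layer type: {layer_type}")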

83
network/build_optimizer.py
Normal file
@ -0,0 +1,83 @@

# %%
import torch
from network.Parameter import Config
from network.SbS import SbS
from network.Conv2dApproximation import Conv2dApproximation
from network.Adam import Adam


def build_optimizer(
    network: torch.nn.Sequential, cfg: Config, logging
) -> list[torch.optim.Optimizer | None]:

    parameter_list_weights: list = []
    parameter_list_sbs: list = []

    # ###############################################
    # Put all parameters that need to be learned
    # into a parameter list.
    # ###############################################

    for id in range(0, len(network)):

        if (isinstance(network[id], SbS) is True) and (
            network[id]._w_trainable is True
        ):
            parameter_list_weights.append(network[id]._weights)
            parameter_list_sbs.append(True)

        if (isinstance(network[id], torch.nn.modules.conv.Conv2d) is True) and (
            network[id]._w_trainable is True
        ):
            for id_parameter in network[id].parameters():
                parameter_list_weights.append(id_parameter)
                parameter_list_sbs.append(False)

        if (isinstance(network[id], Conv2dApproximation) is True) and (
            network[id]._w_trainable is True
        ):
            for id_parameter in network[id].parameters():
                parameter_list_weights.append(id_parameter)
                parameter_list_sbs.append(False)

    logging.info(
        f"Number of parameters found to optimize: {len(parameter_list_weights)}"
    )

    # ###############################################
    # Connect the parameters to an optimizer
    # ###############################################

    if cfg.learning_parameters.optimizer_name == "Adam":
        logging.info("Using optimizer: Adam")

        if len(parameter_list_weights) == 0:
            optimizer_wf: torch.optim.Optimizer | None = None
        elif cfg.learning_parameters.learning_rate_gamma_w > 0:
            optimizer_wf = Adam(
                parameter_list_weights,
                parameter_list_sbs,
                lr=cfg.learning_parameters.learning_rate_gamma_w,
            )
        else:
            optimizer_wf = Adam(parameter_list_weights, parameter_list_sbs)

    elif cfg.learning_parameters.optimizer_name == "SGD":
        logging.info("Using optimizer: SGD")

        if len(parameter_list_weights) == 0:
            optimizer_wf = None
        elif cfg.learning_parameters.learning_rate_gamma_w > 0:
            optimizer_wf = torch.optim.SGD(
                parameter_list_weights,
                lr=cfg.learning_parameters.learning_rate_gamma_w,
            )
        else:
            assert cfg.learning_parameters.learning_rate_gamma_w > 0

    else:
        raise Exception("Optimizer not implemented")

    optimizer = []
    optimizer.append(optimizer_wf)
    return optimizer
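
Usage sketch (editor's addition): build_optimizer returns a single-element list so the caller can iterate uniformly over possibly-absent optimizers, which matches how run_optimizer() in network/loop_train_test.py further below steps them:

# Editor's sketch; `network`, `cfg` and `logging` are assumed to come from the caller.
optimizer = build_optimizer(network=network, cfg=cfg, logging=logging)
for optimizer_item in optimizer:
    if optimizer_item is not None:
        optimizer_item.zero_grad()
# ... forward pass and my_loss.backward() happen here ...
for optimizer_item in optimizer:
    if optimizer_item is not None:
        optimizer_item.step()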

95
network/calculate_output_size.py
Normal file
@ -0,0 +1,95 @@

# %%
import torch


def calculate_output_size(
    value: list[int],
    kernel_size: list[int],
    stride: list[int],
    dilation: list[int],
    padding: list[int],
) -> torch.Tensor:

    assert len(value) == 2
    assert len(kernel_size) == 2
    assert len(stride) == 2
    assert len(dilation) == 2
    assert len(padding) == 2

    coordinates_0, coordinates_1 = get_coordinates(
        value=value,
        kernel_size=kernel_size,
        stride=stride,
        dilation=dilation,
        padding=padding,
    )

    output_size: torch.Tensor = torch.tensor(
        [
            coordinates_0.shape[1],
            coordinates_1.shape[1],
        ],
        dtype=torch.int64,
    )
    return output_size


def get_coordinates(
    value: list[int],
    kernel_size: list[int],
    stride: list[int],
    dilation: list[int],
    padding: list[int],
) -> tuple[torch.Tensor, torch.Tensor]:
    """Converts the convolution parameters into the coordinates
    of the convolution windows"""

    unfold_0: torch.nn.Unfold = torch.nn.Unfold(
        kernel_size=(int(kernel_size[0]), 1),
        dilation=int(dilation[0]),
        padding=int(padding[0]),
        stride=int(stride[0]),
    )

    unfold_1: torch.nn.Unfold = torch.nn.Unfold(
        kernel_size=(1, int(kernel_size[1])),
        dilation=int(dilation[1]),
        padding=int(padding[1]),
        stride=int(stride[1]),
    )

    coordinates_0: torch.Tensor = (
        unfold_0(
            torch.unsqueeze(
                torch.unsqueeze(
                    torch.unsqueeze(
                        torch.arange(0, int(value[0]), dtype=torch.float32),
                        1,
                    ),
                    0,
                ),
                0,
            )
        )
        .squeeze(0)
        .type(torch.int64)
    )

    coordinates_1: torch.Tensor = (
        unfold_1(
            torch.unsqueeze(
                torch.unsqueeze(
                    torch.unsqueeze(
                        torch.arange(0, int(value[1]), dtype=torch.float32),
                        0,
                    ),
                    0,
                ),
                0,
            )
        )
        .squeeze(0)
        .type(torch.int64)
    )

    return coordinates_0, coordinates_1
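
For reference (editor's note): the unfold construction above reproduces the standard convolution arithmetic, so each returned dimension equals floor((value + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1). A quick self-contained check of that closed form:

# Self-contained check of the closed-form conv-output formula that
# calculate_output_size implements via Unfold (editor's sketch, example values).
def closed_form(value: int, kernel: int, stride: int, dilation: int, pad: int) -> int:
    return (value + 2 * pad - dilation * (kernel - 1) - 1) // stride + 1

# e.g. a 28x28 input with a 5x5 kernel, stride 1, no padding -> 24x24
assert closed_form(28, 5, 1, 1, 0) == 24
# and a following 2x2 max pooling (kernel 2, stride 2) -> 12x12
assert closed_form(24, 2, 2, 1, 0) == 12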

126
network/dataset_collection/DATA_CIFAR10/convert.py
Normal file
@ -0,0 +1,126 @@

# MIT License
# Copyright 2022 University of Bremen
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#
# David Rotermund ( davrot@uni-bremen.de )
#
#
# Release history:
# ================
# 1.0.0 -- 01.05.2022: first release
#
#

import numpy as np
import pickle


def give_filenames(id: int) -> tuple[str, str, int]:
    if id == 0:
        start_id: int = 0
        prefix: str = "Test"
        filename: str = "cifar-10-batches-py/test_batch"
    if id == 1:
        start_id = 0
        prefix = "Train"
        filename = "cifar-10-batches-py/data_batch_1"
    if id == 2:
        start_id = 10000
        prefix = "Train"
        filename = "cifar-10-batches-py/data_batch_2"
    if id == 3:
        start_id = 20000
        prefix = "Train"
        filename = "cifar-10-batches-py/data_batch_3"
    if id == 4:
        start_id = 30000
        prefix = "Train"
        filename = "cifar-10-batches-py/data_batch_4"
    if id == 5:
        start_id = 40000
        prefix = "Train"
        filename = "cifar-10-batches-py/data_batch_5"
    return filename, prefix, start_id


def load_data(filename: str) -> tuple[np.ndarray, np.ndarray]:
    fo = open(filename, "rb")
    dict_data = pickle.load(fo, encoding="bytes")
    fo.close()
    # Batch dicts contain: batch_label, labels, data, filenames (in that order)
    _, labels_temp, data_temp, _ = dict_data.items()
    data: np.ndarray = np.array(data_temp[1])
    labels: np.ndarray = np.array(labels_temp[1])
    return data, labels


def split_into_three_color_channels(
    image: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    channel_r = image[0:1024].astype(np.float32)
    channel_r = channel_r.reshape(32, 32)
    channel_g = image[1024:2048].astype(np.float32)
    channel_g = channel_g.reshape(32, 32)
    channel_b = image[2048:3072].astype(np.float32)
    channel_b = channel_b.reshape(32, 32)
    return channel_r, channel_g, channel_b


def process_data_set(test_data_mode: bool) -> None:

    if test_data_mode is True:
        filename_out_pattern: str = "TestPatternStorage.npy"
        filename_out_label: str = "TestLabelStorage.npy"
        number_of_pictures: int = 10000
        start_id: int = 0
        end_id: int = 0
    else:
        filename_out_pattern = "TrainPatternStorage.npy"
        filename_out_label = "TrainLabelStorage.npy"
        number_of_pictures = 50000
        start_id = 1
        end_id = 5

    np_data: np.ndarray = np.zeros((number_of_pictures, 32, 32, 3), dtype=np.float32)
    np_label: np.ndarray = np.zeros((number_of_pictures), dtype=np.uint64)

    for id in range(start_id, end_id + 1):
        filename, _, start_id_pattern = give_filenames(id)
        pictures, labels = load_data(filename)

        for i in range(0, pictures.shape[0]):
            channel_r, channel_g, channel_b = split_into_three_color_channels(
                pictures[i, :]
            )
            np_data[i + start_id_pattern, :, :, 0] = channel_r
            np_data[i + start_id_pattern, :, :, 1] = channel_g
            np_data[i + start_id_pattern, :, :, 2] = channel_b
            np_label[i + start_id_pattern] = labels[i]

    np_data /= np.max(np_data)

    label_storage: np.ndarray = np_label.astype(dtype=np.uint64)
    pattern_storage: np.ndarray = np_data.astype(dtype=np.float32)

    np.save(filename_out_pattern, pattern_storage)
    np.save(filename_out_label, label_storage)


process_data_set(True)
process_data_set(False)
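
The converter writes two pattern/label pairs as .npy files; a minimal check of the training output (editor's sketch):

# Editor's sketch: reload and sanity-check the arrays written by convert.py above.
import numpy as np

train_patterns = np.load("TrainPatternStorage.npy")  # (50000, 32, 32, 3) float32 in [0, 1]
train_labels = np.load("TrainLabelStorage.npy")      # (50000,) uint64
assert train_patterns.shape == (50000, 32, 32, 3)
assert train_labels.shape == (50000,)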

8
network/dataset_collection/DATA_CIFAR10/data_url.txt
Normal file
@ -0,0 +1,8 @@

https://www.cs.toronto.edu/~kriz/cifar.html

Download the CIFAR-10 python version
https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz

Then
tar -xvzf cifar-10-python.tar.gz
python convert.py

4
network/dataset_collection/DATA_CIFAR10/dataset.json
Normal file
@ -0,0 +1,4 @@

{
    "data_path": "./DATA_CIFAR10/",
    "data_mode": "CIFAR10"
}

161
network/dataset_collection/DATA_FASHION_MNIST/convert.py
Normal file
@ -0,0 +1,161 @@

# MIT License
# Copyright 2022 University of Bremen
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#
# David Rotermund ( davrot@uni-bremen.de )
#
#
# Release history:
# ================
# 1.0.0 -- 01.05.2022: first release
#
#

import numpy as np

# [offset] [type]          [value]          [description]
# 0000     32 bit integer  0x00000801(2049) magic number (MSB first)
# 0004     32 bit integer  60000            number of items
# 0008     unsigned byte   ??               label
# 0009     unsigned byte   ??               label
# ........
# xxxx     unsigned byte   ??               label
# The label values are 0 to 9.


class ReadLabel:
    """Class for reading the labels from an MNIST label file"""

    def __init__(self, filename):
        self.filename: str = filename
        self.data = self.read_from_file(filename)

    def read_from_file(self, filename):
        int32_data = np.dtype(np.uint32)
        int32_data = int32_data.newbyteorder(">")
        file = open(filename, "rb")

        magic_flag = np.frombuffer(file.read(4), int32_data)[0]

        if magic_flag != 2049:
            data = np.zeros(0)
            number_of_elements = 0
        else:
            number_of_elements = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_elements < 1:
            data = np.zeros(0)
        else:
            data = np.frombuffer(file.read(number_of_elements), dtype=np.uint8)

        file.close()

        return data


# [offset] [type]          [value]          [description]
# 0000     32 bit integer  0x00000803(2051) magic number
# 0004     32 bit integer  60000            number of images
# 0008     32 bit integer  28               number of rows
# 0012     32 bit integer  28               number of columns
# 0016     unsigned byte   ??               pixel
# 0017     unsigned byte   ??               pixel
# ........
# xxxx     unsigned byte   ??               pixel
# Pixels are organized row-wise.
# Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).


class ReadPicture:
    """Class for reading the images from an MNIST image file"""

    def __init__(self, filename):
        self.filename: str = filename
        self.data = self.read_from_file(filename)

    def read_from_file(self, filename):
        int32_data = np.dtype(np.uint32)
        int32_data = int32_data.newbyteorder(">")
        file = open(filename, "rb")

        magic_flag = np.frombuffer(file.read(4), int32_data)[0]

        if magic_flag != 2051:
            data = np.zeros(0)
            number_of_elements = 0
        else:
            number_of_elements = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_elements < 1:
            data = np.zeros(0)
            number_of_rows = 0
        else:
            number_of_rows = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_rows != 28:
            data = np.zeros(0)
            number_of_columns = 0
        else:
            number_of_columns = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_columns != 28:
            data = np.zeros(0)
        else:
            data = np.frombuffer(
                file.read(number_of_elements * number_of_rows * number_of_columns),
                dtype=np.uint8,
            )
            data = data.reshape(number_of_elements, number_of_columns, number_of_rows)

        file.close()

        return data


def preprocess_data_set(test_mode):

    if test_mode is True:
        filename_out_pattern: str = "TestPatternStorage.npy"
        filename_out_label: str = "TestLabelStorage.npy"
        filename_in_image: str = "t10k-images-idx3-ubyte"
        filename_in_label = "t10k-labels-idx1-ubyte"
    else:
        filename_out_pattern = "TrainPatternStorage.npy"
        filename_out_label = "TrainLabelStorage.npy"
        filename_in_image = "train-images-idx3-ubyte"
        filename_in_label = "train-labels-idx1-ubyte"

    pictures = ReadPicture(filename_in_image)
    labels = ReadLabel(filename_in_label)

    # Scale the pixel values down to 0 ... 1.0
    max_value = np.max(pictures.data.astype(np.float32))
    d = np.float32(pictures.data.astype(np.float32) / max_value)

    label_storage = np.uint64(labels.data)
    pattern_storage = d.astype(np.float32)

    np.save(filename_out_pattern, pattern_storage)
    np.save(filename_out_label, label_storage)


preprocess_data_set(True)
preprocess_data_set(False)

8
network/dataset_collection/DATA_FASHION_MNIST/data_url.txt
Normal file
@ -0,0 +1,8 @@

https://github.com/zalandoresearch/fashion-mnist

We need:
t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz

Then
gzip -d *.gz
python convert.py

4
network/dataset_collection/DATA_FASHION_MNIST/dataset.json
Normal file
@ -0,0 +1,4 @@

{
    "data_path": "./DATA_FASHION_MNIST/",
    "data_mode": "MNIST_FASHION"
}

161
network/dataset_collection/DATA_MNIST/convert.py
Normal file
@ -0,0 +1,161 @@

# MIT License
# Copyright 2022 University of Bremen
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#
# David Rotermund ( davrot@uni-bremen.de )
#
#
# Release history:
# ================
# 1.0.0 -- 01.05.2022: first release
#
#

import numpy as np

# [offset] [type]          [value]          [description]
# 0000     32 bit integer  0x00000801(2049) magic number (MSB first)
# 0004     32 bit integer  60000            number of items
# 0008     unsigned byte   ??               label
# 0009     unsigned byte   ??               label
# ........
# xxxx     unsigned byte   ??               label
# The label values are 0 to 9.


class ReadLabel:
    """Class for reading the labels from an MNIST label file"""

    def __init__(self, filename):
        self.filename: str = filename
        self.data = self.read_from_file(filename)

    def read_from_file(self, filename):
        int32_data = np.dtype(np.uint32)
        int32_data = int32_data.newbyteorder(">")
        file = open(filename, "rb")

        magic_flag = np.frombuffer(file.read(4), int32_data)[0]

        if magic_flag != 2049:
            data = np.zeros(0)
            number_of_elements = 0
        else:
            number_of_elements = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_elements < 1:
            data = np.zeros(0)
        else:
            data = np.frombuffer(file.read(number_of_elements), dtype=np.uint8)

        file.close()

        return data


# [offset] [type]          [value]          [description]
# 0000     32 bit integer  0x00000803(2051) magic number
# 0004     32 bit integer  60000            number of images
# 0008     32 bit integer  28               number of rows
# 0012     32 bit integer  28               number of columns
# 0016     unsigned byte   ??               pixel
# 0017     unsigned byte   ??               pixel
# ........
# xxxx     unsigned byte   ??               pixel
# Pixels are organized row-wise.
# Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).


class ReadPicture:
    """Class for reading the images from an MNIST image file"""

    def __init__(self, filename):
        self.filename: str = filename
        self.data = self.read_from_file(filename)

    def read_from_file(self, filename):
        int32_data = np.dtype(np.uint32)
        int32_data = int32_data.newbyteorder(">")
        file = open(filename, "rb")

        magic_flag = np.frombuffer(file.read(4), int32_data)[0]

        if magic_flag != 2051:
            data = np.zeros(0)
            number_of_elements = 0
        else:
            number_of_elements = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_elements < 1:
            data = np.zeros(0)
            number_of_rows = 0
        else:
            number_of_rows = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_rows != 28:
            data = np.zeros(0)
            number_of_columns = 0
        else:
            number_of_columns = np.frombuffer(file.read(4), int32_data)[0]

        if number_of_columns != 28:
            data = np.zeros(0)
        else:
            data = np.frombuffer(
                file.read(number_of_elements * number_of_rows * number_of_columns),
                dtype=np.uint8,
            )
            data = data.reshape(number_of_elements, number_of_columns, number_of_rows)

        file.close()

        return data


def preprocess_data_set(test_mode):

    if test_mode is True:
        filename_out_pattern: str = "TestPatternStorage.npy"
        filename_out_label: str = "TestLabelStorage.npy"
        filename_in_image: str = "t10k-images-idx3-ubyte"
        filename_in_label = "t10k-labels-idx1-ubyte"
    else:
        filename_out_pattern = "TrainPatternStorage.npy"
        filename_out_label = "TrainLabelStorage.npy"
        filename_in_image = "train-images-idx3-ubyte"
        filename_in_label = "train-labels-idx1-ubyte"

    pictures = ReadPicture(filename_in_image)
    labels = ReadLabel(filename_in_label)

    # Scale the pixel values down to 0 ... 1.0
    max_value = np.max(pictures.data.astype(np.float32))
    d = np.float32(pictures.data.astype(np.float32) / max_value)

    label_storage = np.uint64(labels.data)
    pattern_storage = d.astype(np.float32)

    np.save(filename_out_pattern, pattern_storage)
    np.save(filename_out_label, label_storage)


preprocess_data_set(True)
preprocess_data_set(False)

8
network/dataset_collection/DATA_MNIST/data_url.txt
Normal file
@ -0,0 +1,8 @@

http://yann.lecun.com/exdb/mnist/

We need:
t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz

Then
gzip -d *.gz
python convert.py

4
network/dataset_collection/DATA_MNIST/dataset.json
Normal file
@ -0,0 +1,4 @@

{
    "data_path": "./DATA_MNIST/",
    "data_mode": "MNIST"
}

144
network/load_previous_weights.py
Normal file
@ -0,0 +1,144 @@

# %%
import torch
import glob
import numpy as np

from network.SbS import SbS
from network.SplitOnOffLayer import SplitOnOffLayer
from network.Conv2dApproximation import Conv2dApproximation


def load_previous_weights(
    network: torch.nn.Sequential,
    overload_path: str,
    logging,
    device: torch.device,
    default_dtype: torch.dtype,
) -> None:

    for id in range(0, len(network)):

        # #################################################
        # SbS
        # #################################################

        if isinstance(network[id], SbS) is True:
            # Are there weights that overwrite the initial weights?
            file_to_load = glob.glob(overload_path + "/Weight_L" + str(id) + "_*.npy")

            if len(file_to_load) > 1:
                raise Exception(
                    f"Too many previous weights files {overload_path}/Weight_L{id}*.npy"
                )

            if len(file_to_load) == 1:
                network[id].weights = torch.tensor(
                    np.load(file_to_load[0]),
                    dtype=default_dtype,
                    device=device,
                )
                logging.info(f"Weights file used for layer {id} : {file_to_load[0]}")

        if isinstance(network[id], torch.nn.modules.conv.Conv2d) is True:

            # #################################################
            # Conv2d weights
            # #################################################

            # Are there weights that overwrite the initial weights?
            file_to_load = glob.glob(overload_path + "/Weight_L" + str(id) + "_*.npy")

            if len(file_to_load) > 1:
                raise Exception(
                    f"Too many previous weights files {overload_path}/Weight_L{id}*.npy"
                )

            if len(file_to_load) == 1:
                network[id]._parameters["weight"].data = torch.tensor(
                    np.load(file_to_load[0]),
                    dtype=default_dtype,
                    device=device,
                )
                logging.info(f"Weights file used for layer {id} : {file_to_load[0]}")

            # #################################################
            # Conv2d bias
            # #################################################

            # Are there biases that overwrite the initial biases?
            file_to_load = glob.glob(overload_path + "/Bias_L" + str(id) + "_*.npy")

            if len(file_to_load) > 1:
                raise Exception(
                    f"Too many previous bias files {overload_path}/Bias_L{id}*.npy"
                )

            if len(file_to_load) == 1:
                network[id]._parameters["bias"].data = torch.tensor(
                    np.load(file_to_load[0]),
                    dtype=default_dtype,
                    device=device,
                )
                logging.info(f"Bias file used for layer {id} : {file_to_load[0]}")

        if isinstance(network[id], Conv2dApproximation) is True:

            # #################################################
            # Approximate Conv2d weights
            # #################################################

            # Are there weights that overwrite the initial weights?
            file_to_load = glob.glob(overload_path + "/Weight_L" + str(id) + "_*.npy")

            if len(file_to_load) > 1:
                raise Exception(
                    f"Too many previous weights files {overload_path}/Weight_L{id}*.npy"
                )

            if len(file_to_load) == 1:
                network[id].weights.data = torch.tensor(
                    np.load(file_to_load[0]),
                    dtype=default_dtype,
                    device=device,
                )
                logging.info(f"Weights file used for layer {id} : {file_to_load[0]}")

            # #################################################
            # Approximate Conv2d bias
            # #################################################

            # Are there biases that overwrite the initial biases?
            file_to_load = glob.glob(overload_path + "/Bias_L" + str(id) + "_*.npy")

            if len(file_to_load) > 1:
                raise Exception(
                    f"Too many previous bias files {overload_path}/Bias_L{id}*.npy"
                )

            if len(file_to_load) == 1:
                network[id].bias.data = torch.tensor(
                    np.load(file_to_load[0]),
                    dtype=default_dtype,
                    device=device,
                )
                logging.info(f"Bias file used for layer {id} : {file_to_load[0]}")

        # #################################################
        # SplitOnOffLayer
        # #################################################
        if isinstance(network[id], SplitOnOffLayer) is True:
            # Are there mean values that overwrite the initial ones?
            file_to_load = glob.glob(overload_path + "/Mean_L" + str(id) + "_*.npy")

            if len(file_to_load) > 1:
                raise Exception(
                    f"Too many previous mean files {overload_path}/Mean_L{id}*.npy"
                )

            if len(file_to_load) == 1:
                network[id].mean = torch.tensor(
                    np.load(file_to_load[0]),
                    dtype=default_dtype,
                    device=device,
                )
                logging.info(f"Mean file used for layer {id} : {file_to_load[0]}")

512
network/loop_train_test.py
Normal file
@ -0,0 +1,512 @@

import torch
import time
from network.Parameter import Config
from torch.utils.tensorboard import SummaryWriter

from network.SbS import SbS
from network.save_weight_and_bias import save_weight_and_bias


def add_weight_and_bias_to_histogram(
    network: torch.nn.modules.container.Sequential,
    tb: SummaryWriter,
    iteration_number: int,
) -> None:

    for id in range(0, len(network)):

        # ################################################
        # Log the SbS weights
        # ################################################
        if isinstance(network[id], SbS) is True:
            if network[id]._w_trainable is True:

                try:
                    tb.add_histogram(
                        f"Weights Layer {id}",
                        network[id].weights,
                        iteration_number,
                    )
                except ValueError:
                    pass

        # ################################################
        # Log the Conv2d weights and biases
        # ################################################
        if isinstance(network[id], torch.nn.modules.conv.Conv2d) is True:
            if network[id]._w_trainable is True:

                try:
                    tb.add_histogram(
                        f"Weights Layer {id}",
                        network[id]._parameters["weight"].data,
                        iteration_number,
                    )
                except ValueError:
                    pass
                try:
                    tb.add_histogram(
                        f"Bias Layer {id}",
                        network[id]._parameters["bias"].data,
                        iteration_number,
                    )
                except ValueError:
                    pass
    tb.flush()


# loss_mode == 0: "normal" SbS loss function mixture
# loss_mode == 1: cross_entropy
def loss_function(
    h: torch.Tensor,
    labels: torch.Tensor,
    device: torch.device,
    default_dtype: torch.dtype,
    loss_mode: int = 0,
    number_of_output_neurons: int = 10,
    loss_coeffs_mse: float = 0.0,
    loss_coeffs_kldiv: float = 0.0,
) -> torch.Tensor | None:

    assert loss_mode >= 0
    assert loss_mode <= 1

    h = h.squeeze(-1).squeeze(-1)
    assert h.ndim == 2

    if loss_mode == 0:

        # Convert the labels into one-hot vectors
        target_one_hot: torch.Tensor = torch.zeros(
            (
                labels.shape[0],
                number_of_output_neurons,
            ),
            device=device,
            dtype=default_dtype,
        )

        # scatter_ works in place; h is already squeezed to 2D above
        target_one_hot.scatter_(
            1,
            labels.to(device).unsqueeze(1),
            torch.ones(
                (labels.shape[0], 1),
                device=device,
                dtype=default_dtype,
            ),
        )

        h_y1 = torch.log(h + 1e-20)

        my_loss: torch.Tensor = (
            torch.nn.functional.mse_loss(
                h,
                target_one_hot,
                reduction="sum",
            )
            * loss_coeffs_mse
            + torch.nn.functional.kl_div(h_y1, target_one_hot + 1e-20, reduction="sum")
            * loss_coeffs_kldiv
        ) / (loss_coeffs_kldiv + loss_coeffs_mse)

        return my_loss
    elif loss_mode == 1:
        my_loss = torch.nn.functional.cross_entropy(h, labels.to(device))
        return my_loss
    else:
        return None
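
# Editor's note on loss_mode == 0: the return value is the weighted mixture
#     (c_mse * MSE(h, target) + c_kldiv * KL(target || h)) / (c_mse + c_kldiv)
# so loss_coeffs_mse / loss_coeffs_kldiv only set the relative weighting of the
# two terms. torch.nn.functional.kl_div(log(h), target) computes
# sum(target * (log(target) - log(h))), i.e. KL(target || h).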


def forward_pass_train(
    input: torch.Tensor,
    labels: torch.Tensor,
    the_dataset_train,
    cfg: Config,
    network: torch.nn.modules.container.Sequential,
    device: torch.device,
    default_dtype: torch.dtype,
) -> list[torch.Tensor]:

    h_collection = []
    h_collection.append(
        the_dataset_train.pattern_filter_train(input, cfg)
        .type(dtype=default_dtype)
        .to(device=device)
    )
    for id in range(0, len(network)):
        if isinstance(network[id], SbS) is True:
            h_collection.append(network[id](h_collection[-1], labels))
        else:
            h_collection.append(network[id](h_collection[-1]))

    return h_collection


def forward_pass_test(
    input: torch.Tensor,
    the_dataset_test,
    cfg: Config,
    network: torch.nn.modules.container.Sequential,
    device: torch.device,
    default_dtype: torch.dtype,
) -> list[torch.Tensor]:

    h_collection = []
    h_collection.append(
        the_dataset_test.pattern_filter_test(input, cfg)
        .type(dtype=default_dtype)
        .to(device=device)
    )
    for id in range(0, len(network)):
        h_collection.append(network[id](h_collection[-1]))

    return h_collection


def run_optimizer(
    network: torch.nn.modules.container.Sequential,
    optimizer: list,
    cfg: Config,
) -> None:
    for id in range(0, len(network)):
        if isinstance(network[id], SbS) is True:
            network[id].update_pre_care()

    for optimizer_item in optimizer:
        if optimizer_item is not None:
            optimizer_item.step()

    for id in range(0, len(network)):
        if isinstance(network[id], SbS) is True:
            network[id].update_after_care(
                cfg.learning_parameters.learning_rate_threshold_w
                / float(
                    network[id]._number_of_input_neurons
                    # * network[id]._kernel_size[0]
                    # * network[id]._kernel_size[1]
                ),
            )


# ####################################
# Update the learning rate
# ####################################
def run_lr_scheduler(
    cfg: Config,
    lr_scheduler,
    optimizer,
    performance_for_batch: float,
    my_loss_for_batch: float,
    tb,
    logging,
) -> None:
    # Inter-epoch learning rate adaptation
    for lr_scheduler_item in lr_scheduler:
        if (
            (lr_scheduler_item is not None)
            and (performance_for_batch >= 0.0)
            and (my_loss_for_batch >= 0.0)
        ):
            if cfg.learning_parameters.lr_scheduler_use_performance is True:
                lr_scheduler_item.step(100.0 - performance_for_batch)
            else:
                lr_scheduler_item.step(my_loss_for_batch)

    tb.add_scalar(
        "Train Error",
        100.0 - performance_for_batch,
        cfg.epoch_id,
    )
    tb.add_scalar("Train Loss", my_loss_for_batch, cfg.epoch_id)
    tb.add_scalar(
        "Learning Rate Scale WF",
        optimizer[0].param_groups[-1]["lr"],
        cfg.epoch_id,
    )
    tb.flush()


# def deal_with_gradient_scale(epoch_id: int, mini_batch_number: int, network):
#     if (epoch_id == 0) and (mini_batch_number == 0):
#         for id in range(0, len(network)):
#             if isinstance(network[id], SbS) is True:
#                 network[id].after_batch(True)
#     else:
#         for id in range(0, len(network)):
#             if isinstance(network[id], SbS) is True:
#                 network[id].after_batch()


def loop_train(
    cfg: Config,
    network: torch.nn.modules.container.Sequential,
    my_loader_train: torch.utils.data.dataloader.DataLoader,
    the_dataset_train,
    optimizer: list,
    device: torch.device,
    default_dtype: torch.dtype,
    logging,
    adapt_learning_rate: bool,
    tb: SummaryWriter,
    lr_scheduler,
    last_test_performance: float,
) -> tuple[float, float, float, float]:

    correct_in_minibatch: int = 0
    loss_in_minibatch: float = 0.0
    number_of_pattern_in_minibatch: int = 0

    mini_batch_number: int = -1

    full_loss: float = 0.0
    full_correct: float = 0.0
    full_count: float = 0.0

    epoch_id: int = cfg.epoch_id

    my_loss_for_batch: float = -1.0
    performance_for_batch: float = -1.0

    time_forward: float = 0.0
    time_backward: float = 0.0

    with torch.enable_grad():

        for h_x, h_x_labels in my_loader_train:

            time_mini_batch_start: float = time.perf_counter()

            # ############################################################
            # Reset the gradient after an update (or the first loop pass)
            # ############################################################
            if number_of_pattern_in_minibatch == 0:

                # Reset the gradient of the torch optimizers
                for optimizer_item in optimizer:
                    if optimizer_item is not None:
                        optimizer_item.zero_grad()

                loss_in_minibatch = 0.0
                mini_batch_number += 1
                correct_in_minibatch = 0
                time_forward = 0.0
                time_backward = 0.0

                # ####################################
                # Update the learning rate
                # ####################################
                if adapt_learning_rate is True:
                    run_lr_scheduler(
                        cfg=cfg,
                        lr_scheduler=lr_scheduler,
                        optimizer=optimizer,
                        performance_for_batch=performance_for_batch,
                        my_loss_for_batch=my_loss_for_batch,
                        tb=tb,
                        logging=logging,
                    )

                logging.info(
                    (
                        f"\t\t\tLearning rate: "
                        f"weights:{optimizer[0].param_groups[-1]['lr']:^15.3e} "
                    )
                )

                if last_test_performance < 0:
                    logging.info("")
                else:
                    logging.info(
                        (
                            f"\t\t\tLast test performance: "
                            f"{last_test_performance/100.0:^6.2%}"
                        )
                    )
                logging.info("----------------")

            number_of_pattern_in_minibatch += h_x_labels.shape[0]
            full_count += h_x_labels.shape[0]

            # #####################################################
            # The network does the forward pass (training)
            # #####################################################
            h_collection = forward_pass_train(
                input=h_x,
                labels=h_x_labels,
                the_dataset_train=the_dataset_train,
                cfg=cfg,
                network=network,
                device=device,
                default_dtype=default_dtype,
            )

            # #####################################################
            # Calculate the loss function
            # #####################################################
            my_loss: torch.Tensor | None = loss_function(
                h=h_collection[-1],
                labels=h_x_labels,
                device=device,
                default_dtype=default_dtype,
                loss_mode=cfg.learning_parameters.loss_mode,
                number_of_output_neurons=int(
                    cfg.network_structure.number_of_output_neurons
                ),
                loss_coeffs_mse=float(cfg.learning_parameters.loss_coeffs_mse),
                loss_coeffs_kldiv=float(cfg.learning_parameters.loss_coeffs_kldiv),
            )
            assert my_loss is not None

            time_after_forward_and_loss: float = time.perf_counter()

            # #####################################################
            # Backward pass
            # #####################################################
            my_loss.backward()
            loss_in_minibatch += my_loss.item()
            full_loss += my_loss.item()

            time_after_backward: float = time.perf_counter()

            # #####################################################
            # Performance measures
            # #####################################################

            correct_in_minibatch += (
                (h_collection[-1].argmax(dim=1).squeeze().cpu() == h_x_labels)
                .sum()
                .item()
            )
            full_correct += (
                (h_collection[-1].argmax(dim=1).squeeze().cpu() == h_x_labels)
                .sum()
                .item()
            )

            # We measure the scale of the propagated error
            # during the first minibatch;
            # then we remember this size and scale
            # the future error with it.
            # Kind of deals with the vanishing /
            # exploding gradients.
            # deal_with_gradient_scale(
            #     epoch_id=epoch_id,
            #     mini_batch_number=mini_batch_number,
            #     network=network,
            # )

            # Measure the time for one mini-batch
            time_forward += time_after_forward_and_loss - time_mini_batch_start
            time_backward += time_after_backward - time_after_forward_and_loss

            if number_of_pattern_in_minibatch >= cfg.get_update_after_x_pattern():

                logging.info(
                    (
                        f"{epoch_id:^6}=>{mini_batch_number:^6} "
                        f"\t\tTraining {number_of_pattern_in_minibatch:^6} pattern "
                        f"with {correct_in_minibatch/number_of_pattern_in_minibatch:^6.2%} "
                        f"\tForward time: \t{time_forward:^6.2f}sec"
                    )
                )

                logging.info(
                    (
                        f"\t\t\tLoss: {loss_in_minibatch/number_of_pattern_in_minibatch:^15.3e} "
                        f"\t\t\tBackward time: \t{time_backward:^6.2f}sec "
                    )
                )

                my_loss_for_batch = loss_in_minibatch / number_of_pattern_in_minibatch

                performance_for_batch = (
                    100.0 * correct_in_minibatch / number_of_pattern_in_minibatch
                )

                # ################################################
                # Update the weights and biases
                # ################################################
                run_optimizer(network=network, optimizer=optimizer, cfg=cfg)

                # ################################################
                # Save the weights and biases
                # ################################################
                save_weight_and_bias(
                    cfg=cfg, network=network, iteration_number=epoch_id
                )

                # ################################################
                # Log the weights and biases
                # ################################################
                add_weight_and_bias_to_histogram(
                    network=network,
                    tb=tb,
                    iteration_number=epoch_id,
                )

                # ################################################
                # Mark the mini-batch as done
                # ################################################
                number_of_pattern_in_minibatch = 0

    return (
        my_loss_for_batch,
        performance_for_batch,
        (full_loss / full_count),
        (100.0 * full_correct / full_count),
    )


def loop_test(
    epoch_id: int,
    cfg: Config,
    network: torch.nn.modules.container.Sequential,
    my_loader_test: torch.utils.data.dataloader.DataLoader,
    the_dataset_test,
    device: torch.device,
    default_dtype: torch.dtype,
    logging,
    tb: SummaryWriter,
) -> float:

    test_correct = 0
    test_count = 0
    test_complete: int = the_dataset_test.__len__()

    logging.info("")
    logging.info("Testing:")

    for h_x, h_x_labels in my_loader_test:
        time_0 = time.perf_counter()

        h_collection = forward_pass_test(
            input=h_x,
            the_dataset_test=the_dataset_test,
            cfg=cfg,
            network=network,
            device=device,
            default_dtype=default_dtype,
        )
        h_h: torch.Tensor = h_collection[-1].detach().clone().cpu()

        test_correct += (h_h.argmax(dim=1).squeeze() == h_x_labels).sum().numpy()
        test_count += h_h.shape[0]
        performance = 100.0 * test_correct / test_count
        time_1 = time.perf_counter()
        time_measure_a = time_1 - time_0

        logging.info(
            (
                f"\t\t{test_count} of {test_complete}"
                f" with {performance/100:^6.2%} \t Time used: {time_measure_a:^6.2f}sec"
            )
        )

    logging.info("")

    tb.add_scalar("Test Error", 100.0 - performance, epoch_id)
    tb.flush()

    return performance
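
A plausible outer epoch loop (editor's sketch; the main script is not part of this excerpt, so every object here — cfg, network, the loaders, optimizer, lr_scheduler, tb, logging — is assumed to exist, and cfg.epoch_id_max mirrors the "epoch_id_max" setting in the JSON configs below):

# Editor's sketch of the outer training/testing loop around loop_train/loop_test.
last_test_performance = -1.0
for epoch_id in range(0, cfg.epoch_id_max):
    cfg.epoch_id = epoch_id
    loss, perf, full_loss, full_perf = loop_train(
        cfg=cfg, network=network, my_loader_train=my_loader_train,
        the_dataset_train=the_dataset_train, optimizer=optimizer,
        device=device, default_dtype=default_dtype, logging=logging,
        adapt_learning_rate=True, tb=tb, lr_scheduler=lr_scheduler,
        last_test_performance=last_test_performance,
    )
    last_test_performance = loop_test(
        epoch_id=epoch_id, cfg=cfg, network=network,
        my_loader_test=my_loader_test, the_dataset_test=the_dataset_test,
        device=device, default_dtype=default_dtype, logging=logging, tb=tb,
    )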

71
network/save_weight_and_bias.py
Normal file
@ -0,0 +1,71 @@

import torch

from network.Parameter import Config

import numpy as np
from network.SbS import SbS
from network.SplitOnOffLayer import SplitOnOffLayer
from network.Conv2dApproximation import Conv2dApproximation


def save_weight_and_bias(
    cfg: Config, network: torch.nn.modules.container.Sequential, iteration_number: int
) -> None:

    for id in range(0, len(network)):

        # ################################################
        # Save the SbS weights
        # ################################################

        if isinstance(network[id], SbS) is True:
            if network[id]._w_trainable is True:

                np.save(
                    f"{cfg.weight_path}/Weight_L{id}_S{iteration_number}.npy",
                    network[id].weights.detach().cpu().numpy(),
                )

        # ################################################
        # Save the Conv2d weights and biases
        # ################################################

        if isinstance(network[id], torch.nn.modules.conv.Conv2d) is True:
            if network[id]._w_trainable is True:
                # Save the new values
                np.save(
                    f"{cfg.weight_path}/Weight_L{id}_S{iteration_number}.npy",
                    network[id]._parameters["weight"].data.detach().cpu().numpy(),
                )

                # Save the new values
                np.save(
                    f"{cfg.weight_path}/Bias_L{id}_S{iteration_number}.npy",
                    network[id]._parameters["bias"].data.detach().cpu().numpy(),
                )

        # ################################################
        # Save the approximate Conv2d weights and biases
        # ################################################

        if isinstance(network[id], Conv2dApproximation) is True:
            if network[id]._w_trainable is True:
                # Save the new values
                np.save(
                    f"{cfg.weight_path}/Weight_L{id}_S{iteration_number}.npy",
                    network[id].weights.data.detach().cpu().numpy(),
                )

                # Save the new values
                if network[id].bias is not None:
                    np.save(
                        f"{cfg.weight_path}/Bias_L{id}_S{iteration_number}.npy",
                        network[id].bias.data.detach().cpu().numpy(),
                    )

        if isinstance(network[id], SplitOnOffLayer) is True:

            np.save(
                f"{cfg.weight_path}/Mean_L{id}_S{iteration_number}.npy",
                network[id].mean.detach().cpu().numpy(),
            )
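
# Editor's note: the f"{cfg.weight_path}/Weight_L{id}_S{iteration_number}.npy"
# naming produced here is exactly what the Weight_L{id}_*.npy / Bias_L{id}_*.npy /
# Mean_L{id}_*.npy globs in network/load_previous_weights.py match, so a saved
# run can be reloaded as-is by passing cfg.weight_path as overload_path.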

4
network/test_settings_fashion/dataset.json
Normal file
@ -0,0 +1,4 @@

{
    "data_path": "./DATA_FASHION_MNIST/",
    "data_mode": "MNIST_FASHION"
}

29
network/test_settings_fashion/def_approx_cnn.json
Normal file
@ -0,0 +1,29 @@

{
    "epoch_id_max": 200,
    "batch_size": 24,
    "stage_id": 0,
    "simulation_id": 0,
    "learning_parameters": {
        "loss_mode": 1, // X-Entropy
        "number_of_batches_for_one_update": 20,
        "learning_rate_gamma_w": 0.001,
        "lr_scheduler_patience_w": -1,
        "adapt_learning_rate_after_minibatch": false,
        "w_trainable": [true]
    },
    "augmentation": {},
    "image_statistics": {},
    "approximation_setting": {
        "number_of_frac_bits": [30], // Quantization of the mantissa
        "approximation_enable": [true], // In addition to the quantization more approx
        "number_of_trunc_bits": [1]
    }
}
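
Note that these settings files contain // comments, so they are JSONC rather than strict JSON and a plain json.load() would raise. A minimal comment-stripping loader (editor's sketch; the project's actual loader, presumably in network/Parameter.py, is not shown in this excerpt):

# Editor's sketch: read a settings file that contains // line comments.
import json
import re

def load_jsonc(path: str) -> dict:
    with open(path, "r") as f:
        text = f.read()
    # Strip // comments (naive: assumes no "//" occurs inside string values).
    text = re.sub(r"//[^\n]*", "", text)
    return json.loads(text)

cfg_dict = load_jsonc("network/test_settings_fashion/def_approx_cnn.json")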

19
network/test_settings_fashion/def_cnn.json
Normal file
@ -0,0 +1,19 @@

{
    "epoch_id_max": 200,
    "batch_size": 24,
    "stage_id": 0,
    "simulation_id": 0,
    "learning_parameters": {
        "loss_mode": 1, // X-Entropy
        "number_of_batches_for_one_update": 20,
        "learning_rate_gamma_w": 0.001,
        "lr_scheduler_patience_w": -1,
        "adapt_learning_rate_after_minibatch": false,
        "w_trainable": [true]
    },
    "augmentation": {},
    "image_statistics": {},
    "approximation_setting": {}
}

97
network/test_settings_fashion/network_approx_cnn.json
Normal file
@ -0,0 +1,97 @@

{
    "network_structure": {
        "number_of_output_neurons": 10,
        "layer_type": [
            "APPROX CONV2D",
            "RELU",
            "MAX POOLING",
            "APPROX CONV2D",
            "RELU",
            "MAX POOLING",
            "APPROX CONV2D",
            "RELU",
            "APPROX CONV2D"
        ],
        "strides": [
            [1, 1], // "CONV2D"
            [],     // "RELU"
            [2, 2], // "MAX POOLING"
            [1, 1], // "CONV2D"
            [],     // "RELU"
            [2, 2], // "MAX POOLING"
            [1, 1], // "CONV2D"
            [],     // "RELU"
            [1, 1]  // "CONV2D"
        ],
        "forward_neuron_numbers": [
            [1, 32],  // "CONV2D"
            [],       // "RELU"
            [],       // "MAX POOLING"
            [32, 64], // "CONV2D"
            [],       // "RELU"
            [],       // "MAX POOLING"
            [64, 96], // "CONV2D"
            [],       // "RELU"
            [96, 10]  // "CONV2D"
        ],
        "forward_kernel_size": [
            [5, 5], // "CONV2D"
            [],     // "RELU"
            [2, 2], // "MAX POOLING"
            [5, 5], // "CONV2D"
            [],     // "RELU"
            [2, 2], // "MAX POOLING"
            [3, 3], // "CONV2D"
            [],     // "RELU"
            [1, 1]  // "CONV2D"
        ]
    }
}
|
97
network/test_settings_fashion/network_cnn.json
Normal file
@ -0,0 +1,97 @@
{
    "network_structure": {
        "number_of_output_neurons": 10,
        "layer_type": [
            "CONV2D",
            "RELU",
            "MAX POOLING",
            "CONV2D",
            "RELU",
            "MAX POOLING",
            "CONV2D",
            "RELU",
            "CONV2D"
        ],
        "strides": [
            [1, 1], // "CONV2D"
            [],     // "RELU"
            [2, 2], // "MAX POOLING"
            [1, 1], // "CONV2D"
            [],     // "RELU"
            [2, 2], // "MAX POOLING"
            [1, 1], // "CONV2D"
            [],     // "RELU"
            [1, 1]  // "CONV2D"
        ],
        "forward_neuron_numbers": [
            [1, 32],  // "CONV2D"
            [],       // "RELU"
            [],       // "MAX POOLING"
            [32, 64], // "CONV2D"
            [],       // "RELU"
            [],       // "MAX POOLING"
            [64, 96], // "CONV2D"
            [],       // "RELU"
            [96, 10]  // "CONV2D"
        ],
        "forward_kernel_size": [
            [5, 5], // "CONV2D"
            [],     // "RELU"
            [2, 2], // "MAX POOLING"
            [5, 5], // "CONV2D"
            [],     // "RELU"
            [2, 2], // "MAX POOLING"
            [3, 3], // "CONV2D"
            [],     // "RELU"
            [1, 1]  // "CONV2D"
        ]
    }
}
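Assuming valid (unpadded) convolutions and pooling, the kernel sizes and strides above take a 28x28 Fashion-MNIST input through 24x24, 12x12, 8x8, 4x4, and finally 2x2 with 10 channels (presumably reduced to the 10 output neurons downstream). A quick check of that arithmetic:

def trace_spatial_size(kernels, strides, size=(28, 28)):
    # out = (in - kernel) // stride + 1 per dimension; the empty lists
    # are the "RELU" placeholders, which leave the size unchanged.
    for kernel, stride in zip(kernels, strides):
        if not kernel:
            continue
        size = tuple((d - k) // s + 1 for d, k, s in zip(size, kernel, stride))
    return size

kernels = [[5, 5], [], [2, 2], [5, 5], [], [2, 2], [3, 3], [], [1, 1]]
strides = [[1, 1], [], [2, 2], [1, 1], [], [2, 2], [1, 1], [], [1, 1]]
print(trace_spatial_size(kernels, strides))  # (2, 2)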
186
network/unused_code/LinearApproximation.py
Normal file
@ -0,0 +1,186 @@
import torch
import math

from network.CPP.PyMultiApp import MultiApp


class LinearApproximation(torch.nn.Module):

    in_features: int | None = None
    out_features: int | None = None
    use_bias: bool = False

    approximation_enable: bool = False
    number_of_trunc_bits: int = -1
    number_of_frac: int = -1

    number_of_processes: int = 1

    weights: torch.nn.parameter.Parameter
    bias: torch.nn.parameter.Parameter | None

    device: torch.device
    dtype: torch.dtype

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        approximation_enable: bool = False,
        number_of_trunc_bits: int = -1,
        number_of_frac: int = -1,
        number_of_processes: int = 1,
        device: torch.device | None = None,
        dtype: torch.dtype | None = None,
    ) -> None:
        super().__init__()

        assert device is not None
        self.device = device

        assert dtype is not None
        self.dtype = dtype

        self.in_features = in_features
        self.out_features = out_features
        self.use_bias = bias

        self.approximation_enable = approximation_enable
        self.number_of_trunc_bits = number_of_trunc_bits
        self.number_of_frac = number_of_frac

        self.number_of_processes = number_of_processes

        if self.use_bias is True:
            self.bias: torch.nn.parameter.Parameter | None = (
                torch.nn.parameter.Parameter(
                    torch.empty(
                        (out_features,),
                        dtype=self.dtype,
                        device=self.device,
                    )
                )
            )
        else:
            self.bias = None

        self.weights: torch.nn.parameter.Parameter = torch.nn.parameter.Parameter(
            torch.empty(
                (out_features, in_features),
                dtype=self.dtype,
                device=self.device,
            )
        )

        self.functional_multi = FunctionalMultiLinear.apply

        self.reset_parameters()

    def reset_parameters(self) -> None:
        # Taken from the original torch Conv2d code
        torch.nn.init.kaiming_uniform_(self.weights, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weights)
            if fan_in != 0:
                bound = 1 / math.sqrt(fan_in)
                torch.nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input: torch.Tensor) -> torch.Tensor:

        assert input.dim() == 2

        parameter_list = torch.tensor(
            [
                int(self.approximation_enable),  # 0
                int(self.number_of_trunc_bits),  # 1
                int(self.number_of_frac),  # 2
                int(self.number_of_processes),  # 3
            ],
            dtype=torch.int64,
        )

        # Reuse the 4D multiplication kernel by treating the linear input
        # as a 1x1 "image": (batch, in_features, 1, 1).
        output = self.functional_multi(
            input.unsqueeze(-1).unsqueeze(-1), self.weights, parameter_list
        )

        output = output.squeeze(-1).squeeze(-1)

        if self.bias is not None:
            output += self.bias.unsqueeze(0)

        return output


class FunctionalMultiLinear(torch.autograd.Function):
    @staticmethod
    def forward(  # type: ignore
        ctx,
        input: torch.Tensor,
        weights: torch.Tensor,
        parameter_list: torch.Tensor,
    ) -> torch.Tensor:

        assert input.ndim == 4
        assert input.dtype is torch.float32
        assert input.is_contiguous() is True

        assert weights.ndim == 2
        assert weights.dtype is torch.float32
        assert weights.is_contiguous() is True

        assert input.shape[1] == weights.shape[1]

        approximation_enable = bool(parameter_list[0])
        number_of_trunc_bits = int(parameter_list[1])
        number_of_frac = int(parameter_list[2])
        number_of_processes = int(parameter_list[3])

        assert input.device == weights.device

        output = torch.zeros(
            (input.shape[0], weights.shape[0], input.shape[2], input.shape[3]),
            dtype=weights.dtype,
            device=weights.device,
            requires_grad=True,
        )
        assert output.is_contiguous() is True

        multiplier: MultiApp = MultiApp()

        multiplier.update_with_init_vector_multi_pattern(
            input.data_ptr(),
            weights.data_ptr(),
            output.data_ptr(),
            int(output.shape[0]),  # pattern
            int(output.shape[1]),  # feature channel
            int(output.shape[2]),  # x
            int(output.shape[3]),  # y
            int(input.shape[1]),  # input channel
            int(number_of_processes),
            bool(approximation_enable),
            int(number_of_trunc_bits),
            int(number_of_frac),
        )

        ctx.save_for_backward(
            input.detach(),
            weights.detach(),
        )

        return output

    @staticmethod
    def backward(ctx, grad_output):

        (input, weights) = ctx.saved_tensors

        grad_input = (
            grad_output.unsqueeze(2) * weights.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
        ).sum(1)
        grad_weights = (
            (grad_output.unsqueeze(2) * input.unsqueeze(1)).sum(0).sum(-1).sum(-1)
        )
        grad_parameter_list = None

        return (grad_input, grad_weights, grad_parameter_list)
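Even though this file lives under unused_code, the call pattern is straightforward. A hypothetical smoke test, assuming the repository root is on PYTHONPATH and the compiled network.CPP.PyMultiApp extension is available:

import torch
from network.unused_code.LinearApproximation import LinearApproximation

layer = LinearApproximation(
    in_features=784,
    out_features=10,
    approximation_enable=True,
    number_of_trunc_bits=1,
    number_of_frac=30,
    device=torch.device("cpu"),
    dtype=torch.float32,
)
x = torch.rand((24, 784), dtype=torch.float32)
y = layer(x)  # shape: (24, 10)
y.sum().backward()  # exercises FunctionalMultiLinear.backward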
39
network_sbs_L0.json
Normal file
@ -0,0 +1,39 @@
{
    "network_structure": {
        "number_of_output_neurons": 10,
        "layer_type": [
            "SbS",
            "SbS"
        ],
        "strides": [
            [1, 1],
            [1, 1]
        ],
        "forward_neuron_numbers": [
            [1, 32],
            [32, 10]
        ],
        "forward_kernel_size": [
            [5, 5],
            [20, 20]
        ]
    }
}
91
network_sbs_mp.json
Normal file
@ -0,0 +1,91 @@
{
    "network_structure": {
        "number_of_output_neurons": 10,
        "layer_type": [
            "SbS",
            "MAX POOLING",
            "SbS",
            "MAX POOLING",
            "SbS",
            "SbS"
        ],
        "strides": [
            [1, 1], // "SbS"
            [2, 2], // POOLING
            [1, 1], // "SbS"
            [2, 2], // POOLING
            [1, 1], // "SbS"
            [1, 1]  // "SbS"
        ],
        "forward_neuron_numbers": [
            [1, 32],  // "SbS"
            [32, 32], // POOLING
            [32, 64], // "SbS"
            [64, 64], // POOLING
            [64, 96], // "SbS"
            [96, 10]  // "SbS"
        ],
        "forward_kernel_size": [
            [5, 5], // "SbS"
            [2, 2], // POOLING
            [5, 5], // "SbS"
            [2, 2], // POOLING
            [3, 3], // "SbS"
            [1, 1]  // "SbS"
        ]
    }
}
19
run_spikes.sh
Normal file
@ -0,0 +1,19 @@
#!/bin/bash
python=~/P3.10GPU/bin/python3

if [ -f "./def.json" ]
then
    echo "json file found"
else
    echo "No json file exists"
    exit 1
fi

$python learn_it.py def.json
status=$?
if [ "$status" != "0" ]
then
    echo "Problem detected (learn_it.py)!!!"
    exit 1
fi
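The script expects to be started from a directory containing learn_it.py and a def.json, and the interpreter path ~/P3.10GPU/bin/python3 is hard-coded to the authors' virtual environment; point it at your own Python 3.10 installation before invoking it.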