Add files via upload
This commit is contained in:
parent 6d6b757ed6
commit 09c8b10f23
56 changed files with 5062 additions and 0 deletions

967 network/CPP_Cuda_new_preview/HDynamicCNNManyIP.cu Normal file
@@ -0,0 +1,967 @@
#include <omp.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <cassert>
#include <cmath>    // pow() is used for the truncation mask below
#include <iostream>
#include <vector>   // std::vector scratch buffers in update_one_ip_approx()

#include "HDynamicCNNManyIP.h"
#include "approximation_multiplication_function.h"
#include "kernel_approximation_multiplication.h"
#include "kernel_phxy_fill_with_h.h"
#include "kernel_phxy_fill_with_spike_selected_w.h"
#include "kernel_phxy_one_over_sum_into_pxy.h"
#include "kernel_phxy_plus_phxy.h"
#include "kernel_phxy_plus_pxy.h"
#include "kernel_phxy_times_phxy_equals_phxy.h"
#include "kernel_phxy_times_pxy.h"
#include "kernel_pxy_plus_v.h"
#include "kernel_pxy_reciprocal.h"
#include "kernel_pxy_set_to_v.h"
#include "kernel_pxy_time_pxy.h"
#include "kernel_pxy_times_spike_selected_sxy.h"
#include "kernel_pxy_times_v.h"

HDynamicCNNManyIP::HDynamicCNNManyIP(){

};

HDynamicCNNManyIP::~HDynamicCNNManyIP(){

};

bool HDynamicCNNManyIP::update_entrypoint(
    int64_t h_pointer_addr, int64_t h_dim_0, int64_t h_dim_1, int64_t h_dim_2,
    int64_t h_dim_3, int64_t epsilon_xy_pointer_addr, int64_t epsilon_xy_dim_0,
    int64_t epsilon_xy_dim_1, int64_t epsilon_xy_dim_2,
    int64_t epsilon_t_pointer_addr, int64_t epsilon_t_dim_0,
    int64_t weights_pointer_addr, int64_t weights_dim_0, int64_t weights_dim_1,
    int64_t input_pointer_addr, int64_t input_dim_0, int64_t input_dim_1,
    int64_t input_dim_2, int64_t input_dim_3, int64_t init_vector_pointer_addr,
    int64_t init_vector_dim_0, int64_t number_of_processes,
    float forgetting_offset, int64_t gpu_tuning_factor
    // ,bool approximation_multiplication_enable, uint64_t
    // number_of_frac_bits, bool approximation_enable,
    // uint64_t number_of_trunc_bits
) {
  bool approximation_multiplication_enable = false;
  uint64_t number_of_frac_bits = 1;
  bool approximation_enable = false;
  uint64_t number_of_trunc_bits = 0;  // was "= false"; the bit count is an integer

  uint32_t ap_mask = static_cast<uint64_t>(pow(2, number_of_trunc_bits)) - 1;

  size_t number_of_pattern = input_dim_0;

  size_t h_dim = init_vector_dim_0;
  float* h_init_ptr = (float*)init_vector_pointer_addr;
  assert((h_init_ptr != nullptr));
  assert((h_dim > 0));

  float* h_pointer = (float*)h_pointer_addr;
  assert((h_pointer != nullptr));
  assert((h_dim_0 > 0));
  assert((h_dim_1 > 0));
  assert((h_dim_2 > 0));
  assert((h_dim_3 > 0));

  size_t h_dim_c0 = h_dim_1 * h_dim_2 * h_dim_3;
  size_t h_dim_c1 = h_dim_2 * h_dim_3;
  size_t h_dim_c2 = h_dim_3;

  float* epsilon_xy_pointer = (float*)epsilon_xy_pointer_addr;
  assert((epsilon_xy_pointer != nullptr));
  assert((epsilon_xy_dim_0 > 0));
  assert((epsilon_xy_dim_1 > 0));

  size_t epsilon_xy_dim_c0 = epsilon_xy_dim_2 * epsilon_xy_dim_1;
  size_t epsilon_xy_dim_c1 = epsilon_xy_dim_2;

  float* epsilon_t_pointer = (float*)epsilon_t_pointer_addr;
  assert((epsilon_t_pointer != nullptr));
  assert((epsilon_t_dim_0 > 0));

  float* weights_pointer = (float*)weights_pointer_addr;
  assert((weights_pointer != nullptr));
  assert((weights_dim_0 > 0));
  assert((weights_dim_1 > 0));

  size_t weights_dim_c0 = weights_dim_1;

  int64_t* input_pointer = (int64_t*)input_pointer_addr;
  assert((input_pointer != nullptr));
  assert((input_dim_0 > 0));
  assert((input_dim_1 > 0));
  assert((input_dim_2 > 0));
  assert((input_dim_3 > 0));

  size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
  size_t input_dim_c1 = input_dim_2 * input_dim_3;
  size_t input_dim_c2 = input_dim_3;

  assert((h_dim == weights_dim_1));
  size_t number_of_spikes = input_dim_1;
  size_t dim_x = input_dim_2;
  size_t dim_y = input_dim_3;

  float forgetting_offset_local = forgetting_offset / static_cast<float>(h_dim);

  // --------------------
  if (number_of_processes > 0) {
    omp_set_num_threads(number_of_processes);

    size_t pattern_id;
#pragma omp parallel for
    for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++) {
      update(h_init_ptr, h_pointer, h_dim_c0, h_dim_c1, h_dim_c2, h_dim,
             epsilon_xy_pointer, epsilon_xy_dim_c0, epsilon_xy_dim_c1,
             epsilon_t_pointer, weights_pointer, weights_dim_c0, input_pointer,
             input_dim_c0, input_dim_c1, input_dim_c2, number_of_spikes, dim_x,
             dim_y, forgetting_offset, forgetting_offset_local, pattern_id,
             approximation_multiplication_enable, number_of_frac_bits,
             approximation_enable, number_of_trunc_bits, ap_mask);
    }
  } else {
    gpu_update(h_init_ptr, h_pointer, h_dim_c0, h_dim_c1, h_dim_c2, h_dim,
               epsilon_xy_pointer, epsilon_xy_dim_c0, epsilon_xy_dim_c1,
               epsilon_t_pointer, weights_pointer, weights_dim_c0,
               input_pointer, input_dim_c0, input_dim_c1, input_dim_c2,
               number_of_spikes, dim_x, dim_y, forgetting_offset,
               forgetting_offset_local, number_of_pattern, gpu_tuning_factor,
               approximation_multiplication_enable, number_of_frac_bits,
               approximation_enable, number_of_trunc_bits, ap_mask);
  }
  return true;
};

bool HDynamicCNNManyIP::update(
    float* h_init_ptr, float* h_pointer, size_t h_dim_c0, size_t h_dim_c1,
    size_t h_dim_c2, size_t h_dim, float* epsilon_xy_pointer,
    size_t epsilon_xy_dim_c0, size_t epsilon_xy_dim_c1,
    float* epsilon_t_pointer, float* weights_pointer, size_t weights_dim_c0,
    int64_t* input_pointer, size_t input_dim_c0, size_t input_dim_c1,
    size_t input_dim_c2, size_t number_of_spikes, size_t dim_x, size_t dim_y,
    float forgetting_offset, float forgetting_offset_local, size_t pattern_id,
    bool approximation_multiplication_enable, uint64_t number_of_frac_bits,
    bool approximation_enable, uint64_t number_of_trunc_bits,
    uint32_t ap_mask) {
  float* h_ptr;
  float* epsilon_xy_ptr;
  int64_t* input_ptr;

  size_t counter_x;
  size_t counter_y;

  for (counter_x = 0; counter_x < dim_x; counter_x++) {
    for (counter_y = 0; counter_y < dim_y; counter_y++) {
      epsilon_xy_ptr =
          epsilon_xy_pointer + counter_x * epsilon_xy_dim_c1 + counter_y;

      h_ptr =
          h_pointer + pattern_id * h_dim_c0 + counter_x * h_dim_c2 + counter_y;

      input_ptr = input_pointer + pattern_id * input_dim_c0 +
                  counter_x * input_dim_c2 + counter_y;

      if (approximation_multiplication_enable == false) {
        update_one_ip(h_init_ptr, h_ptr, h_dim_c1, h_dim, weights_pointer,
                      weights_dim_c0, input_ptr, input_dim_c1, epsilon_xy_ptr,
                      epsilon_xy_dim_c0, epsilon_t_pointer, number_of_spikes,
                      forgetting_offset, forgetting_offset_local);
      } else {
        update_one_ip_approx(
            h_init_ptr, h_ptr, h_dim_c1, h_dim, weights_pointer,
            weights_dim_c0, input_ptr, input_dim_c1, epsilon_xy_ptr,
            epsilon_xy_dim_c0, epsilon_t_pointer, number_of_spikes,
            forgetting_offset, forgetting_offset_local,
            approximation_multiplication_enable, number_of_frac_bits,
            approximation_enable, number_of_trunc_bits, ap_mask);
      }
    }
  }

  return true;
};

void HDynamicCNNManyIP::update_one_ip_approx(
    float* h_init_ptr, float* h_pointer, size_t h_dim_c1, size_t h_dim,
    float* weights_pointer, size_t weights_dim_c0, int64_t* input_pointer,
    size_t input_dim_c1, float* epsilon_xy_pointer, size_t epsilon_xy_dim_c0,
    float* epsilon_t_pointer, size_t number_of_spikes, float forgetting_offset,
    float forgetting_offset_local, bool approximation_multiplication_enable,
    uint64_t number_of_frac_bits, bool approximation_enable,
    uint64_t number_of_trunc_bits, uint32_t ap_mask) {
  float* h_temp = new float[h_dim];
  float* h_subsegment = new float[h_dim];

  memcpy(h_subsegment, h_init_ptr, sizeof(float) * h_dim);

  size_t counter_spike;
  size_t counter;

  float h_temp_sum;
  float temp_value;

  float epsilon_subsegment;
  float epsilon_scale = 1.0;

  int64_t* spike;
  float* w_ptr;

  // ---------------
  // Approx...

  uint64_t pattern_size = h_dim;

  std::vector<float> ap_h_vector;
  ap_h_vector.resize(pattern_size);
  float* ap_h_ptr = ap_h_vector.data();

  std::vector<uint32_t> ap_x_vector;
  ap_x_vector.resize(pattern_size);
  uint32_t* ap_x_ptr = ap_x_vector.data();

  std::vector<uint32_t> ap_y_vector;
  ap_y_vector.resize(pattern_size);
  uint32_t* ap_y_ptr = ap_y_vector.data();

  std::vector<uint32_t> ap_x_exponent_vector;
  ap_x_exponent_vector.resize(pattern_size);
  uint32_t* ap_x_exponent_ptr = ap_x_exponent_vector.data();

  std::vector<uint32_t> ap_y_exponent_vector;
  ap_y_exponent_vector.resize(pattern_size);
  uint32_t* ap_y_exponent_ptr = ap_y_exponent_vector.data();

  std::vector<uint32_t> ap_h_exponent_vector;
  ap_h_exponent_vector.resize(pattern_size);
  uint32_t* ap_h_exponent_ptr = ap_h_exponent_vector.data();

  std::vector<uint64_t> ap_res_vector;
  ap_res_vector.resize(pattern_size);
  uint64_t* ap_res_ptr = ap_res_vector.data();

  std::vector<uint32_t> sign_temp_vector;
  sign_temp_vector.resize(pattern_size);
  uint32_t* sign_temp_ptr = sign_temp_vector.data();

  // --------------

  for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++) {
    if (epsilon_scale > 1E10) {
      temp_value = 1.0 / epsilon_scale;

#pragma omp simd
      for (counter = 0; counter < h_dim; counter++) {
        h_subsegment[counter] *= temp_value;
      }

      epsilon_scale = 1.0;
    }

    spike = input_pointer + counter_spike * input_dim_c1;

    if (*spike >= 0) {
      epsilon_subsegment = epsilon_xy_pointer[*spike * epsilon_xy_dim_c0] *
                           epsilon_t_pointer[counter_spike];

      w_ptr = weights_pointer + *spike * weights_dim_c0;

      memcpy(h_temp, h_subsegment, sizeof(float) * h_dim);

      approximation_multiplication_function(
          ap_h_ptr, w_ptr, pattern_size, number_of_trunc_bits,
          number_of_frac_bits, ap_x_ptr, ap_y_ptr, ap_x_exponent_ptr,
          ap_y_exponent_ptr, ap_h_exponent_ptr, ap_mask, ap_res_ptr,
          sign_temp_ptr, approximation_enable);
      // --------------------------

      h_temp_sum = 0.0;
#pragma omp simd reduction(+ : h_temp_sum)
      for (counter = 0; counter < h_dim; counter++) {
        h_temp_sum += h_temp[counter];
      }

      if (h_temp_sum > 1E-10) {
        temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum;

#pragma omp simd
        for (counter = 0; counter < h_dim; counter++) {
          h_temp[counter] *= temp_value;
        }

#pragma omp simd
        for (counter = 0; counter < h_dim; counter++) {
          h_subsegment[counter] += h_temp[counter];
        }

        if (forgetting_offset_local > 0.0) {
          temp_value =
              epsilon_scale * epsilon_subsegment * forgetting_offset_local;

#pragma omp simd
          for (counter = 0; counter < h_dim; counter++) {
            h_subsegment[counter] += temp_value;
          }

          epsilon_scale *= 1.0 + epsilon_subsegment * (1.0 + forgetting_offset);
        } else {
          epsilon_scale *= 1.0 + epsilon_subsegment * 1.0;
        }
      }
    }
  }

  temp_value = 1.0 / epsilon_scale;
#pragma omp simd
  for (counter = 0; counter < h_dim; counter++) {
    h_pointer[counter * h_dim_c1] = h_subsegment[counter] * temp_value;
  }

  delete[] h_temp;
  delete[] h_subsegment;

  return;
};

void HDynamicCNNManyIP::update_one_ip(
    float* h_init_ptr, float* h_pointer, size_t h_dim_c1, size_t h_dim,
    float* weights_pointer, size_t weights_dim_c0, int64_t* input_pointer,
    size_t input_dim_c1, float* epsilon_xy_pointer, size_t epsilon_xy_dim_c0,
    float* epsilon_t_pointer, size_t number_of_spikes, float forgetting_offset,
    float forgetting_offset_local) {
  float* h_temp = new float[h_dim];
  float* h_subsegment = new float[h_dim];

  memcpy(h_subsegment, h_init_ptr, sizeof(float) * h_dim);

  size_t counter_spike;
  size_t counter;

  float h_temp_sum;
  float temp_value;

  float epsilon_subsegment;
  float epsilon_scale = 1.0;

  int64_t* spike;
  float* w_ptr;

  // --------------

  for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++) {
    if (epsilon_scale > 1E10) {
      temp_value = 1.0 / epsilon_scale;

#pragma omp simd
      for (counter = 0; counter < h_dim; counter++) {
        h_subsegment[counter] *= temp_value;
      }

      epsilon_scale = 1.0;
    }

    spike = input_pointer + counter_spike * input_dim_c1;

    if (*spike >= 0) {
      epsilon_subsegment = epsilon_xy_pointer[*spike * epsilon_xy_dim_c0] *
                           epsilon_t_pointer[counter_spike];

      w_ptr = weights_pointer + *spike * weights_dim_c0;

      memcpy(h_temp, h_subsegment, sizeof(float) * h_dim);

#pragma omp simd
      for (counter = 0; counter < h_dim; counter++) {
        h_temp[counter] *= w_ptr[counter];
      }

      h_temp_sum = 0.0;
#pragma omp simd reduction(+ : h_temp_sum)
      for (counter = 0; counter < h_dim; counter++) {
        h_temp_sum += h_temp[counter];
      }

      if (h_temp_sum > 1E-10) {
        temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum;

#pragma omp simd
        for (counter = 0; counter < h_dim; counter++) {
          h_temp[counter] *= temp_value;
        }

#pragma omp simd
        for (counter = 0; counter < h_dim; counter++) {
          h_subsegment[counter] += h_temp[counter];
        }

        if (forgetting_offset_local > 0.0) {
          temp_value =
              epsilon_scale * epsilon_subsegment * forgetting_offset_local;

#pragma omp simd
          for (counter = 0; counter < h_dim; counter++) {
            h_subsegment[counter] += temp_value;
          }

          epsilon_scale *= 1.0 + epsilon_subsegment * (1.0 + forgetting_offset);
        } else {
          epsilon_scale *= 1.0 + epsilon_subsegment * 1.0;
        }
      }
    }
  }

  temp_value = 1.0 / epsilon_scale;
#pragma omp simd
  for (counter = 0; counter < h_dim; counter++) {
    h_pointer[counter * h_dim_c1] = h_subsegment[counter] * temp_value;
  }

  delete[] h_temp;
  delete[] h_subsegment;

  return;
};

// ------------------------------------------------

void HDynamicCNNManyIP::gpu_occupancy_measure(size_t dim_x, size_t dim_y,
                                              size_t number_of_pattern,
                                              size_t h_dim) {
  grid_and_thread_calculated = false;
  assert((dim_x < 65535));
  assert((dim_y < 65535));

  grid_and_thread_settings.resize(14);

  occupancy_kernel_phxy_plus_phxy(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY], display_debug);

  occupancy_kernel_pxy_plus_v(dim_x, dim_y, number_of_pattern, h_dim,
                              grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V],
                              display_debug);

  occupancy_kernel_pxy_times_v(dim_x, dim_y, number_of_pattern, h_dim,
                               grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V],
                               display_debug);

  occupancy_kernel_phxy_fill_with_h(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H], display_debug);

  occupancy_kernel_phxy_plus_pxy(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY], display_debug);

  occupancy_kernel_pxy_reciprocal(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL], display_debug);

  occupancy_kernel_phxy_fill_with_spike_selected_w(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W],
      display_debug);

  occupancy_kernel_phxy_times_phxy_equals_phxy(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY],
      display_debug);

  occupancy_kernel_pxy_set_to_v(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V], display_debug);

  occupancy_kernel_phxy_one_over_sum_into_pxy(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY],
      display_debug);

  occupancy_kernel_phxy_times_pxy(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY], display_debug);

  occupancy_kernel_pxy_time_pxy(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY], display_debug);

  occupancy_kernel_approximation_pure_multiplication(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_APPROXIMATION_MULTIPLICATION],
      display_debug);

  occupancy_kernel_pxy_times_spike_selected_sxy(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY],
      display_debug);

  grid_and_thread_calculated = true;
  return;
};

void HDynamicCNNManyIP::gpu_occupancy_export(
    size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
    int64_t setting_memory_addr, size_t setting_dim_0, size_t setting_dim_1) {
  int64_t* setting_memory = (int64_t*)setting_memory_addr;

  assert((setting_memory != nullptr));
  assert((setting_dim_1 == H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS));

  gpu_occupancy_measure(dim_x, dim_y, number_of_pattern, h_dim);
  assert((grid_and_thread_calculated == true));

  assert((setting_dim_0 == grid_and_thread_settings.size()));

  for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) {
    for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) {
      setting_memory[counter_0 * setting_dim_1 + counter_1] =
          grid_and_thread_settings[counter_0][counter_1];
    }
  }
};

void HDynamicCNNManyIP::gpu_occupancy_import(int64_t setting_memory_addr,
                                             size_t setting_dim_0,
                                             size_t setting_dim_1) {
  grid_and_thread_calculated = false;

  int64_t* setting_memory = (int64_t*)setting_memory_addr;

  assert((setting_memory != nullptr));
  assert((setting_dim_1 == H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS));
  assert((setting_dim_0 == H_DYNAMIC_NUMBER_OF_KERNELS));

  grid_and_thread_settings.resize(H_DYNAMIC_NUMBER_OF_KERNELS);

  for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) {
    grid_and_thread_settings[counter_0].resize(
        H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS);

    for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) {
      grid_and_thread_settings[counter_0][counter_1] =
          setting_memory[counter_0 * setting_dim_1 + counter_1];
    }
  }

  grid_and_thread_calculated = true;
};

bool HDynamicCNNManyIP::gpu_update(
    float* h_init_ptr, float* h_pointer, size_t h_dim_c0, size_t h_dim_c1,
    size_t h_dim_c2, size_t h_dim, float* epsilon_xy_pointer,
    size_t epsilon_xy_dim_c0, size_t epsilon_xy_dim_c1,
    float* epsilon_t_pointer, float* weights_pointer, size_t weights_dim_c0,
    int64_t* input_pointer, size_t input_dim_c0, size_t input_dim_c1,
    size_t input_dim_c2, size_t number_of_spikes, size_t dim_x, size_t dim_y,
    float forgetting_offset, float forgetting_offset_local,
    size_t number_of_pattern, size_t gpu_tuning_factor,
    bool approximation_multiplication_enable, uint64_t number_of_frac_bits,
    bool approximation_enable, uint64_t number_of_trunc_bits,
    uint32_t ap_mask) {
  if (grid_and_thread_calculated == false) {
    gpu_occupancy_measure(dim_x, dim_y, number_of_pattern, h_dim);
  }
  assert((grid_and_thread_calculated == true));

  // Launch-configuration helpers: entries [0..2] of a settings row are the
  // grid dimensions and entries [3..5] the block dimensions measured by
  // gpu_occupancy_measure(); entry [6] (the task count) is passed to the
  // kernels explicitly below.
  auto grid_of = [&](size_t kernel_id) {
    return dim3(grid_and_thread_settings[kernel_id][0],
                grid_and_thread_settings[kernel_id][1],
                grid_and_thread_settings[kernel_id][2]);
  };
  auto block_of = [&](size_t kernel_id) {
    return dim3(grid_and_thread_settings[kernel_id][3],
                grid_and_thread_settings[kernel_id][4],
                grid_and_thread_settings[kernel_id][5]);
  };

  cudaError_t status;

  size_t h_sum_dim_c0 = dim_x * dim_y;
  size_t h_sum_dim_c1 = dim_y;

  size_t phxy_block_dim_c0 = h_dim * dim_x * dim_y;
  size_t phxy_block_dim_c1 = dim_x * dim_y;
  size_t phxy_block_dim_c2 = dim_y;

  size_t pxy_block_dim_c0 = dim_x * dim_y;
  size_t pxy_block_dim_c1 = dim_y;

  float* w_memory = nullptr;
  status = cudaMalloc((void**)&w_memory,
                      number_of_pattern * h_dim * dim_x * dim_y * sizeof(float));
  assert((status == cudaSuccess));

  float* h_temp_memory = nullptr;
  status = cudaMalloc((void**)&h_temp_memory,
                      number_of_pattern * h_dim * dim_x * dim_y * sizeof(float));
  assert((status == cudaSuccess));

  float* h_sum_memory = nullptr;
  status = cudaMalloc((void**)&h_sum_memory,
                      number_of_pattern * dim_x * dim_y * sizeof(float));
  assert((status == cudaSuccess));

  float* epsilon_subsegment_memory = nullptr;
  status = cudaMalloc((void**)&epsilon_subsegment_memory,
                      number_of_pattern * dim_x * dim_y * sizeof(float));
  assert((status == cudaSuccess));

  float* epsilon_scale_memory = nullptr;
  status = cudaMalloc((void**)&epsilon_scale_memory,
                      number_of_pattern * dim_x * dim_y * sizeof(float));
  assert((status == cudaSuccess));

  float* forget_memory = nullptr;
  status = cudaMalloc((void**)&forget_memory,
                      number_of_pattern * dim_x * dim_y * sizeof(float));
  assert((status == cudaSuccess));

  // ---

  // Initialize h
  kernel_phxy_fill_with_h<<<grid_of(ID_KERNEL_PHXY_FILL_WITH_H),
                            block_of(ID_KERNEL_PHXY_FILL_WITH_H)>>>(
      h_init_ptr, h_pointer, h_dim_c0, h_dim_c1, h_dim_c2, h_dim,
      phxy_block_dim_c0, phxy_block_dim_c1, phxy_block_dim_c2,
      grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][6]);
  status = cudaDeviceSynchronize();
  assert((status == cudaSuccess));

  // Set epsilon scale memory to 1.0
  kernel_pxy_set_to_v<<<grid_of(ID_KERNEL_PXY_SET_TO_V),
                        block_of(ID_KERNEL_PXY_SET_TO_V)>>>(
      epsilon_scale_memory, 1.0,
      grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]);
  status = cudaDeviceSynchronize();
  assert((status == cudaSuccess));

  for (size_t counter_spike = 0; counter_spike < number_of_spikes;
       counter_spike++) {
    // Get epsilon_t from gpu memory
    float epsilon_t;
    status = cudaMemcpy(&epsilon_t, &epsilon_t_pointer[counter_spike],
                        sizeof(float), cudaMemcpyDeviceToHost);
    assert((status == cudaSuccess));

    // Set epsilon subsegment memory to epsilon(t)
    kernel_pxy_set_to_v<<<grid_of(ID_KERNEL_PXY_SET_TO_V),
                          block_of(ID_KERNEL_PXY_SET_TO_V)>>>(
        epsilon_subsegment_memory, epsilon_t,
        grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // Fill forget memory with forgetting_offset_local
    kernel_pxy_set_to_v<<<grid_of(ID_KERNEL_PXY_SET_TO_V),
                          block_of(ID_KERNEL_PXY_SET_TO_V)>>>(
        forget_memory, forgetting_offset_local,
        grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // if (*spike >= 0) {
    //   epsilon_subsegment = epsilon_xy_pointer[*spike * epsilon_xy_dim_c0]
    kernel_pxy_times_spike_selected_sxy<<<
        grid_of(ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY),
        block_of(ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY)>>>(
        epsilon_subsegment_memory, epsilon_xy_pointer, input_pointer,
        counter_spike, input_dim_c0, input_dim_c1, input_dim_c2,
        epsilon_xy_dim_c0, epsilon_xy_dim_c1, epsilon_xy_dim_c0,
        epsilon_xy_dim_c1, pxy_block_dim_c0, pxy_block_dim_c1,
        grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // Get the weight vectors according to the spikes
    kernel_phxy_fill_with_spike_selected_w<<<
        grid_of(ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W),
        block_of(ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W)>>>(
        w_memory, weights_pointer, input_pointer, counter_spike, weights_dim_c0,
        input_dim_c0, input_dim_c1, input_dim_c2, h_dim_c0, h_dim_c1, h_dim_c2,
        h_dim, phxy_block_dim_c0, phxy_block_dim_c1, phxy_block_dim_c2,
        grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // h_temp = h * w
    if (approximation_multiplication_enable == false) {
      kernel_phxy_times_phxy_equals_phxy<<<
          grid_of(ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY),
          block_of(ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY)>>>(
          h_pointer, w_memory, h_temp_memory,
          grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY][6]);
    } else {
      kernel_approximation_pure_multiplication<<<
          grid_of(ID_KERNEL_APPROXIMATION_MULTIPLICATION),
          block_of(ID_KERNEL_APPROXIMATION_MULTIPLICATION)>>>(
          h_pointer, w_memory, h_temp_memory, number_of_frac_bits,
          approximation_enable, number_of_trunc_bits, ap_mask,
          grid_and_thread_settings[ID_KERNEL_APPROXIMATION_MULTIPLICATION][6]);
    }
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // 1 / sum h_temp
    kernel_phxy_one_over_sum_into_pxy<<<
        grid_of(ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY),
        block_of(ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY)>>>(
        h_temp_memory, h_sum_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim,
        h_sum_dim_c0, h_sum_dim_c1, pxy_block_dim_c0, pxy_block_dim_c1,
        grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // epsilon_scale / sum h_temp
    kernel_pxy_time_pxy<<<grid_of(ID_KERNEL_PXY_TIME_PXY),
                          block_of(ID_KERNEL_PXY_TIME_PXY)>>>(
        h_sum_memory, epsilon_scale_memory,
        grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // epsilon_subsegment * epsilon_scale / sum h_temp
    kernel_pxy_time_pxy<<<grid_of(ID_KERNEL_PXY_TIME_PXY),
                          block_of(ID_KERNEL_PXY_TIME_PXY)>>>(
        h_sum_memory, epsilon_subsegment_memory,
        grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // epsilon_scale * forget_memory (which contains forgetting_offset_local)
    kernel_pxy_time_pxy<<<grid_of(ID_KERNEL_PXY_TIME_PXY),
                          block_of(ID_KERNEL_PXY_TIME_PXY)>>>(
        forget_memory, epsilon_scale_memory,
        grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // delta_forget = epsilon_subsegment * epsilon_scale * forget_memory
    kernel_pxy_time_pxy<<<grid_of(ID_KERNEL_PXY_TIME_PXY),
                          block_of(ID_KERNEL_PXY_TIME_PXY)>>>(
        forget_memory, epsilon_subsegment_memory,
        grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // delta_h = h_temp_memory * epsilon_subsegment * epsilon_scale / sum h
    kernel_phxy_times_pxy<<<grid_of(ID_KERNEL_PHXY_TIMES_PXY),
                            block_of(ID_KERNEL_PHXY_TIMES_PXY)>>>(
        h_temp_memory, h_sum_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim,
        h_sum_dim_c0, h_sum_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1,
        phxy_block_dim_c2,
        grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // h + delta_h
    kernel_phxy_plus_phxy<<<grid_of(ID_KERNEL_PHXY_PLUS_PHXY),
                            block_of(ID_KERNEL_PHXY_PLUS_PHXY)>>>(
        h_pointer, h_temp_memory,
        grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // h + delta_h + delta_forget
    kernel_phxy_plus_pxy<<<grid_of(ID_KERNEL_PHXY_PLUS_PXY),
                           block_of(ID_KERNEL_PHXY_PLUS_PXY)>>>(
        h_pointer, forget_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim,
        h_sum_dim_c0, h_sum_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1,
        phxy_block_dim_c2,
        grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    kernel_pxy_times_v<<<grid_of(ID_KERNEL_PXY_TIMES_V),
                         block_of(ID_KERNEL_PXY_TIMES_V)>>>(
        epsilon_subsegment_memory, (1.0 + forgetting_offset),
        grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    kernel_pxy_plus_v<<<grid_of(ID_KERNEL_PXY_PLUS_V),
                        block_of(ID_KERNEL_PXY_PLUS_V)>>>(
        epsilon_subsegment_memory, 1.0,
        grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    // epsilon_scale * epsilon_subsegment
    kernel_pxy_time_pxy<<<grid_of(ID_KERNEL_PXY_TIME_PXY),
                          block_of(ID_KERNEL_PXY_TIME_PXY)>>>(
        epsilon_scale_memory, epsilon_subsegment_memory,
        grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]);
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));

    if (((counter_spike > 0) && (counter_spike % 1000 == 0)) ||
        (counter_spike + 1 == number_of_spikes)) {
      kernel_pxy_reciprocal<<<grid_of(ID_KERNEL_PXY_RECIPROCAL),
                              block_of(ID_KERNEL_PXY_RECIPROCAL)>>>(
          epsilon_scale_memory,
          grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][6]);
      status = cudaDeviceSynchronize();
      assert((status == cudaSuccess));

      kernel_phxy_times_pxy<<<grid_of(ID_KERNEL_PHXY_TIMES_PXY),
                              block_of(ID_KERNEL_PHXY_TIMES_PXY)>>>(
          h_pointer, epsilon_scale_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim,
          h_sum_dim_c0, h_sum_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1,
          phxy_block_dim_c2,
          grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][6]);
      status = cudaDeviceSynchronize();
      assert((status == cudaSuccess));

      // Set epsilon scale memory back to 1.0
      kernel_pxy_set_to_v<<<grid_of(ID_KERNEL_PXY_SET_TO_V),
                            block_of(ID_KERNEL_PXY_SET_TO_V)>>>(
          epsilon_scale_memory, 1.0,
          grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]);
      status = cudaDeviceSynchronize();
      assert((status == cudaSuccess));
    }
  }

  // ------------

  status = cudaFree(w_memory);
  assert((status == cudaSuccess));

  status = cudaFree(h_temp_memory);
  assert((status == cudaSuccess));

  status = cudaFree(h_sum_memory);
  assert((status == cudaSuccess));

  status = cudaFree(epsilon_subsegment_memory);
  assert((status == cudaSuccess));

  status = cudaFree(epsilon_scale_memory);
  assert((status == cudaSuccess));

  status = cudaFree(forget_memory);
  assert((status == cudaSuccess));

  return true;
};
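
For reference, ignoring the numerical guard clauses, the per-spike update that update_one_ip() implements for a single (x, y) position, with $\varepsilon = \text{epsilon\_xy}[s_t]\cdot\text{epsilon\_t}[t]$ and $f = \text{forgetting\_offset\_local}$, can be written as

$$h_{\text{temp}} = h \odot w_{s_t},\qquad
h \leftarrow h + \varepsilon_{\text{scale}}\,\varepsilon\left(\frac{h_{\text{temp}}}{\sum_i h_{\text{temp},i}} + f\,\mathbf{1}\right),\qquad
\varepsilon_{\text{scale}} \leftarrow \varepsilon_{\text{scale}}\,\bigl(1 + \varepsilon\,(1 + \text{forgetting\_offset})\bigr),$$

where the $f$ term and the $(1 + \text{forgetting\_offset})$ factor apply only when $f > 0$ (otherwise the factor is just $1 + \varepsilon$), and at the end $h/\varepsilon_{\text{scale}}$ is written back. The CPU path folds the scale factor into h whenever it exceeds 1e10; the GPU path in gpu_update() does so every 1000 spikes and at the last spike.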

114 network/CPP_Cuda_new_preview/HDynamicCNNManyIP.h Normal file
@@ -0,0 +1,114 @@
#ifndef HDYNAMICCNNMANYIP
#define HDYNAMICCNNMANYIP

#include <cuda.h>
#include <unistd.h>

#include <cctype>
#include <cstdint>  // int64_t / uint64_t / uint32_t used in the interfaces below
#include <iostream>
#include <vector>

#define ID_KERNEL_PHXY_PLUS_PHXY 0
#define ID_KERNEL_PXY_PLUS_V 1
#define ID_KERNEL_PXY_TIMES_V 2
#define ID_KERNEL_PHXY_FILL_WITH_H 3
#define ID_KERNEL_PHXY_PLUS_PXY 4
#define ID_KERNEL_PXY_RECIPROCAL 5
#define ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W 6
#define ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY 7
#define ID_KERNEL_PXY_SET_TO_V 8
#define ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY 9
#define ID_KERNEL_PHXY_TIMES_PXY 10
#define ID_KERNEL_PXY_TIME_PXY 11
#define ID_KERNEL_APPROXIMATION_MULTIPLICATION 12
#define ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY 13

#define H_DYNAMIC_NUMBER_OF_KERNELS 14
#define H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS 7

class HDynamicCNNManyIP {
 public:
  HDynamicCNNManyIP();
  ~HDynamicCNNManyIP();

  bool update_entrypoint(int64_t h_pointer_addr, int64_t h_dim_0,
                         int64_t h_dim_1, int64_t h_dim_2, int64_t h_dim_3,
                         int64_t epsilon_xy_pointer_addr,
                         int64_t epsilon_xy_dim_0, int64_t epsilon_xy_dim_1,
                         int64_t epsilon_xy_dim_2,
                         int64_t epsilon_t_pointer_addr,
                         int64_t epsilon_t_dim_0, int64_t weights_pointer_addr,
                         int64_t weights_dim_0, int64_t weights_dim_1,
                         int64_t input_pointer_addr, int64_t input_dim_0,
                         int64_t input_dim_1, int64_t input_dim_2,
                         int64_t input_dim_3, int64_t init_vector_pointer_addr,
                         int64_t init_vector_dim_0, int64_t number_of_processes,
                         float forgetting_offset, int64_t gpu_tuning_factor
                         // ,bool approximation_multiplication_enable, uint64_t
                         // number_of_frac_bits, bool approximation_enable,
                         // uint64_t number_of_trunc_bits
  );

  void gpu_occupancy_export(size_t dim_x, size_t dim_y,
                            size_t number_of_pattern, size_t h_dim,
                            int64_t setting_memory_addr, size_t setting_dim_0,
                            size_t setting_dim_1);

  void gpu_occupancy_import(int64_t setting_memory_addr, size_t setting_dim_0,
                            size_t setting_dim_1);

 private:
  bool update(float* h_init_ptr, float* h_pointer, size_t h_dim_c0,
              size_t h_dim_c1, size_t h_dim_c2, size_t h_dim,
              float* epsilon_xy_pointer, size_t epsilon_xy_dim_c0,
              size_t epsilon_xy_dim_c1, float* epsilon_t_pointer,
              float* weights_pointer, size_t weights_dim_c0,
              int64_t* input_pointer, size_t input_dim_c0, size_t input_dim_c1,
              size_t input_dim_c2, size_t number_of_spikes, size_t dim_x,
              size_t dim_y, float forgetting_offset,
              float forgetting_offset_local, size_t pattern_id,
              bool approximation_multiplication_enable,
              uint64_t number_of_frac_bits, bool approximation_enable,
              uint64_t number_of_trunc_bits, uint32_t ap_mask);

  void update_one_ip(float* h_init_ptr, float* h_pointer, size_t h_dim_c1,
                     size_t h_dim, float* weights_pointer,
                     size_t weights_dim_c0, int64_t* input_pointer,
                     size_t input_dim_c1, float* epsilon_xy_pointer,
                     size_t epsilon_xy_dim_c0, float* epsilon_t_pointer,
                     size_t number_of_spikes, float forgetting_offset,
                     float forgetting_offset_local);

  void update_one_ip_approx(
      float* h_init_ptr, float* h_pointer, size_t h_dim_c1, size_t h_dim,
      float* weights_pointer, size_t weights_dim_c0, int64_t* input_pointer,
      size_t input_dim_c1, float* epsilon_xy_pointer, size_t epsilon_xy_dim_c0,
      float* epsilon_t_pointer, size_t number_of_spikes,
      float forgetting_offset, float forgetting_offset_local,
      bool approximation_multiplication_enable, uint64_t number_of_frac_bits,
      bool approximation_enable, uint64_t number_of_trunc_bits,
      uint32_t ap_mask);

  bool gpu_update(float* h_init_ptr, float* h_pointer, size_t h_dim_c0,
                  size_t h_dim_c1, size_t h_dim_c2, size_t h_dim,
                  float* epsilon_xy_pointer, size_t epsilon_xy_dim_c0,
                  size_t epsilon_xy_dim_c1, float* epsilon_t_pointer,
                  float* weights_pointer, size_t weights_dim_c0,
                  int64_t* input_pointer, size_t input_dim_c0,
                  size_t input_dim_c1, size_t input_dim_c2,
                  size_t number_of_spikes, size_t dim_x, size_t dim_y,
                  float forgetting_offset, float forgetting_offset_local,
                  size_t number_of_pattern, size_t gpu_tuning_factor,
                  bool approximation_multiplication_enable,
                  uint64_t number_of_frac_bits, bool approximation_enable,
                  uint64_t number_of_trunc_bits, uint32_t ap_mask);

  void gpu_occupancy_measure(size_t dim_x, size_t dim_y,
                             size_t number_of_pattern, size_t h_dim);

  bool grid_and_thread_calculated = false;
  std::vector<std::vector<size_t>> grid_and_thread_settings;
  bool display_debug = false;
};

#endif /* HDYNAMICCNNMANYIP */
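
A minimal, hypothetical C++ caller for the OpenMP (CPU) path of update_entrypoint() is sketched below. Buffer addresses are handed over as int64_t values and the shapes follow the asserts in HDynamicCNNManyIP.cu; all sizes here are illustrative assumptions, not part of the commit (the repository itself reaches this entry point through the PyBind11 wrapper PyHDynamicCNNManyIP.cpp).

// Hypothetical caller; h has shape (pattern, h_dim, x, y), input holds spike
// indices with shape (pattern, spikes, x, y), weights has shape (rows, h_dim),
// epsilon_xy has shape (rows, x, y), epsilon_t has shape (spikes).
#include <cstdint>
#include <vector>

#include "HDynamicCNNManyIP.h"

int main() {
  int64_t pattern = 2, spikes = 3, dim_x = 4, dim_y = 4, h_dim = 8, w_rows = 16;

  std::vector<float> h(pattern * h_dim * dim_x * dim_y, 0.0f);
  std::vector<float> h_init(h_dim, 1.0f / static_cast<float>(h_dim));
  std::vector<float> epsilon_xy(w_rows * dim_x * dim_y, 0.1f);
  std::vector<float> epsilon_t(spikes, 1.0f);
  std::vector<float> weights(w_rows * h_dim, 1.0f);
  std::vector<int64_t> input(pattern * spikes * dim_x * dim_y, 0);

  HDynamicCNNManyIP module;
  module.update_entrypoint(
      reinterpret_cast<int64_t>(h.data()), pattern, h_dim, dim_x, dim_y,
      reinterpret_cast<int64_t>(epsilon_xy.data()), w_rows, dim_x, dim_y,
      reinterpret_cast<int64_t>(epsilon_t.data()), spikes,
      reinterpret_cast<int64_t>(weights.data()), w_rows, h_dim,
      reinterpret_cast<int64_t>(input.data()), pattern, spikes, dim_x, dim_y,
      reinterpret_cast<int64_t>(h_init.data()), h_dim,
      /* number_of_processes = */ 4,
      /* forgetting_offset   = */ 0.0f,
      /* gpu_tuning_factor   = */ 0);
  return 0;
}

With number_of_processes > 0 the class runs the OpenMP loop over patterns; with 0 it takes the gpu_update() path, in which case the buffers must be device memory.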

288 network/CPP_Cuda_new_preview/Makefile Normal file
@@ -0,0 +1,288 @@
# Change to your python bin directory (tested with Python 3.10.4)
PYBIN=~/P3.10GPU/bin/
NVCC=/usr/local/cuda-12/bin/nvcc
CC=/usr/lib64/ccache/clang++

PYBIND11INCLUDE=`$(PYBIN)python3 -m pybind11 --includes`
PARAMETERS_O=--allow-unsupported-compiler \
  -O3 -std=c++14 \
  $(PYBIND11INCLUDE) \
  -ccbin=$(CC) \
  -Xcompiler "-fPIC -Wall -fopenmp=libomp" \
  --gpu-architecture=sm_86 \
  --generate-line-info

PARAMETERS_Linker=--allow-unsupported-compiler \
  -Xcompiler "-shared -lm -lomp -lstdc++ -Wall" \
  --gpu-architecture=sm_86 \
  --generate-line-info

PYPOSTFIX=`$(PYBIN)python3-config --extension-suffix`


all: PyHDynamicCNNManyIP \
  PySpikeGeneration2DManyIP \
  PyMultiApp \
  PyTestKernel

#######################

HDynamicCNNManyIP.o: \
  HDynamicCNNManyIP.h \
  HDynamicCNNManyIP.cu \
  kernel_helper_functions.h \
  kernel_phxy_plus_pxy.h \
  kernel_pxy_plus_v.h \
  kernel_pxy_time_pxy.h \
  kernel_phxy_fill_with_h.h \
  kernel_phxy_times_phxy_equals_phxy.h \
  kernel_pxy_reciprocal.h \
  kernel_pxy_times_v.h \
  kernel_phxy_plus_phxy.h \
  kernel_phxy_times_pxy.h \
  kernel_pxy_set_to_v.h \
  kernel_phxy_one_over_sum_into_pxy.h \
  approximation_multiplication_function.h \
  kernel_approximation_multiplication.h \
  kernel_pxy_times_spike_selected_sxy.h \
  kernel_phxy_fill_with_spike_selected_w.h
	$(NVCC) $(PARAMETERS_O) -c HDynamicCNNManyIP.cu -o HDynamicCNNManyIP.o

PyHDynamicCNNManyIP.o: HDynamicCNNManyIP.h PyHDynamicCNNManyIP.cpp
	$(NVCC) $(PARAMETERS_O) -c PyHDynamicCNNManyIP.cpp -o PyHDynamicCNNManyIP.o

PyHDynamicCNNManyIP: \
  HDynamicCNNManyIP.o \
  PyHDynamicCNNManyIP.o \
  kernel_pxy_plus_v.o \
  kernel_pxy_set_to_v.o \
  kernel_pxy_reciprocal.o \
  kernel_pxy_time_pxy.o \
  kernel_pxy_times_v.o \
  kernel_phxy_times_phxy_equals_phxy.o \
  kernel_phxy_plus_phxy.o \
  kernel_phxy_plus_pxy.o \
  kernel_phxy_times_pxy.o \
  kernel_phxy_fill_with_h.o \
  kernel_helper_functions.o \
  kernel_phxy_one_over_sum_into_pxy.o \
  approximation_multiplication_function.o \
  error_term.o \
  kernel_approximation_multiplication.o \
  kernel_pxy_times_spike_selected_sxy.o \
  kernel_phxy_fill_with_spike_selected_w.o
	$(NVCC) $(PARAMETERS_Linker) -o PyHDynamicCNNManyIP \
	  HDynamicCNNManyIP.o \
	  PyHDynamicCNNManyIP.o \
	  kernel_pxy_plus_v.o \
	  kernel_pxy_set_to_v.o \
	  kernel_pxy_reciprocal.o \
	  kernel_pxy_time_pxy.o \
	  kernel_pxy_times_v.o \
	  kernel_phxy_times_phxy_equals_phxy.o \
	  kernel_phxy_plus_phxy.o \
	  kernel_phxy_plus_pxy.o \
	  kernel_phxy_times_pxy.o \
	  kernel_phxy_fill_with_h.o \
	  kernel_helper_functions.o \
	  kernel_phxy_one_over_sum_into_pxy.o \
	  approximation_multiplication_function.o \
	  error_term.o \
	  kernel_approximation_multiplication.o \
	  kernel_pxy_times_spike_selected_sxy.o \
	  kernel_phxy_fill_with_spike_selected_w.o
	cp PyHDynamicCNNManyIP PyHDynamicCNNManyIP$(PYPOSTFIX)
	$(PYBIN)python3 pybind11_auto_pyi.py

#######################

SpikeGeneration2DManyIP.o: \
  SpikeGeneration2DManyIP.h \
  SpikeGeneration2DManyIP.cu \
  kernel_spike_generation.h
	$(NVCC) $(PARAMETERS_O) -c SpikeGeneration2DManyIP.cu -o SpikeGeneration2DManyIP.o

PySpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h PySpikeGeneration2DManyIP.cpp
	$(NVCC) $(PARAMETERS_O) -c PySpikeGeneration2DManyIP.cpp -o PySpikeGeneration2DManyIP.o

PySpikeGeneration2DManyIP: \
  SpikeGeneration2DManyIP.o \
  PySpikeGeneration2DManyIP.o \
  kernel_helper_functions.o \
  kernel_spike_generation.o
	$(NVCC) $(PARAMETERS_Linker) -o PySpikeGeneration2DManyIP \
	  SpikeGeneration2DManyIP.o \
	  PySpikeGeneration2DManyIP.o \
	  kernel_helper_functions.o \
	  kernel_spike_generation.o
	cp PySpikeGeneration2DManyIP PySpikeGeneration2DManyIP$(PYPOSTFIX)
	$(PYBIN)python3 pybind11_auto_pyi.py


#######################

MultiApp.o: \
  MultiApp.h \
  MultiApp.cu \
  approximation_multiplication_function.h \
  kernel_approximation_multiplication.h \
  error_term.cpp
	$(NVCC) $(PARAMETERS_O) -c MultiApp.cu -o MultiApp.o

PyMultiApp.o: MultiApp.h PyMultiApp.cpp
	$(NVCC) $(PARAMETERS_O) -c PyMultiApp.cpp -o PyMultiApp.o

PyMultiApp: \
  MultiApp.o \
  PyMultiApp.o \
  approximation_multiplication_function.o \
  error_term.o \
  kernel_approximation_multiplication.o \
  kernel_helper_functions.o
	$(NVCC) $(PARAMETERS_Linker) -o PyMultiApp \
	  MultiApp.o \
	  PyMultiApp.o \
	  approximation_multiplication_function.o \
	  error_term.o \
	  kernel_approximation_multiplication.o \
	  kernel_helper_functions.o
	cp PyMultiApp PyMultiApp$(PYPOSTFIX)
	$(PYBIN)python3 pybind11_auto_pyi.py

#######################
clean:
	rm -f PyHDynamicCNNManyIP
	rm -f PySpikeGeneration2DManyIP
	rm -f PyMultiApp
	rm -f PyTestKernel
	rm -f *.o
	rm -f *.so

#######################

TestKernel.o: \
  TestKernel.cu \
  TestKernel.h \
  kernel_helper_functions.h \
  kernel_phxy_plus_pxy.h \
  kernel_pxy_plus_v.h \
  kernel_pxy_time_pxy.h \
  kernel_phxy_fill_with_h.h \
  kernel_phxy_times_phxy_equals_phxy.h \
  kernel_pxy_reciprocal.h \
  kernel_pxy_times_v.h \
  kernel_phxy_plus_phxy.h \
  kernel_phxy_times_pxy.h \
  kernel_pxy_set_to_v.h \
  kernel_phxy_one_over_sum_into_pxy.h \
  kernel_pxy_times_spike_selected_sxy.h \
  kernel_phxy_fill_with_spike_selected_w.h
	$(NVCC) $(PARAMETERS_O) -c TestKernel.cu -o TestKernel.o

PyTestKernel.o: PyTestKernel.cpp TestKernel.h
	$(NVCC) $(PARAMETERS_O) -c PyTestKernel.cpp -o PyTestKernel.o

PyTestKernel: \
  TestKernel.o \
  PyTestKernel.o \
  kernel_pxy_plus_v.o \
  kernel_pxy_set_to_v.o \
  kernel_pxy_reciprocal.o \
  kernel_pxy_time_pxy.o \
  kernel_pxy_times_v.o \
  kernel_phxy_times_phxy_equals_phxy.o \
  kernel_phxy_plus_phxy.o \
  kernel_phxy_plus_pxy.o \
  kernel_phxy_times_pxy.o \
  kernel_phxy_fill_with_h.o \
  kernel_helper_functions.o \
  kernel_phxy_one_over_sum_into_pxy.o \
  kernel_pxy_times_spike_selected_sxy.o \
  kernel_phxy_fill_with_spike_selected_w.o
	$(NVCC) $(PARAMETERS_Linker) -o PyTestKernel \
	  TestKernel.o \
	  PyTestKernel.o \
	  kernel_pxy_plus_v.o \
	  kernel_pxy_set_to_v.o \
	  kernel_pxy_reciprocal.o \
	  kernel_pxy_time_pxy.o \
	  kernel_pxy_times_v.o \
	  kernel_phxy_times_phxy_equals_phxy.o \
	  kernel_phxy_plus_phxy.o \
	  kernel_phxy_plus_pxy.o \
	  kernel_phxy_times_pxy.o \
	  kernel_phxy_fill_with_h.o \
	  kernel_helper_functions.o \
	  kernel_phxy_one_over_sum_into_pxy.o \
	  kernel_pxy_times_spike_selected_sxy.o \
	  kernel_phxy_fill_with_spike_selected_w.o
	cp PyTestKernel PyTestKernel$(PYPOSTFIX)
	$(PYBIN)python3 pybind11_auto_pyi.py

kernel_pxy_plus_v.o: kernel_pxy_plus_v.cu kernel_pxy_plus_v.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_pxy_plus_v.cu -o kernel_pxy_plus_v.o

kernel_pxy_set_to_v.o: kernel_pxy_set_to_v.cu kernel_pxy_set_to_v.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_pxy_set_to_v.cu -o kernel_pxy_set_to_v.o

kernel_pxy_reciprocal.o: kernel_pxy_reciprocal.cu kernel_pxy_reciprocal.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_pxy_reciprocal.cu -o kernel_pxy_reciprocal.o

kernel_pxy_time_pxy.o: kernel_pxy_time_pxy.cu kernel_pxy_time_pxy.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_pxy_time_pxy.cu -o kernel_pxy_time_pxy.o

kernel_pxy_times_v.o: kernel_pxy_times_v.cu kernel_pxy_times_v.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_pxy_times_v.cu -o kernel_pxy_times_v.o

kernel_phxy_times_phxy_equals_phxy.o: kernel_phxy_times_phxy_equals_phxy.cu kernel_phxy_times_phxy_equals_phxy.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_phxy_times_phxy_equals_phxy.cu -o kernel_phxy_times_phxy_equals_phxy.o

kernel_phxy_plus_phxy.o: kernel_phxy_plus_phxy.cu kernel_phxy_plus_phxy.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_phxy_plus_phxy.cu -o kernel_phxy_plus_phxy.o

kernel_phxy_plus_pxy.o: kernel_phxy_plus_pxy.cu kernel_phxy_plus_pxy.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_phxy_plus_pxy.cu -o kernel_phxy_plus_pxy.o

kernel_phxy_times_pxy.o: kernel_phxy_times_pxy.cu kernel_phxy_times_pxy.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_phxy_times_pxy.cu -o kernel_phxy_times_pxy.o

kernel_phxy_fill_with_h.o: kernel_phxy_fill_with_h.cu kernel_phxy_fill_with_h.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_phxy_fill_with_h.cu -o kernel_phxy_fill_with_h.o

kernel_helper_functions.o: kernel_helper_functions.cu kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_helper_functions.cu -o kernel_helper_functions.o

kernel_phxy_one_over_sum_into_pxy.o: kernel_phxy_one_over_sum_into_pxy.cu kernel_phxy_one_over_sum_into_pxy.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_phxy_one_over_sum_into_pxy.cu -o kernel_phxy_one_over_sum_into_pxy.o

kernel_phxy_fill_with_spike_selected_w.o: kernel_phxy_fill_with_spike_selected_w.cu kernel_phxy_fill_with_spike_selected_w.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_phxy_fill_with_spike_selected_w.cu -o kernel_phxy_fill_with_spike_selected_w.o

kernel_spike_generation.o: kernel_spike_generation.cu kernel_spike_generation.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_spike_generation.cu -o kernel_spike_generation.o

kernel_approximation_multiplication.o: \
  kernel_approximation_multiplication.cu \
  kernel_approximation_multiplication.h \
  kernel_helper_functions.h \
  kernel_approximation_error_term.cu
	$(NVCC) $(PARAMETERS_O) -c kernel_approximation_multiplication.cu -o kernel_approximation_multiplication.o

approximation_multiplication_function.o: \
  approximation_multiplication_function.cpp \
  approximation_multiplication_function.h \
  error_term.h
	$(NVCC) $(PARAMETERS_O) -c approximation_multiplication_function.cpp -o approximation_multiplication_function.o

error_term.o: error_term.cpp error_term.h
	$(NVCC) $(PARAMETERS_O) -c error_term.cpp -o error_term.o

kernel_pxy_times_spike_selected_sxy.o: kernel_pxy_times_spike_selected_sxy.cu kernel_pxy_times_spike_selected_sxy.h kernel_helper_functions.h
	$(NVCC) $(PARAMETERS_O) -c kernel_pxy_times_spike_selected_sxy.cu -o kernel_pxy_times_spike_selected_sxy.o


# .o: .cu .h kernel_helper_functions.h
# 	$(NVCC) $(PARAMETERS_O) -c .cu -o .o

273
network/CPP_Cuda_new_preview/MultiApp.cu
Normal file
273
network/CPP_Cuda_new_preview/MultiApp.cu
Normal file
|
@ -0,0 +1,273 @@
|
|||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "MultiApp.h"
|
||||
#include "approximation_multiplication_function.h"
|
||||
#include "kernel_approximation_multiplication.h"
|
||||
|
||||
MultiApp::MultiApp(){
|
||||
|
||||
};
|
||||
|
||||
MultiApp::~MultiApp(){
|
||||
|
||||
};
|
||||
|
||||
bool MultiApp::update(float* np_input_pointer, float* np_weight_pointer,
|
||||
float* np_output_pointer, int64_t pattern_dim,
|
||||
int64_t feature_dim, int64_t x_dim, int64_t y_dim,
|
||||
int64_t input_channel_dim, int64_t id_pattern,
|
||||
bool approximation_enable, int64_t number_of_trunc_bits,
|
||||
int64_t number_of_frac_bits) {
|
||||
assert((id_pattern >= 0));
|
||||
assert((id_pattern < pattern_dim));
|
||||
|
||||
float* np_input_pointer_pattern;
|
||||
float* np_output_pointer_pattern;
|
||||
|
||||
float* input_ptr;
|
||||
float* output_ptr;
|
||||
float* w_ptr;
|
||||
|
||||
uint64_t pattern_size = input_channel_dim;
|
||||
|
||||
std::vector<float> ap_h_vector;
|
||||
ap_h_vector.resize(pattern_size);
|
||||
float* ap_h_ptr = ap_h_vector.data();
|
||||
|
||||
std::vector<uint32_t> ap_x_vector;
|
||||
ap_x_vector.resize(pattern_size);
|
||||
uint32_t* ap_x_ptr = ap_x_vector.data();
|
||||
|
||||
std::vector<uint32_t> ap_y_vector;
|
||||
ap_y_vector.resize(pattern_size);
|
||||
uint32_t* ap_y_ptr = ap_y_vector.data();
|
||||
|
||||
std::vector<uint32_t> ap_x_exponent_vector;
|
||||
ap_x_exponent_vector.resize(pattern_size);
|
||||
uint32_t* ap_x_exponent_ptr = ap_x_exponent_vector.data();
|
||||
|
||||
std::vector<uint32_t> ap_y_exponent_vector;
|
||||
ap_y_exponent_vector.resize(pattern_size);
|
||||
uint32_t* ap_y_exponent_ptr = ap_y_exponent_vector.data();
|
||||
|
||||
std::vector<uint32_t> ap_h_exponent_vector;
|
||||
ap_h_exponent_vector.resize(pattern_size);
|
||||
uint32_t* ap_h_exponent_ptr = ap_h_exponent_vector.data();
|
||||
|
||||
std::vector<uint64_t> ap_res_vector;
|
||||
ap_res_vector.resize(pattern_size);
|
||||
uint64_t* ap_res_ptr = ap_res_vector.data();
|
||||
|
||||
uint32_t ap_mask = static_cast<uint64_t>(pow(2, number_of_trunc_bits)) - 1;
|
||||
|
||||
std::vector<uint32_t> sign_temp_vector;
|
||||
sign_temp_vector.resize(pattern_size);
|
||||
uint32_t* sign_temp_ptr = sign_temp_vector.data();
|
||||
|
||||
uint64_t input_pattern_size = input_channel_dim * x_dim * y_dim;
|
||||
uint64_t output_pattern_size = feature_dim * x_dim * y_dim;
|
||||
|
||||
np_input_pointer_pattern = np_input_pointer + id_pattern * input_pattern_size;
|
||||
np_output_pointer_pattern =
|
||||
np_output_pointer + id_pattern * output_pattern_size;
|
||||
|
||||
uint64_t counter;
|
||||
|
||||
uint64_t counter_x;
|
||||
uint64_t counter_y;
|
||||
uint64_t counter_feature;
|
||||
uint64_t pos_xy;
|
||||
uint64_t pos_xy_if;
|
||||
|
||||
float temp_sum;
|
||||
|
||||
uint64_t pattern_c_2 = x_dim * y_dim;
|
||||
|
||||
for (counter_x = 0; counter_x < x_dim; counter_x++) {
|
||||
for (counter_y = 0; counter_y < y_dim; counter_y++) {
|
||||
pos_xy = counter_y + counter_x * y_dim;
|
||||
for (counter_feature = 0; counter_feature < feature_dim;
|
||||
counter_feature++) {
|
||||
pos_xy_if = counter_feature * pattern_c_2 + pos_xy;
|
||||
|
||||
input_ptr = np_input_pointer_pattern + pos_xy;
|
||||
output_ptr = np_output_pointer_pattern + pos_xy_if;
|
||||
w_ptr = np_weight_pointer + counter_feature * input_channel_dim;
|
||||
|
||||
#pragma omp simd
|
||||
for (counter = 0; counter < pattern_size; counter++) {
|
||||
ap_h_ptr[counter] = input_ptr[counter * pattern_c_2];
|
||||
}
|
||||
|
||||
approximation_multiplication_function(
|
||||
ap_h_ptr, w_ptr, pattern_size, number_of_trunc_bits,
|
||||
number_of_frac_bits, ap_x_ptr, ap_y_ptr, ap_x_exponent_ptr,
|
||||
ap_y_exponent_ptr, ap_h_exponent_ptr, ap_mask, ap_res_ptr,
|
||||
sign_temp_ptr, approximation_enable);
|
||||
|
||||
temp_sum = 0.0;
|
||||
#pragma omp simd reduction(+ : temp_sum)
|
||||
for (counter = 0; counter < pattern_size; counter++) {
|
||||
temp_sum += ap_h_ptr[counter];
|
||||
}
|
||||
|
||||
output_ptr[0] = temp_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
bool MultiApp::update_entrypoint(
|
||||
int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr,
|
||||
int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim,
|
||||
int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
|
||||
int64_t number_of_processes, bool approximation_enable,
|
||||
int64_t number_of_trunc_bits, int64_t number_of_frac) {
|
||||
int64_t number_of_pattern = pattern_dim;
|
||||
int64_t pattern_id;
|
||||
|
||||
float* np_input_pointer = (float*)np_input_pointer_addr;
|
||||
float* np_weight_pointer = (float*)np_weight_pointer_addr;
|
||||
float* np_output_pointer = (float*)np_output_pointer_addr;
|
||||
|
||||
assert((np_input_pointer != nullptr));
|
||||
assert((np_output_pointer != nullptr));
|
||||
assert((np_weight_pointer != nullptr));
|
||||
|
||||
assert((pattern_dim > 0));
|
||||
assert((feature_dim > 0));
|
||||
assert((x_dim > 0));
|
||||
assert((y_dim > 0));
|
||||
assert((input_channel_dim > 0));
|
||||
|
||||
if (number_of_processes > 0) {
|
||||
omp_set_num_threads(number_of_processes);
|
||||
// For debugging: Only one thread
|
||||
// omp_set_num_threads(1);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++) {
|
||||
update(np_input_pointer, np_weight_pointer, np_output_pointer,
|
||||
pattern_dim, feature_dim, x_dim, y_dim, input_channel_dim,
|
||||
pattern_id, approximation_enable, number_of_trunc_bits,
|
||||
number_of_frac);
|
||||
}
|
||||
} else {
|
||||
update_gpu(np_input_pointer, np_weight_pointer, np_output_pointer,
|
||||
pattern_dim, feature_dim, x_dim, y_dim, input_channel_dim,
|
||||
approximation_enable, number_of_trunc_bits, number_of_frac);
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
void MultiApp::gpu_occupancy_measure(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim) {
|
||||
grid_and_thread_calculated = false;
|
||||
assert((dim_x < 65535));
|
||||
assert((dim_y < 65535));
|
||||
|
||||
grid_and_thread_settings.resize(1);
|
||||
|
||||
occupancy_kernel_approximation_multiplication(
|
||||
dim_x, dim_y, number_of_pattern, h_dim, grid_and_thread_settings[0],
|
||||
display_debug);
|
||||
|
||||
grid_and_thread_calculated = true;
|
||||
return;
|
||||
};
|
||||
|
||||
void MultiApp::gpu_occupancy_export(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
int64_t setting_memory_addr,
|
||||
size_t setting_dim_0,
|
||||
size_t setting_dim_1) {
|
||||
int64_t* setting_memory = (int64_t*)setting_memory_addr;
|
||||
|
||||
assert((setting_memory != nullptr));
|
||||
assert((setting_dim_1 == APPROXI_MULTI_NUMBER_OF_KERNELS_PARAMETERS));
|
||||
|
||||
gpu_occupancy_measure(dim_x, dim_y, number_of_pattern, h_dim);
|
||||
assert((grid_and_thread_calculated == true));
|
||||
|
||||
assert((setting_dim_0 == grid_and_thread_settings.size()));
|
||||
|
||||
for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) {
|
||||
for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) {
|
||||
setting_memory[counter_0 * setting_dim_1 + counter_1] =
|
||||
grid_and_thread_settings[counter_0][counter_1];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void MultiApp::gpu_occupancy_import(int64_t setting_memory_addr,
|
||||
size_t setting_dim_0,
|
||||
size_t setting_dim_1) {
|
||||
grid_and_thread_calculated = false;
|
||||
|
||||
int64_t* setting_memory = (int64_t*)setting_memory_addr;
|
||||
|
||||
assert((setting_memory != nullptr));
|
||||
assert((setting_dim_1 == APPROXI_MULTI_NUMBER_OF_KERNELS_PARAMETERS));
|
||||
assert((setting_dim_0 == APPROXI_MULTI_NUMBER_OF_KERNELS));
|
||||
|
||||
grid_and_thread_settings.resize(APPROXI_MULTI_NUMBER_OF_KERNELS);
|
||||
|
||||
for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) {
|
||||
grid_and_thread_settings[counter_0].resize(
|
||||
APPROXI_MULTI_NUMBER_OF_KERNELS_PARAMETERS);
|
||||
|
||||
for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) {
|
||||
grid_and_thread_settings[counter_0][counter_1] =
|
||||
setting_memory[counter_0 * setting_dim_1 + counter_1];
|
||||
}
|
||||
}
|
||||
|
||||
grid_and_thread_calculated = true;
|
||||
};
|
||||
|
||||
void MultiApp::update_gpu(float* np_input_pointer, float* np_weight_pointer,
|
||||
float* np_output_pointer, uint64_t pattern_dim,
|
||||
uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
|
||||
uint64_t input_channel_dim, bool approximation_enable,
|
||||
uint64_t number_of_trunc_bits,
|
||||
uint64_t number_of_frac_bits) {
|
||||
if (grid_and_thread_calculated == false) {
|
||||
gpu_occupancy_measure(x_dim, y_dim, pattern_dim, feature_dim);
|
||||
}
|
||||
assert((grid_and_thread_calculated == true));
|
||||
|
||||
uint32_t ap_mask = static_cast<uint64_t>(pow(2, number_of_trunc_bits)) - 1;
|
||||
// std::cout << approximation_enable << std::endl;
|
||||
// std::cout << number_of_trunc_bits << std::endl;
|
||||
// std::cout << number_of_frac_bits << std::endl;
|
||||
|
||||
cudaError_t status;
|
||||
|
||||
size_t pfxy_block_dim_c0 = feature_dim * x_dim * y_dim;
|
||||
size_t pfxy_block_dim_c1 = x_dim * y_dim;
|
||||
size_t pfxy_block_dim_c2 = y_dim;
|
||||
|
||||
kernel_approximation_multiplication<<<
|
||||
dim3(grid_and_thread_settings[0][0], grid_and_thread_settings[0][1],
|
||||
grid_and_thread_settings[0][2]),
|
||||
dim3(grid_and_thread_settings[0][3], grid_and_thread_settings[0][4],
|
||||
grid_and_thread_settings[0][5])>>>(
|
||||
np_input_pointer, np_weight_pointer, np_output_pointer, pattern_dim,
|
||||
feature_dim, x_dim, y_dim, input_channel_dim,
|
||||
grid_and_thread_settings[0][6], (x_dim * y_dim), number_of_frac_bits,
|
||||
approximation_enable, number_of_trunc_bits, ap_mask, pfxy_block_dim_c0,
|
||||
pfxy_block_dim_c1, pfxy_block_dim_c2);
|
||||
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
60
network/CPP_Cuda_new_preview/MultiApp.h
Normal file
60
network/CPP_Cuda_new_preview/MultiApp.h
Normal file
|
@ -0,0 +1,60 @@
|
|||
#ifndef MULTIAPP
|
||||
#define MULTIAPP
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
#include <cctype>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#define APPROXI_MULTI_NUMBER_OF_KERNELS 1
|
||||
#define APPROXI_MULTI_NUMBER_OF_KERNELS_PARAMETERS 7
|
||||
|
||||
class MultiApp
|
||||
{
|
||||
public:
|
||||
MultiApp();
|
||||
~MultiApp();
|
||||
|
||||
|
||||
bool update_entrypoint(
|
||||
int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr,
|
||||
int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim,
|
||||
int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
|
||||
int64_t number_of_processes, bool approximation_enable,
|
||||
int64_t number_of_trunc_bits, int64_t number_of_frac);
|
||||
|
||||
void gpu_occupancy_export(size_t dim_x, size_t dim_y, size_t number_of_pattern,
|
||||
size_t h_dim, int64_t setting_memory_addr, size_t setting_dim_0, size_t setting_dim_1);
|
||||
|
||||
void gpu_occupancy_import(
|
||||
int64_t setting_memory_addr,
|
||||
size_t setting_dim_0,
|
||||
size_t setting_dim_1);
|
||||
|
||||
private:
|
||||
|
||||
bool update(float* np_input_pointer, float* np_weight_pointer,
|
||||
float* np_output_pointer, int64_t pattern_dim,
|
||||
int64_t feature_dim, int64_t x_dim, int64_t y_dim,
|
||||
int64_t input_channel_dim, int64_t id_pattern,
|
||||
bool approximation_enable, int64_t number_of_trunc_bits,
|
||||
int64_t number_of_frac);
|
||||
|
||||
void update_gpu(float* input_pointer, float* weight_pointer,
|
||||
float* output_pointer, uint64_t pattern_dim,
|
||||
uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
|
||||
uint64_t input_channel_dim,
|
||||
bool approximation_enable, uint64_t number_of_trunc_bits,
|
||||
uint64_t number_of_frac);
|
||||
|
||||
void gpu_occupancy_measure(size_t dim_x, size_t dim_y, size_t number_of_pattern,
|
||||
size_t h_dim);
|
||||
|
||||
bool grid_and_thread_calculated = false;
|
||||
std::vector<std::vector<size_t>> grid_and_thread_settings;
|
||||
bool display_debug = false;
|
||||
|
||||
};
|
||||
|
||||
#endif /* MULTIAPP */
|
18
network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.cpp
Normal file
18
network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.cpp
Normal file
|
@ -0,0 +1,18 @@
|
|||
#include <pybind11/pybind11.h>
|
||||
|
||||
#include "HDynamicCNNManyIP.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
PYBIND11_MODULE(PyHDynamicCNNManyIP, m)
|
||||
{
|
||||
m.doc() = "HDynamicCNNManyIP Module";
|
||||
py::class_<HDynamicCNNManyIP>(m, "HDynamicCNNManyIP")
|
||||
.def(py::init<>())
|
||||
.def("gpu_occupancy_export",
|
||||
&HDynamicCNNManyIP::gpu_occupancy_export)
|
||||
.def("gpu_occupancy_import",
|
||||
&HDynamicCNNManyIP::gpu_occupancy_import)
|
||||
.def("update",
|
||||
&HDynamicCNNManyIP::update_entrypoint);
|
||||
}
|
20
network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.pyi
Normal file
20
network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.pyi
Normal file
|
@ -0,0 +1,20 @@
|
|||
#
|
||||
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
|
||||
#
|
||||
|
||||
"""HDynamicCNNManyIP Module"""
|
||||
from __future__ import annotations
|
||||
import PyHDynamicCNNManyIP
|
||||
import typing
|
||||
|
||||
__all__ = [
|
||||
"HDynamicCNNManyIP"
|
||||
]
|
||||
|
||||
|
||||
class HDynamicCNNManyIP():
|
||||
def __init__(self) -> None: ...
|
||||
def gpu_occupancy_export(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int) -> None: ...
|
||||
def gpu_occupancy_import(self, arg0: int, arg1: int, arg2: int) -> None: ...
|
||||
def update(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int, arg16: int, arg17: int, arg18: int, arg19: int, arg20: int, arg21: int, arg22: float, arg23: int) -> bool: ...
|
||||
pass
|
15
network/CPP_Cuda_new_preview/PyMultiApp.cpp
Normal file
15
network/CPP_Cuda_new_preview/PyMultiApp.cpp
Normal file
|
@ -0,0 +1,15 @@
|
|||
|
||||
#include <pybind11/pybind11.h>
|
||||
|
||||
#include "MultiApp.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
PYBIND11_MODULE(PyMultiApp, m) {
|
||||
m.doc() = "MultiApp Module";
|
||||
py::class_<MultiApp>(m, "MultiApp")
|
||||
.def(py::init<>())
|
||||
.def("gpu_occupancy_export", &MultiApp::gpu_occupancy_export)
|
||||
.def("gpu_occupancy_import", &MultiApp::gpu_occupancy_import)
|
||||
.def("update_entrypoint", &MultiApp::update_entrypoint);
|
||||
}
|
20
network/CPP_Cuda_new_preview/PyMultiApp.pyi
Normal file
20
network/CPP_Cuda_new_preview/PyMultiApp.pyi
Normal file
|
@ -0,0 +1,20 @@
|
|||
#
|
||||
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
|
||||
#
|
||||
|
||||
"""MultiApp Module"""
|
||||
from __future__ import annotations
|
||||
import PyMultiApp
|
||||
import typing
|
||||
|
||||
__all__ = [
|
||||
"MultiApp"
|
||||
]
|
||||
|
||||
|
||||
class MultiApp():
|
||||
def __init__(self) -> None: ...
|
||||
def gpu_occupancy_export(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int) -> None: ...
|
||||
def gpu_occupancy_import(self, arg0: int, arg1: int, arg2: int) -> None: ...
|
||||
def update_entrypoint(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: bool, arg10: int, arg11: int) -> bool: ...
|
||||
pass
|
19
network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.cpp
Normal file
19
network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.cpp
Normal file
|
@ -0,0 +1,19 @@
|
|||
|
||||
#include <pybind11/pybind11.h>
|
||||
|
||||
#include "SpikeGeneration2DManyIP.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
PYBIND11_MODULE(PySpikeGeneration2DManyIP, m)
|
||||
{
|
||||
m.doc() = "SpikeGeneration2DManyIP Module";
|
||||
py::class_<SpikeGeneration2DManyIP>(m, "SpikeGeneration2DManyIP")
|
||||
.def(py::init<>())
|
||||
.def("gpu_occupancy_export",
|
||||
&SpikeGeneration2DManyIP::gpu_occupancy_export)
|
||||
.def("gpu_occupancy_import",
|
||||
&SpikeGeneration2DManyIP::gpu_occupancy_import)
|
||||
.def("spike_generation",
|
||||
&SpikeGeneration2DManyIP::spike_generation_entrypoint);
|
||||
}
|
20
network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.pyi
Normal file
20
network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.pyi
Normal file
|
@ -0,0 +1,20 @@
|
|||
#
|
||||
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
|
||||
#
|
||||
|
||||
"""SpikeGeneration2DManyIP Module"""
|
||||
from __future__ import annotations
|
||||
import PySpikeGeneration2DManyIP
|
||||
import typing
|
||||
|
||||
__all__ = [
|
||||
"SpikeGeneration2DManyIP"
|
||||
]
|
||||
|
||||
|
||||
class SpikeGeneration2DManyIP():
|
||||
def __init__(self) -> None: ...
|
||||
def gpu_occupancy_export(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int) -> None: ...
|
||||
def gpu_occupancy_import(self, arg0: int, arg1: int, arg2: int) -> None: ...
|
||||
def spike_generation(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int) -> bool: ...
|
||||
pass
|
37
network/CPP_Cuda_new_preview/PyTestKernel.cpp
Normal file
37
network/CPP_Cuda_new_preview/PyTestKernel.cpp
Normal file
|
@ -0,0 +1,37 @@
|
|||
|
||||
#include <pybind11/pybind11.h>
|
||||
|
||||
#include "TestKernel.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
PYBIND11_MODULE(PyTestKernel, m) {
|
||||
m.doc() = "TestKernel Module";
|
||||
py::class_<TestKernel>(m, "TestKernel")
|
||||
.def(py::init<>())
|
||||
|
||||
.def("test_kernel_pxy_times_spike_selected_sxy",
|
||||
&TestKernel::test_kernel_pxy_times_spike_selected_sxy)
|
||||
|
||||
.def("test_kernel_phxy_fill_with_spike_selected_w",
|
||||
&TestKernel::test_kernel_phxy_fill_with_spike_selected_w)
|
||||
.def("test_kernel_phxy_plus_pxy", &TestKernel::test_kernel_phxy_plus_pxy)
|
||||
.def("test_kernel_phxy_fill_with_h",
|
||||
&TestKernel::test_kernel_phxy_fill_with_h)
|
||||
.def("test_kernel_phxy_times_pxy",
|
||||
&TestKernel::test_kernel_phxy_times_pxy)
|
||||
.def("test_kernel_phxy_one_over_sum_into_pxy",
|
||||
&TestKernel::test_kernel_phxy_one_over_sum_into_pxy)
|
||||
|
||||
.def("test_kernel_phxy_plus_phxy",
|
||||
&TestKernel::test_kernel_phxy_plus_phxy)
|
||||
.def("test_kernel_phxy_times_phxy_equals_phxy",
|
||||
&TestKernel::test_kernel_phxy_times_phxy_equals_phxy)
|
||||
|
||||
.def("test_kernel_pxy_time_pxy", &TestKernel::test_kernel_pxy_time_pxy)
|
||||
.def("test_kernel_pxy_reciprocal",
|
||||
&TestKernel::test_kernel_pxy_reciprocal)
|
||||
.def("test_kernel_pxy_plus_v", &TestKernel::test_kernel_pxy_plus_v)
|
||||
.def("test_kernel_pxy_times_v", &TestKernel::test_kernel_pxy_times_v)
|
||||
.def("test_kernel_pxy_set_to_v", &TestKernel::test_kernel_pxy_set_to_v);
|
||||
}
|
30
network/CPP_Cuda_new_preview/PyTestKernel.pyi
Normal file
30
network/CPP_Cuda_new_preview/PyTestKernel.pyi
Normal file
|
@ -0,0 +1,30 @@
|
|||
#
|
||||
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
|
||||
#
|
||||
|
||||
"""TestKernel Module"""
|
||||
from __future__ import annotations
|
||||
import PyTestKernel
|
||||
import typing
|
||||
|
||||
__all__ = [
|
||||
"TestKernel"
|
||||
]
|
||||
|
||||
|
||||
class TestKernel():
|
||||
def __init__(self) -> None: ...
|
||||
def test_kernel_phxy_fill_with_h(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int) -> None: ...
|
||||
def test_kernel_phxy_fill_with_spike_selected_w(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int) -> None: ...
|
||||
def test_kernel_phxy_one_over_sum_into_pxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int) -> None: ...
|
||||
def test_kernel_phxy_plus_phxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int) -> None: ...
|
||||
def test_kernel_phxy_plus_pxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int) -> None: ...
|
||||
def test_kernel_phxy_times_phxy_equals_phxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int) -> None: ...
|
||||
def test_kernel_phxy_times_pxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int) -> None: ...
|
||||
def test_kernel_pxy_plus_v(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: float, arg6: int) -> None: ...
|
||||
def test_kernel_pxy_reciprocal(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int) -> None: ...
|
||||
def test_kernel_pxy_set_to_v(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: float, arg6: int) -> None: ...
|
||||
def test_kernel_pxy_time_pxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int) -> None: ...
|
||||
def test_kernel_pxy_times_spike_selected_sxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int) -> None: ...
|
||||
def test_kernel_pxy_times_v(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: float, arg6: int) -> None: ...
|
||||
pass
|
261
network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.cu
Normal file
261
network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.cu
Normal file
|
@ -0,0 +1,261 @@
|
|||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "SpikeGeneration2DManyIP.h"
|
||||
#include "kernel_spike_generation.h"
|
||||
|
||||
SpikeGeneration2DManyIP::SpikeGeneration2DManyIP(){
|
||||
|
||||
};
|
||||
|
||||
SpikeGeneration2DManyIP::~SpikeGeneration2DManyIP(){
|
||||
|
||||
};
|
||||
|
||||
bool SpikeGeneration2DManyIP::spike_generation_entrypoint(
|
||||
int64_t input_pointer_addr, int64_t input_dim_0, int64_t input_dim_1,
|
||||
int64_t input_dim_2, int64_t input_dim_3,
|
||||
int64_t random_values_pointer_addr, int64_t random_values_dim_0,
|
||||
int64_t random_values_dim_1, int64_t random_values_dim_2,
|
||||
int64_t random_values_dim_3, int64_t output_pointer_addr,
|
||||
int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2,
|
||||
int64_t output_dim_3, int64_t number_of_cpu_processes) {
|
||||
float* input_pointer = (float*)input_pointer_addr;
|
||||
float* random_values_pointer = (float*)random_values_pointer_addr;
|
||||
int64_t* output_pointer = (int64_t*)output_pointer_addr;
|
||||
|
||||
// Input
|
||||
assert((input_pointer != nullptr));
|
||||
assert((input_dim_0 > 0));
|
||||
assert((input_dim_1 > 0));
|
||||
assert((input_dim_2 > 0));
|
||||
assert((input_dim_3 > 0));
|
||||
|
||||
// Random
|
||||
assert((random_values_pointer != nullptr));
|
||||
assert((random_values_dim_0 > 0));
|
||||
assert((random_values_dim_1 > 0));
|
||||
assert((random_values_dim_2 > 0));
|
||||
assert((random_values_dim_3 > 0));
|
||||
|
||||
// Output
|
||||
assert((output_pointer != nullptr));
|
||||
assert((output_dim_0 > 0));
|
||||
assert((output_dim_1 > 0));
|
||||
assert((output_dim_2 > 0));
|
||||
assert((output_dim_3 > 0));
|
||||
|
||||
// Input
|
||||
size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
|
||||
size_t input_dim_c1 = input_dim_2 * input_dim_3;
|
||||
size_t input_dim_c2 = input_dim_3;
|
||||
|
||||
// Random
|
||||
size_t random_values_dim_c0 =
|
||||
random_values_dim_1 * random_values_dim_2 * random_values_dim_3;
|
||||
size_t random_values_dim_c1 = random_values_dim_2 * random_values_dim_3;
|
||||
size_t random_values_dim_c2 = random_values_dim_3;
|
||||
|
||||
// Output
|
||||
size_t output_dim_c0 = output_dim_1 * output_dim_2 * output_dim_3;
|
||||
size_t output_dim_c1 = output_dim_2 * output_dim_3;
|
||||
size_t output_dim_c2 = output_dim_3;
|
||||
|
||||
size_t number_of_pattern = input_dim_0;
|
||||
size_t h_dim = input_dim_1;
|
||||
size_t spike_dim = output_dim_1;
|
||||
size_t x_dim = output_dim_2;
|
||||
size_t y_dim = output_dim_2;
|
||||
|
||||
if (number_of_cpu_processes > 0) {
|
||||
omp_set_num_threads(number_of_cpu_processes);
|
||||
// DEBUG:
|
||||
// omp_set_num_threads(1);
|
||||
|
||||
size_t pattern_id;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++) {
|
||||
spike_generation(
|
||||
input_pointer, input_dim_c0, input_dim_c1, input_dim_c2,
|
||||
random_values_pointer, random_values_dim_c0, random_values_dim_c1,
|
||||
random_values_dim_c2, output_pointer, output_dim_c0, output_dim_c1,
|
||||
output_dim_c2, x_dim, y_dim, spike_dim, h_dim, pattern_id);
|
||||
}
|
||||
} else {
|
||||
gpu_spike_generation(
|
||||
input_pointer, input_dim_c0, input_dim_c1, input_dim_c2,
|
||||
random_values_pointer, random_values_dim_c0, random_values_dim_c1,
|
||||
random_values_dim_c2, output_pointer, output_dim_c0, output_dim_c1,
|
||||
output_dim_c2, x_dim, y_dim, spike_dim, h_dim, number_of_pattern);
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
bool SpikeGeneration2DManyIP::spike_generation(
|
||||
float* input_pointer, size_t input_dim_c0, size_t input_dim_c1,
|
||||
size_t input_dim_c2, float* random_values_pointer,
|
||||
size_t random_values_dim_c0, size_t random_values_dim_c1,
|
||||
size_t random_values_dim_c2, int64_t* output_pointer, size_t output_dim_c0,
|
||||
size_t output_dim_c1, size_t output_dim_c2, size_t x_dim, size_t y_dim,
|
||||
size_t spike_dim, size_t h_dim, size_t pattern_id) {
|
||||
size_t counter;
|
||||
size_t counter_x = 0;
|
||||
size_t counter_y = 0;
|
||||
|
||||
float* p_ptr = nullptr;
|
||||
int64_t* out_ptr = nullptr;
|
||||
float* rand_ptr = nullptr;
|
||||
|
||||
for (counter_x = 0; counter_x < x_dim; counter_x++) {
|
||||
for (counter_y = 0; counter_y < y_dim; counter_y++) {
|
||||
p_ptr = input_pointer + pattern_id * input_dim_c0 +
|
||||
counter_x * input_dim_c2 + counter_y;
|
||||
// + counter * input_dim_c1
|
||||
|
||||
out_ptr = output_pointer + pattern_id * output_dim_c0 +
|
||||
counter_x * output_dim_c2 + counter_y;
|
||||
// + counter * output_dim_c1
|
||||
|
||||
rand_ptr = random_values_pointer + pattern_id * random_values_dim_c0 +
|
||||
counter_x * random_values_dim_c2 + counter_y;
|
||||
// + counter * random_values_dim_c1
|
||||
|
||||
for (counter = 0; counter < spike_dim; counter++) {
|
||||
out_ptr[counter * output_dim_c1] =
|
||||
lower_bound(p_ptr, h_dim, input_dim_c1,
|
||||
rand_ptr[counter * random_values_dim_c1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
// algorithmic idea stolen from libc++
|
||||
size_t SpikeGeneration2DManyIP::lower_bound(float* data_ptr, size_t data_length,
|
||||
size_t data_ptr_stride,
|
||||
float compare_to_value) {
|
||||
size_t start_of_range = 0;
|
||||
size_t length_of_range = data_length;
|
||||
|
||||
while (length_of_range != 0) {
|
||||
size_t half_length = length_of_range >> 1;
|
||||
size_t actual_position = start_of_range + half_length;
|
||||
|
||||
if (data_ptr[actual_position * data_ptr_stride] < compare_to_value) {
|
||||
start_of_range = ++actual_position;
|
||||
length_of_range -= half_length + 1;
|
||||
} else
|
||||
length_of_range = half_length;
|
||||
}
|
||||
return start_of_range;
|
||||
};
|
||||
|
||||
void SpikeGeneration2DManyIP::gpu_occupancy_measure(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t spike_dim) {
|
||||
grid_and_thread_calculated = false;
|
||||
assert((dim_x < 65535));
|
||||
assert((dim_y < 65535));
|
||||
|
||||
grid_and_thread_settings.resize(1);
|
||||
|
||||
occupancy_kernel_spike_generation(dim_x, dim_y, number_of_pattern, spike_dim,
|
||||
grid_and_thread_settings[0], display_debug);
|
||||
|
||||
grid_and_thread_calculated = true;
|
||||
return;
|
||||
};
|
||||
|
||||
void SpikeGeneration2DManyIP::gpu_occupancy_export(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t spike_dim,
|
||||
int64_t setting_memory_addr, size_t setting_dim_0, size_t setting_dim_1) {
|
||||
int64_t* setting_memory = (int64_t*)setting_memory_addr;
|
||||
|
||||
assert((setting_memory != nullptr));
|
||||
assert((setting_dim_1 == SPIKE_GENERATION_NUMBER_OF_KERNELS_PARAMETERS));
|
||||
|
||||
gpu_occupancy_measure(dim_x, dim_y, number_of_pattern, spike_dim);
|
||||
assert((grid_and_thread_calculated == true));
|
||||
assert(
|
||||
(grid_and_thread_settings.size() == SPIKE_GENERATION_NUMBER_OF_KERNELS));
|
||||
|
||||
assert((setting_dim_0 == grid_and_thread_settings.size()));
|
||||
|
||||
for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) {
|
||||
for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) {
|
||||
setting_memory[counter_0 * setting_dim_1 + counter_1] =
|
||||
grid_and_thread_settings[counter_0][counter_1];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void SpikeGeneration2DManyIP::gpu_occupancy_import(int64_t setting_memory_addr,
|
||||
size_t setting_dim_0,
|
||||
size_t setting_dim_1) {
|
||||
grid_and_thread_calculated = false;
|
||||
|
||||
int64_t* setting_memory = (int64_t*)setting_memory_addr;
|
||||
|
||||
assert((setting_memory != nullptr));
|
||||
assert((setting_dim_1 == SPIKE_GENERATION_NUMBER_OF_KERNELS_PARAMETERS));
|
||||
assert((setting_dim_0 == SPIKE_GENERATION_NUMBER_OF_KERNELS));
|
||||
|
||||
grid_and_thread_settings.resize(SPIKE_GENERATION_NUMBER_OF_KERNELS);
|
||||
|
||||
for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) {
|
||||
grid_and_thread_settings[counter_0].resize(
|
||||
SPIKE_GENERATION_NUMBER_OF_KERNELS_PARAMETERS);
|
||||
|
||||
for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) {
|
||||
grid_and_thread_settings[counter_0][counter_1] =
|
||||
setting_memory[counter_0 * setting_dim_1 + counter_1];
|
||||
}
|
||||
}
|
||||
|
||||
grid_and_thread_calculated = true;
|
||||
};
|
||||
|
||||
bool SpikeGeneration2DManyIP::gpu_spike_generation(
|
||||
float* input_pointer, size_t input_dim_c0, size_t input_dim_c1,
|
||||
size_t input_dim_c2, float* random_values_pointer,
|
||||
size_t random_values_dim_c0, size_t random_values_dim_c1,
|
||||
size_t random_values_dim_c2, int64_t* output_pointer, size_t output_dim_c0,
|
||||
size_t output_dim_c1, size_t output_dim_c2, size_t x_dim, size_t y_dim,
|
||||
size_t spike_dim, size_t h_dim, size_t number_of_pattern) {
|
||||
if (grid_and_thread_calculated == false) {
|
||||
gpu_occupancy_measure(x_dim, y_dim, number_of_pattern, spike_dim);
|
||||
}
|
||||
assert((grid_and_thread_calculated == true));
|
||||
|
||||
cudaError_t status;
|
||||
assert((x_dim < 65535));
|
||||
assert((y_dim < 65535));
|
||||
|
||||
size_t psxy_block_dim_c0 = spike_dim * x_dim * y_dim;
|
||||
size_t psxy_block_dim_c1 = x_dim * y_dim;
|
||||
size_t psxy_block_dim_c2 = y_dim;
|
||||
|
||||
kernel_spike_generation<<<
|
||||
dim3(grid_and_thread_settings[0][0], grid_and_thread_settings[0][1],
|
||||
grid_and_thread_settings[0][2]),
|
||||
dim3(grid_and_thread_settings[0][3], grid_and_thread_settings[0][4],
|
||||
grid_and_thread_settings[0][5])>>>(
|
||||
input_pointer, input_dim_c0, input_dim_c1, input_dim_c2,
|
||||
random_values_pointer, random_values_dim_c0, random_values_dim_c1,
|
||||
random_values_dim_c2, output_pointer, output_dim_c0, output_dim_c1,
|
||||
output_dim_c2, x_dim, y_dim, spike_dim, h_dim, psxy_block_dim_c0,
|
||||
psxy_block_dim_c1, psxy_block_dim_c2, grid_and_thread_settings[0][6]);
|
||||
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
return true;
|
||||
};
|
87
network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.h
Normal file
87
network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.h
Normal file
|
@ -0,0 +1,87 @@
|
|||
#ifndef SPIKEGENERATION2DMANYIP
|
||||
#define SPIKEGENERATION2DMANYIP
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
#include <cctype>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#define SPIKE_GENERATION_NUMBER_OF_KERNELS 1
|
||||
#define SPIKE_GENERATION_NUMBER_OF_KERNELS_PARAMETERS 7
|
||||
|
||||
class SpikeGeneration2DManyIP
|
||||
{
|
||||
public:
|
||||
SpikeGeneration2DManyIP();
|
||||
~SpikeGeneration2DManyIP();
|
||||
|
||||
bool spike_generation_entrypoint(
|
||||
int64_t input_pointer_addr, int64_t input_dim_0,
|
||||
int64_t input_dim_1, int64_t input_dim_2, int64_t input_dim_3,
|
||||
int64_t random_values_pointer_addr, int64_t random_values_dim_0,
|
||||
int64_t random_values_dim_1, int64_t random_values_dim_2,
|
||||
int64_t random_values_dim_3, int64_t output_pointer_addr,
|
||||
int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2,
|
||||
int64_t output_dim_3, int64_t number_of_cpu_processes);
|
||||
|
||||
bool spike_generation(
|
||||
float* input_pointer,
|
||||
size_t input_dim_c0,
|
||||
size_t input_dim_c1,
|
||||
size_t input_dim_c2,
|
||||
float* random_values_pointer,
|
||||
size_t random_values_dim_c0,
|
||||
size_t random_values_dim_c1,
|
||||
size_t random_values_dim_c2,
|
||||
int64_t* output_pointer,
|
||||
size_t output_dim_c0,
|
||||
size_t output_dim_c1,
|
||||
size_t output_dim_c2,
|
||||
size_t x_dim,
|
||||
size_t y_dim,
|
||||
size_t spike_dim,
|
||||
size_t h_dim,
|
||||
size_t pattern_id);
|
||||
|
||||
bool gpu_spike_generation(
|
||||
float* input_pointer,
|
||||
size_t input_dim_c0,
|
||||
size_t input_dim_c1,
|
||||
size_t input_dim_c2,
|
||||
float* random_values_pointer,
|
||||
size_t random_values_dim_c0,
|
||||
size_t random_values_dim_c1,
|
||||
size_t random_values_dim_c2,
|
||||
int64_t* output_pointer,
|
||||
size_t output_dim_c0,
|
||||
size_t output_dim_c1,
|
||||
size_t output_dim_c2,
|
||||
size_t x_dim,
|
||||
size_t y_dim,
|
||||
size_t spike_dim,
|
||||
size_t h_dim,
|
||||
size_t number_of_pattern);
|
||||
|
||||
void gpu_occupancy_export(size_t dim_x, size_t dim_y, size_t number_of_pattern,
|
||||
size_t spike_dim, int64_t setting_memory_addr, size_t setting_dim_0, size_t setting_dim_1);
|
||||
|
||||
void gpu_occupancy_import(
|
||||
int64_t setting_memory_addr,
|
||||
size_t setting_dim_0,
|
||||
size_t setting_dim_1);
|
||||
|
||||
private:
|
||||
size_t lower_bound(float* data_ptr, size_t data_length,
|
||||
size_t data_ptr_stride,
|
||||
float compare_to_value);
|
||||
|
||||
void gpu_occupancy_measure(size_t dim_x, size_t dim_y, size_t number_of_pattern,
|
||||
size_t spike_dim);
|
||||
|
||||
bool grid_and_thread_calculated = false;
|
||||
std::vector<std::vector<size_t>> grid_and_thread_settings;
|
||||
bool display_debug = false;
|
||||
};
|
||||
|
||||
#endif /* SPIKEGENERATION2DMANYIP */
|
329
network/CPP_Cuda_new_preview/TestKernel.cu
Normal file
329
network/CPP_Cuda_new_preview/TestKernel.cu
Normal file
|
@ -0,0 +1,329 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "TestKernel.h"
|
||||
#include "kernel_phxy_fill_with_h.h"
|
||||
#include "kernel_phxy_fill_with_spike_selected_w.h"
|
||||
#include "kernel_phxy_one_over_sum_into_pxy.h"
|
||||
#include "kernel_phxy_plus_phxy.h"
|
||||
#include "kernel_phxy_plus_pxy.h"
|
||||
#include "kernel_phxy_times_phxy_equals_phxy.h"
|
||||
#include "kernel_phxy_times_pxy.h"
|
||||
#include "kernel_pxy_plus_v.h"
|
||||
#include "kernel_pxy_reciprocal.h"
|
||||
#include "kernel_pxy_set_to_v.h"
|
||||
#include "kernel_pxy_time_pxy.h"
|
||||
#include "kernel_pxy_times_spike_selected_sxy.h"
|
||||
#include "kernel_pxy_times_v.h"
|
||||
|
||||
TestKernel::TestKernel(){
|
||||
|
||||
};
|
||||
|
||||
TestKernel::~TestKernel(){
|
||||
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_pxy_times_spike_selected_sxy(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, int64_t pxy_memory_addr, int64_t sxy_memory_addr,
|
||||
int64_t spike_memory_addr, size_t spike_time, size_t spike_dim_c0,
|
||||
size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0,
|
||||
size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1) {
|
||||
float* pxy_memory = (float*)pxy_memory_addr;
|
||||
float* sxy_memory = (float*)sxy_memory_addr;
|
||||
int64_t* spike_memory = (int64_t*)spike_memory_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_pxy_times_spike_selected_sxy(dim_x, dim_y, number_of_pattern,
|
||||
h_dim, setting, display_debug);
|
||||
|
||||
size_t pxy_block_dim_c0 = dim_x * dim_y;
|
||||
size_t pxy_block_dim_c1 = dim_y;
|
||||
|
||||
kernel_pxy_times_spike_selected_sxy<<<
|
||||
dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4], setting[5])>>>(
|
||||
pxy_memory, sxy_memory, spike_memory, spike_time, spike_dim_c0,
|
||||
spike_dim_c1, spike_dim_c2, pxy_dim_c0, pxy_dim_c1, sxy_dim_c0,
|
||||
sxy_dim_c1, pxy_block_dim_c0, pxy_block_dim_c1, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim, bool display_debug,
|
||||
int64_t phxy_memory_a_addr,
|
||||
int64_t phxy_memory_b_addr) {
|
||||
float* phxy_memory_a = (float*)phxy_memory_a_addr;
|
||||
float* phxy_memory_b = (float*)phxy_memory_b_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_phxy_plus_phxy(dim_x, dim_y, number_of_pattern, h_dim,
|
||||
setting, display_debug);
|
||||
|
||||
kernel_phxy_plus_phxy<<<dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4], setting[5])>>>(
|
||||
phxy_memory_a, phxy_memory_b, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_pxy_times_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, float value,
|
||||
int64_t pxy_memory_addr) {
|
||||
float* pxy_memory = (float*)pxy_memory_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_pxy_times_v(dim_x, dim_y, number_of_pattern, h_dim, setting,
|
||||
display_debug);
|
||||
|
||||
kernel_pxy_times_v<<<dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4], setting[5])>>>(
|
||||
pxy_memory, value, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_phxy_fill_with_spike_selected_w(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, size_t spike_time, size_t weights_dim_c0,
|
||||
size_t spike_dim_c0, size_t spike_dim_c1, size_t spike_dim_c2,
|
||||
size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2,
|
||||
int64_t phxy_memory_addr, int64_t weight_memory_addr,
|
||||
int64_t spike_memory_addr) {
|
||||
float* phxy_memory = (float*)phxy_memory_addr;
|
||||
float* weight_memory = (float*)weight_memory_addr;
|
||||
int64_t* spike_memory = (int64_t*)spike_memory_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_phxy_fill_with_spike_selected_w(
|
||||
dim_x, dim_y, number_of_pattern, h_dim, setting, display_debug);
|
||||
|
||||
size_t phxy_block_dim_c0 = h_dim * dim_x * dim_y;
|
||||
size_t phxy_block_dim_c1 = dim_x * dim_y;
|
||||
size_t phxy_block_dim_c2 = dim_y;
|
||||
|
||||
kernel_phxy_fill_with_spike_selected_w<<<
|
||||
dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4], setting[5])>>>(
|
||||
phxy_memory, weight_memory, spike_memory, spike_time, weights_dim_c0,
|
||||
spike_dim_c0, spike_dim_c1, spike_dim_c2, phxy_dim_c0, phxy_dim_c1,
|
||||
phxy_dim_c2, h_dim, phxy_block_dim_c0, phxy_block_dim_c1,
|
||||
phxy_block_dim_c2, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_phxy_times_phxy_equals_phxy(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, int64_t phxy_memory_a_addr, int64_t phxy_memory_b_addr,
|
||||
int64_t phxy_memory_out_addr) {
|
||||
float* phxy_memory_a = (float*)phxy_memory_a_addr;
|
||||
float* phxy_memory_b = (float*)phxy_memory_b_addr;
|
||||
float* phxy_memory_out = (float*)phxy_memory_out_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_phxy_times_phxy_equals_phxy(dim_x, dim_y, number_of_pattern,
|
||||
h_dim, setting, display_debug);
|
||||
|
||||
kernel_phxy_times_phxy_equals_phxy<<<dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4],
|
||||
setting[5])>>>(
|
||||
phxy_memory_a, phxy_memory_b, phxy_memory_out, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_pxy_plus_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, float value,
|
||||
int64_t pxy_memory_addr) {
|
||||
float* pxy_memory = (float*)pxy_memory_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_pxy_plus_v(dim_x, dim_y, number_of_pattern, h_dim, setting,
|
||||
display_debug);
|
||||
|
||||
kernel_pxy_plus_v<<<dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4], setting[5])>>>(
|
||||
pxy_memory, value, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim, bool display_debug,
|
||||
int64_t pxy_memory_a_addr,
|
||||
int64_t pxy_memory_b_addr) {
|
||||
float* pxy_memory_a = (float*)pxy_memory_a_addr;
|
||||
float* pxy_memory_b = (float*)pxy_memory_b_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_pxy_time_pxy(dim_x, dim_y, number_of_pattern, h_dim, setting,
|
||||
display_debug);
|
||||
|
||||
kernel_pxy_time_pxy<<<dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4], setting[5])>>>(
|
||||
pxy_memory_a, pxy_memory_b, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_phxy_plus_pxy(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t pxy_dim_c0, size_t pxy_dim_c1,
|
||||
int64_t phxy_memory_addr, int64_t pxy_memory_addr) {
|
||||
float* phxy_memory = (float*)phxy_memory_addr;
|
||||
float* pxy_memory = (float*)pxy_memory_addr;
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_phxy_plus_pxy(dim_x, dim_y, number_of_pattern, h_dim,
|
||||
setting, display_debug);
|
||||
|
||||
size_t phxy_block_dim_c0 = h_dim * dim_x * dim_y;
|
||||
size_t phxy_block_dim_c1 = dim_x * dim_y;
|
||||
size_t phxy_block_dim_c2 = dim_y;
|
||||
|
||||
kernel_phxy_plus_pxy<<<dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4], setting[5])>>>(
|
||||
phxy_memory, pxy_memory, phxy_dim_c0, phxy_dim_c1, phxy_dim_c2, h_dim,
|
||||
pxy_dim_c0, pxy_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1,
|
||||
phxy_block_dim_c2, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_phxy_one_over_sum_into_pxy(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t pxy_dim_c0, size_t pxy_dim_c1,
|
||||
int64_t phxy_memory_addr, int64_t pxy_memory_addr) {
|
||||
float* phxy_memory = (float*)phxy_memory_addr;
|
||||
float* pxy_memory = (float*)pxy_memory_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_phxy_one_over_sum_into_pxy(dim_x, dim_y, number_of_pattern,
|
||||
h_dim, setting, display_debug);
|
||||
size_t pxy_block_dim_c0 = dim_x * dim_y;
|
||||
size_t pxy_block_dim_c1 = dim_y;
|
||||
|
||||
kernel_phxy_one_over_sum_into_pxy<<<dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4],
|
||||
setting[5])>>>(
|
||||
phxy_memory, pxy_memory, phxy_dim_c0, phxy_dim_c1, phxy_dim_c2, h_dim,
|
||||
pxy_dim_c0, pxy_dim_c1, pxy_block_dim_c0, pxy_block_dim_c1, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim, bool display_debug,
|
||||
int64_t pxy_memory_addr) {
|
||||
float* pxy_memory = (float*)pxy_memory_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_pxy_reciprocal(dim_x, dim_y, number_of_pattern, h_dim,
|
||||
setting, display_debug);
|
||||
kernel_pxy_reciprocal<<<dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4], setting[5])>>>(
|
||||
pxy_memory, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_phxy_fill_with_h(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, int64_t h_memory_addr, int64_t phxy_memory_addr) {
|
||||
float* h_memory = (float*)h_memory_addr;
|
||||
float* phxy_memory = (float*)phxy_memory_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_phxy_fill_with_h(dim_x, dim_y, number_of_pattern, h_dim,
|
||||
setting, display_debug);
|
||||
|
||||
size_t phxy_block_dim_c0 = h_dim * dim_x * dim_y;
|
||||
size_t phxy_block_dim_c1 = dim_x * dim_y;
|
||||
size_t phxy_block_dim_c2 = dim_y;
|
||||
|
||||
kernel_phxy_fill_with_h<<<dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4], setting[5])>>>(
|
||||
h_memory, phxy_memory, phxy_dim_c0, phxy_dim_c1, phxy_dim_c2, h_dim,
|
||||
phxy_block_dim_c0, phxy_block_dim_c1, phxy_block_dim_c2, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_phxy_times_pxy(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t pxy_dim_c0, size_t pxy_dim_c1,
|
||||
int64_t phxy_memory_addr, int64_t pxy_memory_addr) {
|
||||
float* phxy_memory = (float*)phxy_memory_addr;
|
||||
float* pxy_memory = (float*)pxy_memory_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_phxy_times_pxy(dim_x, dim_y, number_of_pattern, h_dim,
|
||||
setting, display_debug);
|
||||
|
||||
size_t phxy_block_dim_c0 = h_dim * dim_x * dim_y;
|
||||
size_t phxy_block_dim_c1 = dim_x * dim_y;
|
||||
size_t phxy_block_dim_c2 = dim_y;
|
||||
|
||||
kernel_phxy_times_pxy<<<dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4], setting[5])>>>(
|
||||
phxy_memory, pxy_memory, phxy_dim_c0, phxy_dim_c1, phxy_dim_c2, h_dim,
|
||||
pxy_dim_c0, pxy_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1,
|
||||
phxy_block_dim_c2, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
||||
|
||||
void TestKernel::test_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim, bool display_debug,
|
||||
float set_value,
|
||||
int64_t pxy_memory_addr) {
|
||||
float* pxy_memory = (float*)pxy_memory_addr;
|
||||
|
||||
std::vector<size_t> setting;
|
||||
occupancy_kernel_pxy_set_to_v(dim_x, dim_y, number_of_pattern, h_dim, setting,
|
||||
display_debug);
|
||||
kernel_pxy_set_to_v<<<dim3(setting[0], setting[1], setting[2]),
|
||||
dim3(setting[3], setting[4], setting[5])>>>(
|
||||
pxy_memory, set_value, setting[6]);
|
||||
|
||||
cudaError_t status;
|
||||
status = cudaDeviceSynchronize();
|
||||
assert((status == cudaSuccess));
|
||||
};
|
89
network/CPP_Cuda_new_preview/TestKernel.h
Normal file
89
network/CPP_Cuda_new_preview/TestKernel.h
Normal file
|
@ -0,0 +1,89 @@
|
|||
class TestKernel {
|
||||
public:
|
||||
TestKernel();
|
||||
~TestKernel();
|
||||
|
||||
void test_kernel_pxy_times_spike_selected_sxy(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, int64_t pxy_memory_addr, int64_t sxy_memory_addr,
|
||||
int64_t spike_memory_addr, size_t spike_time, size_t spike_dim_c0,
|
||||
size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0,
|
||||
size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1);
|
||||
|
||||
// --- phxy
|
||||
|
||||
void test_kernel_phxy_fill_with_spike_selected_w(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, size_t spike_time, size_t weights_dim_c0,
|
||||
size_t spike_dim_c0, size_t spike_dim_c1, size_t spike_dim_c2,
|
||||
size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2,
|
||||
int64_t phxy_memory_addr, int64_t weight_memory_addr,
|
||||
int64_t spike_memory_addr);
|
||||
|
||||
void test_kernel_phxy_one_over_sum_into_pxy(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t pxy_dim_c0, size_t pxy_dim_c1,
|
||||
int64_t phxy_memory_addr, int64_t pxy_memory_addr);
|
||||
|
||||
void test_kernel_phxy_fill_with_h(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, size_t phxy_dim_c0,
|
||||
size_t phxy_dim_c1, size_t phxy_dim_c2,
|
||||
int64_t h_memory_addr,
|
||||
int64_t phxy_memory_addr);
|
||||
|
||||
void test_kernel_phxy_plus_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, size_t phxy_dim_c0,
|
||||
size_t phxy_dim_c1, size_t phxy_dim_c2,
|
||||
size_t pxy_dim_c0, size_t pxy_dim_c1,
|
||||
int64_t phxy_memory_addr,
|
||||
int64_t pxy_memory_addr);
|
||||
|
||||
void test_kernel_phxy_times_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, size_t phxy_dim_c0,
|
||||
size_t phxy_dim_c1, size_t phxy_dim_c2,
|
||||
size_t pxy_dim_c0, size_t pxy_dim_c1,
|
||||
int64_t phxy_memory_addr,
|
||||
int64_t pxy_memory_addr);
|
||||
|
||||
void test_kernel_phxy_times_phxy_equals_phxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim, bool display_debug,
|
||||
int64_t phxy_memory_a_addr,
|
||||
int64_t phxy_memory_b_addr,
|
||||
int64_t phxy_memory_out_addr);
|
||||
|
||||
void test_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug,
|
||||
int64_t phxy_memory_a_addr,
|
||||
int64_t phxy_memory_b_addr);
|
||||
|
||||
// --- pxy
|
||||
void test_kernel_pxy_plus_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, float value,
|
||||
int64_t pxy_memory_addr);
|
||||
|
||||
void test_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, int64_t pxy_memory_a_addr,
|
||||
int64_t pxy_memory_b_addr);
|
||||
|
||||
void test_kernel_pxy_times_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, float value,
|
||||
int64_t pxy_memory_addr);
|
||||
|
||||
void test_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, int64_t pxy_memory_addr);
|
||||
|
||||
void test_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
bool display_debug, float value,
|
||||
int64_t pxy_memory_addr);
|
||||
};
|
|
@@ -0,0 +1,165 @@
#include <unistd.h>

#include <bitset>
#include <cassert>
#include <cctype>

#include "approximation_multiplication_function.h"
#include "error_term.h"

// Best way to plot the bits
// std::cout << std::bitset<32>(ap_y_ptr[1]) << "\n";

// The result needs to be written back into h_pointer (which contains h)
// Don't write to w_pointer.
void approximation_multiplication_function(
    float* h_pointer, float* w_pointer, int64_t pattern_length,
    uint64_t number_of_trunc_bits, uint64_t number_of_frac_bits,
    uint32_t* ap_x_ptr, uint32_t* ap_y_ptr, uint32_t* ap_x_exponent_ptr,
    uint32_t* ap_y_exponent_ptr, uint32_t* ap_h_exponent_ptr, uint32_t ap_mask,
    uint64_t* ap_res_ptr, uint32_t* sign_temp_ptr, bool approximation_enable)
{
  uint64_t counter;

  uint32_t* w_pointer_mod = (uint32_t*)w_pointer;
  uint32_t* h_pointer_mod = (uint32_t*)h_pointer;

  // Calculate the new sign
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    sign_temp_ptr[counter] = (w_pointer_mod[counter] & 0x80000000) ^
                             (h_pointer_mod[counter] & 0x80000000);
  }

  // Extract the exponent
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    ap_x_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
  }
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    ap_y_exponent_ptr[counter] = (w_pointer_mod[counter] << 1) >> 24;
  }

  // Cast and "normalize"
  uint64_t shift_value = 32 - number_of_frac_bits;
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    ap_x_ptr[counter] =
        ((h_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    ap_y_ptr[counter] =
        ((w_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
  }

  // Make the zero -g-r-e-a-t- correct again
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    if (h_pointer[counter] == 0)
    {
      ap_x_ptr[counter] = 0;
    }
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    if (w_pointer[counter] == 0)
    {
      ap_y_ptr[counter] = 0;
    }
  }

  // res = x*y
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    ap_res_ptr[counter] = static_cast<uint64_t>(ap_x_ptr[counter]) *
                          static_cast<uint64_t>(ap_y_ptr[counter]);
  }

  uint32_t temp;
  if (approximation_enable == true)
  {
    // Go through the vector values
    for (counter = 0; counter < pattern_length; counter++)
    {
      temp = error_term(ap_y_ptr[counter], ap_x_ptr[counter], ap_mask,
                        number_of_trunc_bits);
      if (temp > ap_res_ptr[counter])
      {
        ap_res_ptr[counter] = 0;
      }
      else
      {
        ap_res_ptr[counter] -= temp;
      }
    }
  }

  // Cast from int to float
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    h_pointer[counter] = static_cast<float>(ap_res_ptr[counter]);
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    ap_h_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
  }

  // Divide by 2^number_of_frac_bits
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    ap_h_exponent_ptr[counter] -= 2 * number_of_frac_bits;
  }

#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    temp = ap_x_exponent_ptr[counter] + ap_y_exponent_ptr[counter] +
           ap_h_exponent_ptr[counter];
    if (temp > 252)
    {
      ap_h_exponent_ptr[counter] = temp - 252;
    }
    else
    {
      // Here I try to catch the case that the new exponent is too small
      ap_h_exponent_ptr[counter] = 0;
    }
  }

  // Remove the old exponent
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    h_pointer_mod[counter] = (h_pointer_mod[counter] << 9) >> 9;
  }

  // Install the new exponent
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    h_pointer_mod[counter] += ap_h_exponent_ptr[counter] << 23;
  }

  // Add the sign back
#pragma omp simd
  for (counter = 0; counter < pattern_length; counter++)
  {
    h_pointer_mod[counter] += sign_temp_ptr[counter];
  }

  return;
}
@@ -0,0 +1,11 @@
#ifndef APPROXIMATION_MULTIPLICATION_FUNCTION
#define APPROXIMATION_MULTIPLICATION_FUNCTION

#include <cstdint>

void approximation_multiplication_function(
    float* h_pointer, float* w_pointer, int64_t pattern_length,
    uint64_t number_of_trunc_bits, uint64_t number_of_frac_bits,
    uint32_t* ap_x_ptr, uint32_t* ap_y_ptr, uint32_t* ap_x_exponent_ptr,
    uint32_t* ap_y_exponent_ptr, uint32_t* ap_h_exponent_ptr, uint32_t ap_mask,
    uint64_t* ap_res_ptr, uint32_t* sign_temp_ptr, bool approximation_enable);

#endif /* APPROXIMATION_MULTIPLICATION_FUNCTION */
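// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the repository): a scalar, CPU-only walk
// through of the mantissa/exponent trick used by the vectorized function
// above, for one pair of IEEE-754 floats. It assumes number_of_frac_bits = 23
// and skips the error-term truncation and the zero special case, so it only
// shows the "exact" path of the approximation.
// -----------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  float h = 0.75f;
  float w = 1.5f;
  uint64_t number_of_frac_bits = 23;  // assumed setting for this sketch

  uint32_t h_bits;
  uint32_t w_bits;
  std::memcpy(&h_bits, &h, sizeof(h_bits));
  std::memcpy(&w_bits, &w, sizeof(w_bits));

  // Sign and biased exponent, exactly as in the vectorized loops above.
  uint32_t sign = (h_bits & 0x80000000) ^ (w_bits & 0x80000000);
  uint32_t h_exp = (h_bits << 1) >> 24;
  uint32_t w_exp = (w_bits << 1) >> 24;

  // Mantissa with the implicit leading one restored, reduced to frac bits.
  uint64_t shift_value = 32 - number_of_frac_bits;
  uint32_t h_man = ((h_bits << 8) | 0x80000000) >> shift_value;
  uint32_t w_man = ((w_bits << 8) | 0x80000000) >> shift_value;

  // Integer product of the two mantissas.
  uint64_t res = static_cast<uint64_t>(h_man) * static_cast<uint64_t>(w_man);

  // Rebuild a float: take its exponent, undo the 2^(2*frac_bits) scaling and
  // combine it with the two input exponents minus the double bias of 252.
  float out = static_cast<float>(res);
  uint32_t out_bits;
  std::memcpy(&out_bits, &out, sizeof(out_bits));
  uint32_t out_exp = ((out_bits << 1) >> 24) - 2 * number_of_frac_bits;
  uint32_t temp = h_exp + w_exp + out_exp;
  out_exp = (temp > 252) ? (temp - 252) : 0;
  out_bits = ((out_bits << 9) >> 9) | (out_exp << 23) | sign;
  std::memcpy(&out, &out_bits, sizeof(out));

  std::printf("%f * %f ~= %f\n", h, w, out);  // expected ~1.125
  return 0;
}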
34
network/CPP_Cuda_new_preview/error_term.cpp
Normal file
@@ -0,0 +1,34 @@
#include <unistd.h>

#include <cassert>
#include <cctype>
#include <iostream>

#include "error_term.h"

uint32_t error_term(uint32_t a, uint32_t b, uint32_t ap_mask,
                    uint32_t number_of_trunc_bits)
{
  uint32_t error_value = 0;

  uint32_t temp_shift_a = a;
  uint32_t temp_shift_b = b & ap_mask;

  uint32_t counter_trunc;
  uint32_t temp;

  // Go through the bits
  for (counter_trunc = 0; counter_trunc < number_of_trunc_bits;
       counter_trunc++)
  {
    temp = temp_shift_a & 1;
    if (temp == 1)
    {
      error_value += temp_shift_b & ap_mask;
    }
    temp_shift_a >>= 1;
    temp_shift_b <<= 1;
  }

  return error_value;
};
7
network/CPP_Cuda_new_preview/error_term.h
Normal file
@@ -0,0 +1,7 @@
#ifndef ERROR_TERM
#define ERROR_TERM

#include <cstdint>

uint32_t error_term(uint32_t a, uint32_t b, uint32_t ap_mask,
                    uint32_t number_of_trunc_bits);

#endif /* ERROR_TERM */
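// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the repository): what error_term() adds up.
// For every set bit among the lowest number_of_trunc_bits bits of a, it
// accumulates the correspondingly shifted low bits of b (masked by ap_mask),
// i.e. an estimate of the partial products a truncated multiplier would drop.
// The inputs below are arbitrary example values; ap_mask is built the same way
// as elsewhere in this code (2^number_of_trunc_bits - 1).
// -----------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>

static uint32_t error_term_reference(uint32_t a, uint32_t b, uint32_t ap_mask,
                                     uint32_t number_of_trunc_bits) {
  uint32_t error_value = 0;
  uint32_t shifted_b = b & ap_mask;
  for (uint32_t bit = 0; bit < number_of_trunc_bits; bit++) {
    if ((a >> bit) & 1) {
      error_value += shifted_b & ap_mask;
    }
    shifted_b <<= 1;
  }
  return error_value;
}

int main() {
  uint32_t number_of_trunc_bits = 4;
  uint32_t ap_mask = (1u << number_of_trunc_bits) - 1;  // 0b1111
  uint32_t a = 0b1011;  // example mantissa fragments
  uint32_t b = 0b0110;
  std::printf("error term = %u\n",
              error_term_reference(a, b, ap_mask, number_of_trunc_bits));
  return 0;
}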
@@ -0,0 +1,31 @@
#include <cassert>
#include <iostream>

__device__ uint32_t gpu_error_term(uint32_t ap_weight_mantissa,
                                   uint32_t ap_input_mantissa,
                                   uint32_t ap_mask,
                                   uint32_t number_of_trunc_bits)
{
  uint32_t error_value = 0;

  uint32_t temp_shift_a = ap_weight_mantissa;
  uint32_t temp_shift_b = ap_input_mantissa & ap_mask;

  uint32_t counter_trunc;
  uint32_t temp;

  // Go through the bits
  for (counter_trunc = 0; counter_trunc < number_of_trunc_bits;
       counter_trunc++)
  {
    temp = temp_shift_a & 1;
    if (temp == 1)
    {
      error_value += temp_shift_b & ap_mask;
    }
    temp_shift_a >>= 1;
    temp_shift_b <<= 1;
  }

  return error_value;
}
@ -0,0 +1,219 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_approximation_error_term.cu"
|
||||
#include "kernel_approximation_multiplication.h"
|
||||
#include "kernel_helper_functions.h"
|
||||
|
||||
// Includes accumulation too...
|
||||
__global__ void kernel_approximation_multiplication(
|
||||
float* __restrict__ input_pointer, float* __restrict__ weight_pointer,
|
||||
float* __restrict__ output_pointer, uint64_t pattern_dim,
|
||||
uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
|
||||
uint64_t input_channel_dim, size_t max_threadable_tasks,
|
||||
uint64_t input_index_scale, uint64_t number_of_frac_bits,
|
||||
bool approximation_enable, uint64_t number_of_trunc_bits, uint32_t ap_mask,
|
||||
size_t block_dim_c0, size_t block_dim_c1, size_t block_dim_c2) {
|
||||
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_threadable_tasks) {
|
||||
size_t pattern_id = idx / block_dim_c0;
|
||||
idx -= pattern_id * block_dim_c0;
|
||||
size_t feature_id = idx / block_dim_c1;
|
||||
idx -= feature_id * block_dim_c1;
|
||||
size_t position_x = idx / block_dim_c2;
|
||||
idx -= position_x * block_dim_c2;
|
||||
size_t position_y = idx;
|
||||
|
||||
float* weight_pointer_sub = weight_pointer + feature_id * input_channel_dim;
|
||||
float* input_pointer_sub = input_pointer +
|
||||
pattern_id * input_channel_dim * x_dim * y_dim +
|
||||
position_x * y_dim + position_y;
|
||||
float* output_pointer_sub =
|
||||
output_pointer + pattern_id * feature_dim * x_dim * y_dim +
|
||||
feature_id * x_dim * y_dim + position_x * y_dim + position_y;
|
||||
*output_pointer_sub = 0.0;
|
||||
|
||||
size_t counter;
|
||||
for (counter = 0; counter < input_channel_dim; counter++) {
|
||||
*output_pointer_sub += gpu_approximation_multiplication_function(
|
||||
weight_pointer_sub[counter],
|
||||
input_pointer_sub[counter * input_index_scale], number_of_frac_bits,
|
||||
approximation_enable, number_of_trunc_bits, ap_mask);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Only x = a*b
|
||||
__global__ void kernel_approximation_pure_multiplication(
|
||||
float* __restrict__ phxy_memory_a, float* __restrict__ phxy_memory_b,
|
||||
float* __restrict__ phxy_memory_out, uint64_t number_of_frac_bits,
|
||||
bool approximation_enable, uint64_t number_of_trunc_bits, uint32_t ap_mask,
|
||||
size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
phxy_memory_out[idx] = gpu_approximation_multiplication_function(
|
||||
phxy_memory_a[idx], phxy_memory_b[idx], number_of_frac_bits,
|
||||
approximation_enable, number_of_trunc_bits, ap_mask);
|
||||
}
|
||||
};
|
||||
|
||||
__device__ float gpu_approximation_multiplication_function(
|
||||
float weight, float input, uint64_t number_of_frac_bits,
|
||||
bool approximation_enable, uint64_t number_of_trunc_bits,
|
||||
uint32_t ap_mask) {
|
||||
float weight_copy = weight;
|
||||
float input_copy = input;
|
||||
|
||||
uint32_t* weight_pointer_mod = (uint32_t*)&weight_copy;
|
||||
uint32_t* input_pointer_mod = (uint32_t*)&input_copy;
|
||||
|
||||
// Calculate the new sign
|
||||
uint32_t sign_temp =
|
||||
(*weight_pointer_mod & 0x80000000) ^ (*input_pointer_mod & 0x80000000);
|
||||
|
||||
// Extract the exponent
|
||||
uint32_t ap_input_exponent = (*input_pointer_mod << 1) >> 24;
|
||||
uint32_t ap_weight_exponent = (*weight_pointer_mod << 1) >> 24;
|
||||
|
||||
// Cast and "normalize"
|
||||
uint64_t shift_value = 32 - number_of_frac_bits;
|
||||
|
||||
uint32_t ap_input_mantissa =
|
||||
((*input_pointer_mod << 8) | 0x80000000) >> shift_value;
|
||||
|
||||
uint32_t ap_weight_mantissa =
|
||||
((*weight_pointer_mod << 8) | 0x80000000) >> shift_value;
|
||||
|
||||
// Make the zero -g-r-e-a-t- correct again
|
||||
if (input == 0) {
|
||||
ap_input_mantissa = 0;
|
||||
}
|
||||
|
||||
if (weight == 0) {
|
||||
ap_weight_mantissa = 0;
|
||||
}
|
||||
|
||||
// res = x*y
|
||||
uint64_t ap_result = static_cast<uint64_t>(ap_input_mantissa) *
|
||||
static_cast<uint64_t>(ap_weight_mantissa);
|
||||
|
||||
uint32_t temp;
|
||||
// --------------------------------------------
|
||||
// Approx
|
||||
// --------------------------------------------
|
||||
|
||||
if (approximation_enable == true) {
|
||||
// Go through the vector values
|
||||
temp = gpu_error_term(ap_weight_mantissa, ap_input_mantissa, ap_mask,
|
||||
number_of_trunc_bits);
|
||||
if (temp > ap_result) {
|
||||
ap_result = 0;
|
||||
} else {
|
||||
ap_result -= temp;
|
||||
}
|
||||
}
|
||||
|
||||
// Cast from int to float
|
||||
float output = static_cast<float>(ap_result);
|
||||
if (ap_result == 0) {
|
||||
output = 0.0;
|
||||
} else {
|
||||
uint32_t* output_pointer_mod = (uint32_t*)&output;
|
||||
|
||||
uint32_t ap_output_exponent = (*output_pointer_mod << 1) >> 24;
|
||||
ap_output_exponent -= 2 * number_of_frac_bits;
|
||||
temp = ap_input_exponent + ap_weight_exponent + ap_output_exponent;
|
||||
if (temp > 252) {
|
||||
ap_output_exponent = temp - 252;
|
||||
} else {
|
||||
// Here I try to catch the case that the new exponent is too small
|
||||
ap_output_exponent = 0;
|
||||
}
|
||||
|
||||
// Remove the old exponent
|
||||
*output_pointer_mod = (*output_pointer_mod << 9) >> 9;
|
||||
|
||||
// Install the new exponent
|
||||
*output_pointer_mod += ap_output_exponent << 23;
|
||||
|
||||
// Add the sign back
|
||||
*output_pointer_mod += sign_temp;
|
||||
}
|
||||
return output;
|
||||
};
|
||||
|
||||
void occupancy_kernel_approximation_multiplication(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size,
|
||||
(void*)kernel_approximation_multiplication, 0, max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_approximation_multiplication:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
void occupancy_kernel_approximation_pure_multiplication(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output, bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size,
|
||||
(void*)kernel_approximation_pure_multiplication, 0, max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_approximation_multiplication:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
|
@@ -0,0 +1,35 @@
#ifndef KERNEL_APPROXIMATION_MULTIPLICATION
#define KERNEL_APPROXIMATION_MULTIPLICATION

#include <vector>

__global__ void kernel_approximation_multiplication(
    float* __restrict__ input_pointer, float* __restrict__ weight_pointer,
    float* __restrict__ output_pointer, uint64_t pattern_dim,
    uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
    uint64_t input_channel_dim, size_t max_threadable_tasks,
    uint64_t input_index_scale, uint64_t number_of_frac_bits,
    bool approximation_enable, uint64_t number_of_trunc_bits, uint32_t ap_mask,
    size_t block_dim_c0, size_t block_dim_c1, size_t block_dim_c2);

__global__ void kernel_approximation_pure_multiplication(
    float* __restrict__ phxy_memory_a, float* __restrict__ phxy_memory_b,
    float* __restrict__ phxy_memory_out, uint64_t number_of_frac_bits,
    bool approximation_enable, uint64_t number_of_trunc_bits, uint32_t ap_mask,
    size_t max_idx);

__device__ float gpu_approximation_multiplication_function(
    float weight, float input, uint64_t number_of_frac_bits,
    bool approximation_enable, uint64_t number_of_trunc_bits, uint32_t ap_mask);

void occupancy_kernel_approximation_multiplication(size_t dim_x, size_t dim_y,
                                                   size_t number_of_pattern,
                                                   size_t h_dim,
                                                   std::vector<size_t>& output,
                                                   bool display_debug);

void occupancy_kernel_approximation_pure_multiplication(
    size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
    std::vector<size_t>& output, bool display_debug);

#endif /* KERNEL_APPROXIMATION_MULTIPLICATION */
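// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the repository): one plausible way the
// seven-element occupancy vector could be consumed on the host to launch
// kernel_approximation_pure_multiplication. The unpacking into grid/block
// dimensions and max_idx follows the labels printed by kernel_debug_plot; the
// function name and the device buffer arguments below are made up for the
// example.
// -----------------------------------------------------------------------------
#include <cstdint>
#include <vector>

#include <cuda_runtime.h>

#include "kernel_approximation_multiplication.h"

void example_launch_pure_multiplication(
    float* gpu_a, float* gpu_b, float* gpu_out, size_t number_of_pattern,
    size_t h_dim, size_t dim_x, size_t dim_y, uint64_t number_of_frac_bits,
    bool approximation_enable, uint64_t number_of_trunc_bits,
    uint32_t ap_mask) {
  std::vector<size_t> occupancy;
  occupancy_kernel_approximation_pure_multiplication(
      dim_x, dim_y, number_of_pattern, h_dim, occupancy,
      /*display_debug=*/false);

  dim3 grid(occupancy[0], occupancy[1], occupancy[2]);
  dim3 block(occupancy[3], occupancy[4], occupancy[5]);
  size_t max_idx = occupancy[6];  // number_of_pattern * h_dim * dim_x * dim_y

  kernel_approximation_pure_multiplication<<<grid, block>>>(
      gpu_a, gpu_b, gpu_out, number_of_frac_bits, approximation_enable,
      number_of_trunc_bits, ap_mask, max_idx);
  cudaDeviceSynchronize();
}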
17
network/CPP_Cuda_new_preview/kernel_helper_functions.cu
Normal file
@@ -0,0 +1,17 @@
#include <iostream>

#include "kernel_helper_functions.h"

void kernel_debug_plot(std::vector<size_t> output, bool display_debug) {
  if (display_debug == true) {
    std::cout << "grid x: " << output[0] << std::endl;
    std::cout << "grid y: " << output[1] << std::endl;
    std::cout << "grid z: " << output[2] << std::endl;
    std::cout << "thread block x: " << output[3] << std::endl;
    std::cout << "thread block y: " << output[4] << std::endl;
    std::cout << "thread block z: " << output[5] << std::endl;
    std::cout << "max_idx: " << output[6] << std::endl << std::endl;
  }

  return;
};
7
network/CPP_Cuda_new_preview/kernel_helper_functions.h
Normal file
@@ -0,0 +1,7 @@
#ifndef KERNEL_HELPER_FUNCTIONS
#define KERNEL_HELPER_FUNCTIONS
#include <vector>

void kernel_debug_plot(std::vector<size_t> output, bool display_debug);

#endif /* KERNEL_HELPER_FUNCTIONS */
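// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the repository): the flat-index
// decomposition that the following phxy kernels repeat. A linear thread index
// is split into (pattern, h, x, y) with the precomputed strides
// block_dim_c0 = h_dim * dim_x * dim_y, block_dim_c1 = dim_x * dim_y and
// block_dim_c2 = dim_y. The dimensions below are arbitrary example values.
// -----------------------------------------------------------------------------
#include <cstddef>
#include <cstdio>

int main() {
  size_t h_dim = 4, dim_x = 3, dim_y = 2;
  size_t block_dim_c0 = h_dim * dim_x * dim_y;  // 24
  size_t block_dim_c1 = dim_x * dim_y;          // 6
  size_t block_dim_c2 = dim_y;                  // 2

  size_t idx = 37;  // e.g. threadIdx.x + blockIdx.x * blockDim.x

  size_t pattern_id = idx / block_dim_c0;
  idx -= pattern_id * block_dim_c0;
  size_t idx_h = idx / block_dim_c1;
  idx -= idx_h * block_dim_c1;
  size_t position_x = idx / block_dim_c2;
  idx -= position_x * block_dim_c2;
  size_t position_y = idx;

  // For idx = 37 this prints: pattern 1, h 2, x 0, y 1.
  std::printf("pattern %zu, h %zu, x %zu, y %zu\n", pattern_id, idx_h,
              position_x, position_y);
  return 0;
}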
64
network/CPP_Cuda_new_preview/kernel_phxy_fill_with_h.cu
Normal file
|
@ -0,0 +1,64 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_phxy_fill_with_h.h"
|
||||
|
||||
__global__ void kernel_phxy_fill_with_h(float* __restrict__ h_memory,
|
||||
float* __restrict__ phxy_memory,
|
||||
size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t h_dim,
|
||||
size_t block_dim_c0,
|
||||
size_t block_dim_c1,
|
||||
size_t block_dim_c2, size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
size_t pattern_id = idx / block_dim_c0;
|
||||
idx -= pattern_id * block_dim_c0;
|
||||
size_t idx_h = idx / block_dim_c1;
|
||||
idx -= idx_h * block_dim_c1;
|
||||
size_t position_x = idx / block_dim_c2;
|
||||
idx -= position_x * block_dim_c2;
|
||||
size_t position_y = idx;
|
||||
|
||||
phxy_memory[pattern_id * phxy_dim_c0 + idx_h * phxy_dim_c1 +
|
||||
position_x * phxy_dim_c2 + position_y] = h_memory[idx_h];
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_phxy_fill_with_h(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size, (void*)kernel_phxy_fill_with_h, 0,
|
||||
max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_phxy_fill_with_h:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
18
network/CPP_Cuda_new_preview/kernel_phxy_fill_with_h.h
Normal file
|
@ -0,0 +1,18 @@
|
|||
#ifndef KERNEL_PHXY_FILL_WITH_H
|
||||
#define KERNEL_PHXY_FILL_WITH_H
|
||||
#include <vector>
|
||||
|
||||
__global__ void kernel_phxy_fill_with_h(float* __restrict__ h_memory,
|
||||
float* __restrict__ phxy_memory,
|
||||
size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t h_dim,
|
||||
size_t block_dim_c0,
|
||||
size_t block_dim_c1,
|
||||
size_t block_dim_c2, size_t max_idx);
|
||||
|
||||
void occupancy_kernel_phxy_fill_with_h(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
|
||||
#endif /* KERNEL_PHXY_FILL_WITH_H */
|
|
@ -0,0 +1,73 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_phxy_fill_with_spike_selected_w.h"
|
||||
|
||||
__global__ void kernel_phxy_fill_with_spike_selected_w(
|
||||
float* __restrict__ phxy_memory, float* __restrict__ weights_memory,
|
||||
int64_t* __restrict__ spike_memory, size_t spike_time,
|
||||
size_t weights_dim_c0, size_t spike_dim_c0, size_t spike_dim_c1,
|
||||
size_t spike_dim_c2, size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t h_dim, size_t block_dim_c0, size_t block_dim_c1,
|
||||
size_t block_dim_c2, size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
size_t pattern_id = idx / block_dim_c0;
|
||||
idx -= pattern_id * block_dim_c0;
|
||||
size_t idx_h = idx / block_dim_c1;
|
||||
idx -= idx_h * block_dim_c1;
|
||||
size_t position_x = idx / block_dim_c2;
|
||||
idx -= position_x * block_dim_c2;
|
||||
size_t position_y = idx;
|
||||
|
||||
int64_t* spike = spike_memory + pattern_id * spike_dim_c0 +
|
||||
spike_time * spike_dim_c1 + position_x * spike_dim_c2 +
|
||||
position_y;
|
||||
|
||||
if (*spike >= 0) {
|
||||
phxy_memory[pattern_id * phxy_dim_c0 + idx_h * phxy_dim_c1 +
|
||||
position_x * phxy_dim_c2 + position_y] =
|
||||
weights_memory[*spike * weights_dim_c0 + idx_h];
|
||||
} else {
|
||||
phxy_memory[pattern_id * phxy_dim_c0 + idx_h * phxy_dim_c1 +
|
||||
position_x * phxy_dim_c2 + position_y] = 0.0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_phxy_fill_with_spike_selected_w(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output, bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size,
|
||||
(void*)kernel_phxy_fill_with_spike_selected_w, 0, max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_phxy_fill_with_spike_selected_w:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
|
@ -0,0 +1,17 @@
|
|||
#ifndef KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W
|
||||
#define KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W
|
||||
#include <vector>
|
||||
|
||||
__global__ void kernel_phxy_fill_with_spike_selected_w(
|
||||
float* __restrict__ phxy_memory, float* __restrict__ weights_memory,
|
||||
int64_t* __restrict__ spike_memory, size_t spike_time,
|
||||
size_t weights_dim_c0, size_t spike_dim_c0, size_t spike_dim_c1,
|
||||
size_t spike_dim_c2, size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t h_dim, size_t block_dim_c0, size_t block_dim_c1,
|
||||
size_t block_dim_c2, size_t max_idx);
|
||||
|
||||
void occupancy_kernel_phxy_fill_with_spike_selected_w(
|
||||
size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output, bool display_debug);
|
||||
|
||||
#endif /* KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W */
|
|
@ -0,0 +1,74 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_phxy_one_over_sum_into_pxy.h"
|
||||
|
||||
__global__ void kernel_phxy_one_over_sum_into_pxy(
|
||||
float* __restrict__ phxy_memory, float* __restrict__ pxy_memory,
|
||||
size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2, size_t h_dim,
|
||||
size_t pxy_dim_c0, size_t pxy_dim_c1, size_t block_dim_c0,
|
||||
size_t block_dim_c1, size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
size_t pattern_id = idx / block_dim_c0;
|
||||
idx -= pattern_id * block_dim_c0;
|
||||
size_t position_x = idx / block_dim_c1;
|
||||
idx -= position_x * block_dim_c1;
|
||||
size_t position_y = idx;
|
||||
|
||||
size_t offset_phxy_temp =
|
||||
pattern_id * phxy_dim_c0 + position_x * phxy_dim_c2 + position_y;
|
||||
|
||||
size_t offset_pxy_sum =
|
||||
pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + position_y;
|
||||
|
||||
float temp = 0.0;
|
||||
for (size_t idx_h = 0; idx_h < h_dim; idx_h++) {
|
||||
temp += phxy_memory[offset_phxy_temp + idx_h * phxy_dim_c1];
|
||||
}
|
||||
if (temp > 1E-10) {
|
||||
pxy_memory[offset_pxy_sum] = 1.0 / temp;
|
||||
} else {
|
||||
pxy_memory[offset_pxy_sum] = 0.0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_phxy_one_over_sum_into_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size,
|
||||
(void*)kernel_phxy_one_over_sum_into_pxy, 0, max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_phxy_one_over_sum_into_pxy:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
|
@ -0,0 +1,17 @@
|
|||
#ifndef KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY
|
||||
#define KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY
|
||||
#include <vector>
|
||||
|
||||
__global__ void kernel_phxy_one_over_sum_into_pxy(
|
||||
float* __restrict__ phxy_memory, float* __restrict__ pxy_memory,
|
||||
size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2, size_t h_dim,
|
||||
size_t pxy_dim_c0, size_t pxy_dim_c1, size_t block_dim_c0,
|
||||
size_t block_dim_c1, size_t max_idx);
|
||||
|
||||
void occupancy_kernel_phxy_one_over_sum_into_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
|
||||
#endif /* KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY */
|
51
network/CPP_Cuda_new_preview/kernel_phxy_plus_phxy.cu
Normal file
|
@ -0,0 +1,51 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_phxy_plus_phxy.h"
|
||||
|
||||
__global__ void kernel_phxy_plus_phxy(float* __restrict__ phxy_memory_a,
|
||||
float* __restrict__ phxy_memory_b,
|
||||
size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
phxy_memory_a[idx] += phxy_memory_b[idx];
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size, (void*)kernel_phxy_plus_phxy, 0,
|
||||
max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_phxy_plus_phxy:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
12
network/CPP_Cuda_new_preview/kernel_phxy_plus_phxy.h
Normal file
|
@ -0,0 +1,12 @@
|
|||
#ifndef KERNEL_PHXY_PLUS_PHXY
|
||||
#define KERNEL_PHXY_PLUS_PHXY
|
||||
#include <vector>
|
||||
__global__ void kernel_phxy_plus_phxy(float* __restrict__ phxy_memory_a,
|
||||
float* __restrict__ phxy_memory_b,
|
||||
size_t max_idx);
|
||||
|
||||
void occupancy_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
#endif /* KERNEL_PHXY_PLUS_PHXY */
|
70
network/CPP_Cuda_new_preview/kernel_phxy_plus_pxy.cu
Normal file
|
@ -0,0 +1,70 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_phxy_plus_pxy.h"
|
||||
|
||||
__global__ void kernel_phxy_plus_pxy(float* __restrict__ phxy_memory,
|
||||
float* __restrict__ pxy_memory,
|
||||
size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t h_dim,
|
||||
size_t pxy_dim_c0, size_t pxy_dim_c1,
|
||||
size_t block_dim_c0, size_t block_dim_c1,
|
||||
size_t block_dim_c2, size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
size_t pattern_id = idx / block_dim_c0;
|
||||
idx -= pattern_id * block_dim_c0;
|
||||
size_t idx_h = idx / block_dim_c1;
|
||||
idx -= idx_h * block_dim_c1;
|
||||
size_t position_x = idx / block_dim_c2;
|
||||
idx -= position_x * block_dim_c2;
|
||||
size_t position_y = idx;
|
||||
|
||||
size_t offset_h_temp =
|
||||
pattern_id * phxy_dim_c0 + position_x * phxy_dim_c2 + position_y;
|
||||
|
||||
size_t offset_h_sum =
|
||||
pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + position_y;
|
||||
|
||||
phxy_memory[offset_h_temp + idx_h * phxy_dim_c1] +=
|
||||
pxy_memory[offset_h_sum];
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_phxy_plus_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size, (void*)kernel_phxy_plus_pxy, 0,
|
||||
max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_phxy_plus_pxy:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
16
network/CPP_Cuda_new_preview/kernel_phxy_plus_pxy.h
Normal file
|
@ -0,0 +1,16 @@
|
|||
#ifndef KERNEL_PHXY_PLUS_PXY
|
||||
#define KERNEL_PHXY_PLUS_PXY
|
||||
#include <vector>
|
||||
__global__ void kernel_phxy_plus_pxy(float* __restrict__ phxy_memory,
|
||||
float* __restrict__ pxy_memory,
|
||||
size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t h_dim,
|
||||
size_t pxy_dim_c0, size_t pxy_dim_c1,
|
||||
size_t block_dim_c0, size_t block_dim_c1,
|
||||
size_t block_dim_c2, size_t max_idx);
|
||||
|
||||
void occupancy_kernel_phxy_plus_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
#endif /* KERNEL_PHXY_PLUS_PXY */
|
|
@ -0,0 +1,53 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_phxy_times_phxy_equals_phxy.h"
|
||||
|
||||
__global__ void kernel_phxy_times_phxy_equals_phxy(
|
||||
float* __restrict__ phxy_memory_a, float* __restrict__ phxy_memory_b,
|
||||
float* __restrict__ phxy_memory_out, size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
phxy_memory_out[idx] = phxy_memory_a[idx] * phxy_memory_b[idx];
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_phxy_times_phxy_equals_phxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size,
|
||||
(void*)kernel_phxy_times_phxy_equals_phxy, 0, max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_phxy_times_phxy_equals_phxy:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
|
@ -0,0 +1,15 @@
|
|||
#ifndef KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY
|
||||
#define KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY
|
||||
#include <vector>
|
||||
|
||||
__global__ void kernel_phxy_times_phxy_equals_phxy(
|
||||
float* __restrict__ phxy_memory_a, float* __restrict__ phxy_memory_b,
|
||||
float* __restrict__ phxy_memory_out, size_t max_idx);
|
||||
|
||||
void occupancy_kernel_phxy_times_phxy_equals_phxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
|
||||
#endif /* KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY */
|
70
network/CPP_Cuda_new_preview/kernel_phxy_times_pxy.cu
Normal file
|
@ -0,0 +1,70 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_phxy_times_pxy.h"
|
||||
|
||||
__global__ void kernel_phxy_times_pxy(float* __restrict__ phxy_memory,
|
||||
float* __restrict__ pxy_memory,
|
||||
size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t h_dim,
|
||||
size_t pxy_dim_c0, size_t pxy_dim_c1,
|
||||
size_t block_dim_c0, size_t block_dim_c1,
|
||||
size_t block_dim_c2, size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
size_t pattern_id = idx / block_dim_c0;
|
||||
idx -= pattern_id * block_dim_c0;
|
||||
size_t idx_h = idx / block_dim_c1;
|
||||
idx -= idx_h * block_dim_c1;
|
||||
size_t position_x = idx / block_dim_c2;
|
||||
idx -= position_x * block_dim_c2;
|
||||
size_t position_y = idx;
|
||||
|
||||
size_t offset_h_temp =
|
||||
pattern_id * phxy_dim_c0 + position_x * phxy_dim_c2 + position_y;
|
||||
|
||||
size_t offset_h_sum =
|
||||
pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + position_y;
|
||||
|
||||
phxy_memory[offset_h_temp + idx_h * phxy_dim_c1] *=
|
||||
pxy_memory[offset_h_sum];
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_phxy_times_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size, (void*)kernel_phxy_times_pxy, 0,
|
||||
max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_phxy_times_pxy:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
17
network/CPP_Cuda_new_preview/kernel_phxy_times_pxy.h
Normal file
|
@ -0,0 +1,17 @@
|
|||
#ifndef KERNEL_PHXY_TIMES_PXY
|
||||
#define KERNEL_PHXY_TIMES_PXY
|
||||
#include <vector>
|
||||
|
||||
__global__ void kernel_phxy_times_pxy(float* __restrict__ phxy_memory,
|
||||
float* __restrict__ pxy_memory,
|
||||
size_t phxy_dim_c0, size_t phxy_dim_c1,
|
||||
size_t phxy_dim_c2, size_t h_dim,
|
||||
size_t pxy_dim_c0, size_t pxy_dim_c1,
|
||||
size_t block_dim_c0, size_t block_dim_c1,
|
||||
size_t block_dim_c2, size_t max_idx);
|
||||
|
||||
void occupancy_kernel_phxy_times_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
#endif /* KERNEL_PHXY_TIMES_PXY */
|
50
network/CPP_Cuda_new_preview/kernel_pxy_plus_v.cu
Normal file
|
@ -0,0 +1,50 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_pxy_plus_v.h"
|
||||
|
||||
__global__ void kernel_pxy_plus_v(float* __restrict__ pxy_memory, float value,
|
||||
size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
pxy_memory[idx] += value;
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_pxy_plus_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size, (void*)kernel_pxy_plus_v, 0,
|
||||
max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_pxy_plus_v:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
12
network/CPP_Cuda_new_preview/kernel_pxy_plus_v.h
Normal file
|
@ -0,0 +1,12 @@
|
|||
#ifndef KERNEL_PXY_PLUS_V
|
||||
#define KERNEL_PXY_PLUS_V
|
||||
#include <vector>
|
||||
|
||||
__global__ void kernel_pxy_plus_v(float* __restrict__ pxy_memory, float value,
|
||||
size_t max_idx);
|
||||
|
||||
void occupancy_kernel_pxy_plus_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
#endif /* KERNEL_PXY_PLUS_V */
|
50
network/CPP_Cuda_new_preview/kernel_pxy_reciprocal.cu
Normal file
|
@ -0,0 +1,50 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_pxy_reciprocal.h"
|
||||
|
||||
__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory,
|
||||
size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
pxy_memory[idx] = 1.0 / pxy_memory[idx];
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size, (void*)kernel_pxy_reciprocal, 0,
|
||||
max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_pxy_reciprocal:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
12
network/CPP_Cuda_new_preview/kernel_pxy_reciprocal.h
Normal file
|
@ -0,0 +1,12 @@
|
|||
#ifndef KERNEL_PXY_RECIPROCAL
|
||||
#define KERNEL_PXY_RECIPROCAL
|
||||
#include <vector>
|
||||
__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory,
|
||||
size_t max_idx);
|
||||
|
||||
void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
|
||||
#endif /* KERNEL_PXY_RECIPROCAL */
|
50
network/CPP_Cuda_new_preview/kernel_pxy_set_to_v.cu
Normal file
|
@ -0,0 +1,50 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_pxy_set_to_v.h"
|
||||
|
||||
__global__ void kernel_pxy_set_to_v(float* __restrict__ pxy_memory, float value,
|
||||
size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
pxy_memory[idx] = value;
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size, (void*)kernel_pxy_set_to_v, 0,
|
||||
max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_pxy_set_to_v:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
13
network/CPP_Cuda_new_preview/kernel_pxy_set_to_v.h
Normal file
|
@ -0,0 +1,13 @@
|
|||
#ifndef KERNEL_PXY_SET_TO_V
|
||||
#define KERNEL_PXY_SET_TO_V
|
||||
#include <vector>
|
||||
|
||||
__global__ void kernel_pxy_set_to_v(float* __restrict__ pxy_memory, float value,
|
||||
size_t max_idx);
|
||||
|
||||
void occupancy_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
|
||||
#endif /* KERNEL_PXY_SET_TO_V */
|
58
network/CPP_Cuda_new_preview/kernel_pxy_time_pxy.cu
Normal file
|
@ -0,0 +1,58 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_pxy_time_pxy.h"
|
||||
|
||||
// a *= b
|
||||
__global__ void kernel_pxy_time_pxy(float* __restrict__ pxy_memory_a,
|
||||
float* __restrict__ pxy_memory_b,
|
||||
size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
pxy_memory_a[idx] *= pxy_memory_b[idx];
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size, (void*)kernel_pxy_time_pxy, 0,
|
||||
max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
size_t gpu_tuning_factor = 5;
|
||||
|
||||
if ((gpu_tuning_factor > 0) && (gpu_tuning_factor < thread_block_size)) {
|
||||
thread_block_size = int(gpu_tuning_factor);
|
||||
}
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_pxy_time_pxy:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
15
network/CPP_Cuda_new_preview/kernel_pxy_time_pxy.h
Normal file
|
@ -0,0 +1,15 @@
|
|||
#ifndef KERNEL_PXY_TIME_PXY
|
||||
#define KERNEL_PXY_TIME_PXY
|
||||
|
||||
#include <vector>
|
||||
|
||||
__global__ void kernel_pxy_time_pxy(float* __restrict__ pxy_memory_a,
|
||||
float* __restrict__ pxy_memory_b,
|
||||
size_t max_idx);
|
||||
|
||||
void occupancy_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
|
||||
#endif /* KERNEL_PXY_TIME_PXY */
|
|
@ -0,0 +1,73 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_pxy_times_spike_selected_sxy.h"
|
||||
|
||||
__global__ void kernel_pxy_times_spike_selected_sxy(
|
||||
float* __restrict__ pxy_memory, float* __restrict__ sxy_memory,
|
||||
int64_t* __restrict__ spike_memory, size_t spike_time, size_t spike_dim_c0,
|
||||
size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0,
|
||||
size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1,
|
||||
size_t block_dim_c0, size_t block_dim_c1, size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
size_t pattern_id = idx / block_dim_c0;
|
||||
idx -= pattern_id * block_dim_c0;
|
||||
size_t position_x = idx / block_dim_c1;
|
||||
idx -= position_x * block_dim_c1;
|
||||
size_t position_y = idx;
|
||||
|
||||
int64_t* spike = spike_memory + pattern_id * spike_dim_c0 +
|
||||
spike_time * spike_dim_c1 + position_x * spike_dim_c2 +
|
||||
position_y;
|
||||
|
||||
if (*spike >= 0) {
|
||||
pxy_memory[pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 +
|
||||
position_y] *=
|
||||
sxy_memory[*spike * sxy_dim_c0 + position_x * sxy_dim_c1 +
|
||||
position_y];
|
||||
} else {
|
||||
pxy_memory[pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 +
|
||||
position_y] = 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_pxy_times_spike_selected_sxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size,
|
||||
(void*)kernel_pxy_times_spike_selected_sxy, 0, max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_pxy_times_spike_selected_sxy:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
|
@ -0,0 +1,18 @@
|
|||
#ifndef KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY
|
||||
#define KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY
|
||||
#include <vector>
|
||||
|
||||
__global__ void kernel_pxy_times_spike_selected_sxy(
|
||||
float* __restrict__ pxy_memory, float* __restrict__ sxy_memory,
|
||||
int64_t* __restrict__ spike_memory, size_t spike_time, size_t spike_dim_c0,
|
||||
size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0,
|
||||
size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1,
|
||||
size_t block_dim_c0, size_t block_dim_c1, size_t max_idx);
|
||||
|
||||
void occupancy_kernel_pxy_times_spike_selected_sxy(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern,
|
||||
size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
|
||||
#endif /* KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY */
|
50
network/CPP_Cuda_new_preview/kernel_pxy_times_v.cu
Normal file
|
@ -0,0 +1,50 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernel_helper_functions.h"
|
||||
#include "kernel_pxy_times_v.h"
|
||||
|
||||
__global__ void kernel_pxy_times_v(float* __restrict__ pxy_memory, float value,
|
||||
size_t max_idx) {
|
||||
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (idx < max_idx) {
|
||||
pxy_memory[idx] *= value;
|
||||
}
|
||||
};
|
||||
|
||||
void occupancy_kernel_pxy_times_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug) {
|
||||
size_t max_threadable_tasks;
|
||||
cudaError_t status;
|
||||
|
||||
int min_grid_size;
|
||||
int thread_block_size;
|
||||
int grid_size;
|
||||
|
||||
max_threadable_tasks = number_of_pattern * dim_x * dim_y;
|
||||
|
||||
status = cudaOccupancyMaxPotentialBlockSize(
|
||||
&min_grid_size, &thread_block_size, (void*)kernel_pxy_times_v, 0,
|
||||
max_threadable_tasks);
|
||||
assert((status == cudaSuccess));
|
||||
|
||||
grid_size =
|
||||
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||
|
||||
output.resize(7);
|
||||
output[0] = grid_size;
|
||||
output[1] = 1;
|
||||
output[2] = 1;
|
||||
output[3] = thread_block_size;
|
||||
output[4] = 1;
|
||||
output[5] = 1;
|
||||
output[6] = max_threadable_tasks;
|
||||
|
||||
if (display_debug == true) {
|
||||
std::cout << "kernel_pxy_times_v:" << std::endl;
|
||||
kernel_debug_plot(output, display_debug);
|
||||
}
|
||||
};
|
13
network/CPP_Cuda_new_preview/kernel_pxy_times_v.h
Normal file
|
@ -0,0 +1,13 @@
|
|||
#ifndef KERNEL_PXY_TIMES_V
|
||||
#define KERNEL_PXY_TIMES_V
|
||||
#include <vector>
|
||||
|
||||
__global__ void kernel_pxy_times_v(float* __restrict__ pxy_memory, float value,
|
||||
size_t max_idx);
|
||||
|
||||
void occupancy_kernel_pxy_times_v(size_t dim_x, size_t dim_y,
|
||||
size_t number_of_pattern, size_t h_dim,
|
||||
std::vector<size_t>& output,
|
||||
bool display_debug);
|
||||
|
||||
#endif /* KERNEL_PXY_TIMES_V */
|
97
network/CPP_Cuda_new_preview/kernel_spike_generation.cu
Normal file
@@ -0,0 +1,97 @@
#include <cassert>
#include <iostream>

#include "kernel_helper_functions.h"
#include "kernel_spike_generation.h"

__device__ size_t gpu_lower_bound(float* __restrict__ data_ptr,
                                  size_t data_length, size_t data_ptr_stride,
                                  float compare_to_value) {
  size_t start_of_range = 0;
  size_t length_of_range = data_length;

  while (length_of_range != 0) {
    size_t half_length = length_of_range >> 1;
    size_t actual_position = start_of_range + half_length;

    if (data_ptr[actual_position * data_ptr_stride] < compare_to_value) {
      start_of_range = ++actual_position;
      length_of_range -= half_length + 1;
    } else
      length_of_range = half_length;
  }
  return start_of_range;
};

__global__ void kernel_spike_generation(
    float* __restrict__ input_pointer, size_t input_dim_c0, size_t input_dim_c1,
    size_t input_dim_c2, float* __restrict__ random_values_pointer,
    size_t random_values_dim_c0, size_t random_values_dim_c1,
    size_t random_values_dim_c2, int64_t* __restrict__ output_pointer,
    size_t output_dim_c0, size_t output_dim_c1, size_t output_dim_c2,
    size_t x_dim, size_t y_dim, size_t spike_dim, size_t h_dim,
    size_t block_dim_c0, size_t block_dim_c1, size_t block_dim_c2,
    size_t max_threadable_tasks) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if (idx < max_threadable_tasks) {
    size_t pattern_id = idx / block_dim_c0;
    idx -= pattern_id * block_dim_c0;
    size_t position_spike = idx / block_dim_c1;
    idx -= position_spike * block_dim_c1;
    size_t position_x = idx / block_dim_c2;
    idx -= position_x * block_dim_c2;
    size_t position_y = idx;

    float* p_ptr = input_pointer + pattern_id * input_dim_c0 +
                   position_x * input_dim_c2 + position_y;

    int64_t* out_ptr = output_pointer + pattern_id * output_dim_c0 +
                       position_x * output_dim_c2 + position_y +
                       position_spike * output_dim_c1;

    float* rand_ptr = random_values_pointer +
                      pattern_id * random_values_dim_c0 +
                      position_x * random_values_dim_c2 + position_y +
                      position_spike * random_values_dim_c1;

    *out_ptr = gpu_lower_bound(p_ptr, h_dim, input_dim_c1, *rand_ptr);
  }
};

void occupancy_kernel_spike_generation(size_t dim_x, size_t dim_y,
                                       size_t number_of_pattern,
                                       size_t spike_dim,
                                       std::vector<size_t>& output,
                                       bool display_debug) {
  size_t max_threadable_tasks;
  cudaError_t status;

  int min_grid_size;
  int thread_block_size;
  int grid_size;

  max_threadable_tasks = number_of_pattern * spike_dim * dim_x * dim_y;

  status = cudaOccupancyMaxPotentialBlockSize(
      &min_grid_size, &thread_block_size, (void*)kernel_spike_generation, 0,
      max_threadable_tasks);
  assert((status == cudaSuccess));

  grid_size =
      (max_threadable_tasks + thread_block_size - 1) / thread_block_size;

  output.resize(7);
  output[0] = grid_size;
  output[1] = 1;
  output[2] = 1;
  output[3] = thread_block_size;
  output[4] = 1;
  output[5] = 1;
  output[6] = max_threadable_tasks;

  if (display_debug == true) {
    std::cout << "kernel_spike_generation:" << std::endl;
    kernel_debug_plot(output, display_debug);
  }
};
21
network/CPP_Cuda_new_preview/kernel_spike_generation.h
Normal file
@@ -0,0 +1,21 @@
#ifndef KERNEL_SPIKE_GENERATION
#define KERNEL_SPIKE_GENERATION
#include <vector>

__global__ void kernel_spike_generation(
    float* __restrict__ input_pointer, size_t input_dim_c0, size_t input_dim_c1,
    size_t input_dim_c2, float* __restrict__ random_values_pointer,
    size_t random_values_dim_c0, size_t random_values_dim_c1,
    size_t random_values_dim_c2, int64_t* __restrict__ output_pointer,
    size_t output_dim_c0, size_t output_dim_c1, size_t output_dim_c2,
    size_t x_dim, size_t y_dim, size_t spike_dim, size_t h_dim,
    size_t block_dim_c0, size_t block_dim_c1, size_t block_dim_c2,
    size_t max_threadable_tasks);

void occupancy_kernel_spike_generation(size_t dim_x, size_t dim_y,
                                       size_t number_of_pattern,
                                       size_t spike_dim,
                                       std::vector<size_t>& output,
                                       bool display_debug);

#endif /* KERNEL_SPIKE_GENERATION */
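// -----------------------------------------------------------------------------
// Illustrative sketch (not part of the repository): a CPU reference for the
// strided gpu_lower_bound() search used by kernel_spike_generation above.
// Assuming the input along the h dimension holds a cumulative distribution in
// [0, 1] (an assumption, not stated in this file), mapping a uniform random
// value to the first position that is not smaller than it amounts to
// inverse-CDF sampling of a spike index. The example data below is made up.
// -----------------------------------------------------------------------------
#include <cstddef>
#include <cstdio>

static size_t lower_bound_strided(const float* data, size_t length,
                                  size_t stride, float value) {
  size_t start = 0;
  size_t remaining = length;
  while (remaining != 0) {
    size_t half = remaining >> 1;
    size_t probe = start + half;
    if (data[probe * stride] < value) {
      start = probe + 1;
      remaining -= half + 1;
    } else {
      remaining = half;
    }
  }
  return start;
}

int main() {
  // Cumulative distribution over h_dim = 4 "neurons", stored with stride 1.
  const float cdf[4] = {0.1f, 0.4f, 0.8f, 1.0f};
  const float random_value = 0.5f;
  size_t spike_index = lower_bound_strided(cdf, 4, 1, random_value);
  std::printf("spike index: %zu\n", spike_index);  // prints 2 (0.8 >= 0.5)
  return 0;
}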
23
network/CPP_Cuda_new_preview/pybind11_auto_pyi.py
Normal file
23
network/CPP_Cuda_new_preview/pybind11_auto_pyi.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
# %%
# pip install pybind11-stubgen
from pybind11_stubgen import ModuleStubsGenerator  # type: ignore
import glob


def process(module_name: str) -> None:
    module = ModuleStubsGenerator(module_name)
    module.parse()
    module.write_setup_py = False

    with open(module_name + ".pyi", "w") as fp:
        fp.write("#\n# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!\n#\n\n")
        fp.write("\n".join(module.to_lines()))


Files = glob.glob("*.so")

for fid in Files:
    Idx: int = fid.find(".")
    module_name: str = fid[:Idx]
    print("Processing: " + module_name)
    process(module_name)
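
# Usage note (added, assumption about the build layout): run this script from
# the directory that contains the compiled pybind11 *.so modules, e.g.
#   python pybind11_auto_pyi.py
# It then writes a matching <module>.pyi stub file next to each extension.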
767
network/CPP_Cuda_new_preview/test_kernel.py
Normal file
@@ -0,0 +1,767 @@
import torch
import math
import random

from PyTestKernel import TestKernel

# TODO: kernel_phxy_plus_pxy, kernel_phxy_times_pxy,
# kernel_phxy_fill_h, kernel_phxy_one_over_sum_into_pxy,
# test_kernel_phxy_fill_with_spike_selected_w => 4D index

# pxy = number_of_pattern * dim_x * dim_y
# phxy = number_of_pattern * h_dim * dim_x * dim_y
# sxy = s_dim * dim_x * dim_y
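
# Added note (a sketch of the indexing convention used below, not part of the
# original kernels): the *_dim_c0/c1/c2 values are the strides that map a
# multi-dimensional index onto the flat buffer handed to the CUDA side.
# For a contiguous pxy tensor t of shape (number_of_pattern, dim_x, dim_y):
#   pxy_dim_c0 = dim_x * dim_y      # stride of the pattern index p
#   pxy_dim_c1 = dim_y              # stride of the x index
#   flat_offset = p * pxy_dim_c0 + x * pxy_dim_c1 + y
#   t.flatten()[flat_offset] == t[p, x, y]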


def test_kernel_pxy_times_spike_selected_sxy(
    h_dim,
    s_dim,
    number_of_pattern,
    dim_x,
    dim_y,
    display_debug,
    spike_time,
    number_of_spikes,
):
    print("test_kernel_pxy_times_spike_selected_sxy")
    # void test_kernel_pxy_times_spike_selected_sxy(
    # size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
    # bool display_debug, int64_t pxy_memory_addr, int64_t sxy_memory_addr,
    # int64_t spike_memory_addr, size_t spike_time, size_t spike_dim_c0,
    # size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0,
    # size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1);

    memory_pxy = torch.rand(
        (number_of_pattern, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_sxy = torch.rand(
        (s_dim, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_spikes = (
        torch.rand(
            (number_of_pattern, number_of_spikes, dim_x, dim_y),
            dtype=torch.float32,
            device=torch.device("cuda:0"),
        )
        * float(s_dim)
    ).type(dtype=torch.int64)

    pxy_dim_c0 = int(dim_x * dim_y)
    pxy_dim_c1 = int(dim_y)

    sxy_dim_c0 = int(dim_x * dim_y)
    sxy_dim_c1 = int(dim_y)

    spike_dim_c0 = int(number_of_spikes * dim_x * dim_y)
    spike_dim_c1 = int(dim_x * dim_y)
    spike_dim_c2 = int(dim_y)

    memory_pxy_copy = memory_pxy.clone()
    memory_sxy_copy = memory_sxy.clone()
    memory_spikes_copy = memory_spikes.clone()

    my_kernels = TestKernel()
    my_kernels.test_kernel_pxy_times_spike_selected_sxy(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        memory_pxy.data_ptr(),
        memory_sxy.data_ptr(),
        memory_spikes.data_ptr(),
        spike_time,
        spike_dim_c0,
        spike_dim_c1,
        spike_dim_c2,
        pxy_dim_c0,
        pxy_dim_c1,
        sxy_dim_c0,
        sxy_dim_c1,
    )

    for p in range(0, memory_spikes_copy.shape[0]):
        for x in range(0, memory_spikes_copy.shape[2]):
            for y in range(0, memory_spikes_copy.shape[3]):
                spike = memory_spikes_copy[p, spike_time, x, y]

                if spike >= 0:
                    memory_pxy_copy[p, x, y] *= memory_sxy_copy[spike, x, y]
                else:
                    memory_pxy_copy[p, x, y] = 0.0
    print(f"difference: {torch.abs(memory_pxy - memory_pxy_copy).max():.4e}")
    print()
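
# Added sketch (hypothetical helper, not called by the tests above): the nested
# Python loops in test_kernel_pxy_times_spike_selected_sxy can also be written
# as a vectorized torch reference, which is much faster for large tensors.
def reference_pxy_times_spike_selected_sxy(memory_pxy, memory_sxy, memory_spikes, spike_time):
    # Spike index chosen for every (pattern, x, y) position at this time step.
    sel = memory_spikes[:, spike_time]  # (P, X, Y), int64
    device = memory_sxy.device
    xs = torch.arange(memory_sxy.shape[1], device=device).view(1, -1, 1)
    ys = torch.arange(memory_sxy.shape[2], device=device).view(1, 1, -1)
    # Gather sxy[spike, x, y]; clamp keeps negative spike indices in range,
    # those positions are zeroed out by the torch.where below.
    gathered = memory_sxy[sel.clamp(min=0), xs, ys]
    return torch.where(sel >= 0, memory_pxy * gathered, torch.zeros_like(memory_pxy))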


def test_kernel_phxy_fill_with_spike_selected_w(
    h_dim,
    s_dim,
    number_of_pattern,
    dim_x,
    dim_y,
    display_debug,
    spike_time,
    number_of_spikes,
):
    print("test_kernel_phxy_fill_with_spike_selected_w")
    # void test_kernel_phxy_fill_with_spike_selected_w(
    # size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
    # bool display_debug, size_t spike_time, size_t weights_dim_c0,
    # size_t spike_dim_c0, size_t spike_dim_c1, size_t spike_dim_c2,
    # size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2,
    # int64_t phxy_memory_addr, int64_t weight_memory_addr,
    # int64_t spike_memory_addr);

    memory_phxy = torch.rand(
        (number_of_pattern, h_dim, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_w = torch.rand(
        (s_dim, h_dim),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_spikes = (
        torch.rand(
            (number_of_pattern, number_of_spikes, dim_x, dim_y),
            dtype=torch.float32,
            device=torch.device("cuda:0"),
        )
        * float(s_dim)
    ).type(dtype=torch.int64)

    phxy_dim_c0 = int(h_dim * dim_x * dim_y)
    phxy_dim_c1 = int(dim_x * dim_y)
    phxy_dim_c2 = int(dim_y)

    spike_dim_c0 = int(number_of_spikes * dim_x * dim_y)
    spike_dim_c1 = int(dim_x * dim_y)
    spike_dim_c2 = int(dim_y)

    weights_dim_c0 = int(h_dim)

    memory_phxy_copy = memory_phxy.clone()
    memory_w_copy = memory_w.clone()
    memory_spikes_copy = memory_spikes.clone()

    my_kernels = TestKernel()
    my_kernels.test_kernel_phxy_fill_with_spike_selected_w(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        spike_time,
        weights_dim_c0,
        spike_dim_c0,
        spike_dim_c1,
        spike_dim_c2,
        phxy_dim_c0,
        phxy_dim_c1,
        phxy_dim_c2,
        memory_phxy.data_ptr(),
        memory_w.data_ptr(),
        memory_spikes.data_ptr(),
    )

    for p in range(0, memory_spikes_copy.shape[0]):
        for x in range(0, memory_spikes_copy.shape[2]):
            for y in range(0, memory_spikes_copy.shape[3]):
                spike = memory_spikes_copy[p, spike_time, x, y]

                if spike >= 0:
                    memory_phxy_copy[p, :, x, y] = memory_w_copy[spike, :]
                else:
                    memory_phxy_copy[p, :, x, y] = 0.0

    print(f"difference: {torch.abs(memory_phxy - memory_phxy_copy).max():.4e}")
    print()
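
# Added sketch (hypothetical helper, not called above): a vectorized reference
# for the weight-fill check, using fancy indexing on the (s_dim, h_dim) weight
# matrix instead of the three nested Python loops.
def reference_phxy_fill_with_spike_selected_w(memory_w, memory_spikes, spike_time):
    sel = memory_spikes[:, spike_time]  # (P, X, Y), int64
    w_sel = memory_w[sel.clamp(min=0)]  # (P, X, Y, H) via fancy indexing
    w_sel = torch.where((sel >= 0).unsqueeze(-1), w_sel, torch.zeros_like(w_sel))
    return w_sel.permute(0, 3, 1, 2).contiguous()  # (P, H, X, Y)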


def test_kernel_phxy_one_over_sum_into_pxy(
    h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
):
    print("test_kernel_phxy_one_over_sum_into_pxy")
    # void test_kernel_phxy_one_over_sum_into_pxy(
    # size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim,
    # bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1,
    # size_t phxy_dim_c2, size_t pxy_dim_c0, size_t pxy_dim_c1,
    # int64_t phxy_memory_addr, int64_t pxy_memory_addr);

    memory_a = torch.rand(
        (number_of_pattern, h_dim, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_b = torch.rand(
        (number_of_pattern, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    pxy_dim_c0 = int(dim_x * dim_y)
    pxy_dim_c1 = int(dim_y)

    phxy_dim_c0 = int(h_dim * dim_x * dim_y)
    phxy_dim_c1 = int(dim_x * dim_y)
    phxy_dim_c2 = int(dim_y)

    memory_a_copy = memory_a.clone()
    memory_b_copy = memory_b.clone()

    my_kernels = TestKernel()
    my_kernels.test_kernel_phxy_one_over_sum_into_pxy(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        phxy_dim_c0,
        phxy_dim_c1,
        phxy_dim_c2,
        pxy_dim_c0,
        pxy_dim_c1,
        memory_a.data_ptr(),
        memory_b.data_ptr(),
    )
    memory_temp_copy = memory_a_copy.sum(dim=1)

    memory_b_copy = torch.where(memory_temp_copy > 1e-10, 1.0 / memory_temp_copy, 0.0)
    print(
        "Remember: \nAn error of exactly 0 is unlikely because the GPU \nsums the values in a different order."
    )
    print(f"difference: {torch.abs(memory_b - memory_b_copy).max():.4e}")
    print()
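
# Added note (sketch; the tolerances are an assumption): because the GPU reduces
# the h dimension in a different order than torch.sum, a tolerance-based check is
# more robust than expecting the printed difference to be exactly zero, e.g.
#   assert torch.allclose(memory_b, memory_b_copy, rtol=1e-5, atol=1e-6)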


def test_kernel_phxy_fill_with_h(
    h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
):
    print("test_kernel_phxy_fill_with_h")
    # void test_kernel_phxy_fill_with_h(size_t dim_x, size_t dim_y,
    # size_t number_of_pattern, size_t h_dim,
    # bool display_debug, size_t phxy_dim_c0,
    # size_t phxy_dim_c1, size_t phxy_dim_c2,
    # int64_t h_memory_addr,
    # int64_t phxy_memory_addr);

    memory_a = torch.rand(
        (number_of_pattern, h_dim, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_h = torch.rand(
        (h_dim),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    phxy_dim_c0 = int(h_dim * dim_x * dim_y)
    phxy_dim_c1 = int(dim_x * dim_y)
    phxy_dim_c2 = int(dim_y)

    memory_a_copy = memory_a.clone()
    memory_h_copy = memory_h.clone()

    my_kernels = TestKernel()
    my_kernels.test_kernel_phxy_fill_with_h(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        phxy_dim_c0,
        phxy_dim_c1,
        phxy_dim_c2,
        memory_h.data_ptr(),
        memory_a.data_ptr(),
    )
    for p in range(0, memory_a_copy.shape[0]):
        for x in range(0, memory_a_copy.shape[2]):
            for y in range(0, memory_a_copy.shape[3]):
                memory_a_copy[p, :, x, y] = memory_h_copy

    print(f"difference: {torch.abs(memory_a - memory_a_copy).max():.4e}")
    print()


def test_kernel_phxy_plus_pxy(
    h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
):
    print("test_kernel_phxy_plus_pxy")
    # void test_kernel_phxy_plus_pxy(size_t dim_x, size_t dim_y,
    # size_t number_of_pattern, size_t h_dim,
    # bool display_debug, size_t phxy_dim_c0,
    # size_t phxy_dim_c1, size_t phxy_dim_c2,
    # size_t pxy_dim_c0, size_t pxy_dim_c1,
    # int64_t phxy_memory_addr,
    # int64_t pxy_memory_addr);

    memory_a = torch.rand(
        (number_of_pattern, h_dim, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_b = torch.rand(
        (number_of_pattern, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    pxy_dim_c0 = int(dim_x * dim_y)
    pxy_dim_c1 = int(dim_y)

    phxy_dim_c0 = int(h_dim * dim_x * dim_y)
    phxy_dim_c1 = int(dim_x * dim_y)
    phxy_dim_c2 = int(dim_y)

    memory_a_copy = memory_a.clone()
    memory_b_copy = memory_b.clone()

    my_kernels = TestKernel()
    my_kernels.test_kernel_phxy_plus_pxy(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        phxy_dim_c0,
        phxy_dim_c1,
        phxy_dim_c2,
        pxy_dim_c0,
        pxy_dim_c1,
        memory_a.data_ptr(),
        memory_b.data_ptr(),
    )

    memory_a_copy += memory_b_copy.unsqueeze(1)

    print(f"difference: {torch.abs(memory_a - memory_a_copy).max():.4e}")
    print()


def test_kernel_phxy_times_pxy(
    h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
):
    print("test_kernel_phxy_times_pxy")
    # void test_kernel_phxy_times_pxy(size_t dim_x, size_t dim_y,
    # size_t number_of_pattern, size_t h_dim,
    # bool display_debug, size_t phxy_dim_c0,
    # size_t phxy_dim_c1, size_t phxy_dim_c2,
    # size_t pxy_dim_c0, size_t pxy_dim_c1,
    # int64_t phxy_memory_addr,
    # int64_t pxy_memory_addr);

    memory_a = torch.rand(
        (number_of_pattern, h_dim, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_b = torch.rand(
        (number_of_pattern, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    pxy_dim_c0 = int(dim_x * dim_y)
    pxy_dim_c1 = int(dim_y)

    phxy_dim_c0 = int(h_dim * dim_x * dim_y)
    phxy_dim_c1 = int(dim_x * dim_y)
    phxy_dim_c2 = int(dim_y)

    memory_a_copy = memory_a.clone()
    memory_b_copy = memory_b.clone()

    my_kernels = TestKernel()
    my_kernels.test_kernel_phxy_times_pxy(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        phxy_dim_c0,
        phxy_dim_c1,
        phxy_dim_c2,
        pxy_dim_c0,
        pxy_dim_c1,
        memory_a.data_ptr(),
        memory_b.data_ptr(),
    )

    memory_a_copy *= memory_b_copy.unsqueeze(1)

    print(f"difference: {torch.abs(memory_a - memory_a_copy).max():.4e}")
    print()


def test_kernel_phxy_times_phxy_equals_phxy(
    h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
):
    print("test_kernel_phxy_times_phxy_equals_phxy")
    # void test_kernel_phxy_times_phxy_equals_phxy(size_t dim_x, size_t dim_y,
    # size_t number_of_pattern,
    # size_t h_dim, bool display_debug,
    # int64_t phxy_memory_a_addr,
    # int64_t phxy_memory_b_addr,
    # int64_t phxy_memory_out_addr);

    memory_a = torch.rand(
        (number_of_pattern, h_dim, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_b = torch.rand(
        (number_of_pattern, h_dim, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_out = torch.rand(
        (number_of_pattern, h_dim, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_a_copy = memory_a.clone()
    memory_b_copy = memory_b.clone()

    my_kernels = TestKernel()
    my_kernels.test_kernel_phxy_times_phxy_equals_phxy(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        memory_a.data_ptr(),
        memory_b.data_ptr(),
        memory_out.data_ptr(),
    )

    memory_out_copy = memory_a_copy * memory_b_copy

    print(f"difference: {torch.abs(memory_out - memory_out_copy).max():.4e}")
    print()


def test_kernel_phxy_plus_phxy(
    h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
):
    print("test_kernel_phxy_plus_phxy")
    # void test_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y,
    # size_t number_of_pattern, size_t h_dim,
    # bool display_debug,
    # int64_t phxy_memory_a_addr,
    # int64_t phxy_memory_b_addr);

    memory_a = torch.rand(
        (number_of_pattern, h_dim, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_b = torch.rand(
        (number_of_pattern, h_dim, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    memory_a_copy = memory_a.clone()
    memory_b_copy = memory_b.clone()

    my_kernels = TestKernel()
    my_kernels.test_kernel_phxy_plus_phxy(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        memory_a.data_ptr(),
        memory_b.data_ptr(),
    )

    memory_a_copy += memory_b_copy

    print(f"difference: {torch.abs(memory_a - memory_a_copy).max():.4e}")
    print()


def test_kernel_pxy_time_pxy(
    h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
):
    print("test_kernel_pxy_time_pxy")

    # void test_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y,
    # size_t number_of_pattern, size_t h_dim,
    # bool display_debug, int64_t pxy_memory_a_addr,
    # int64_t pxy_memory_b_addr);

    epsilon_memory_a = torch.rand(
        (number_of_pattern, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    epsilon_memory_b = torch.rand(
        (number_of_pattern, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    epsilon_memory_a_copy = epsilon_memory_a.clone()
    epsilon_memory_b_copy = epsilon_memory_b.clone()

    my_kernels = TestKernel()
    my_kernels.test_kernel_pxy_time_pxy(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        epsilon_memory_a.data_ptr(),
        epsilon_memory_b.data_ptr(),
    )

    epsilon_memory_a_copy *= epsilon_memory_b_copy

    print(
        f"difference: {torch.abs(epsilon_memory_a - epsilon_memory_a_copy).max():.4e}"
    )
    print()


def test_kernel_pxy_times_v(
    h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
):
    print("test_kernel_pxy_times_v")

    # void test_kernel_pxy_times_v(size_t dim_x, size_t dim_y,
    # size_t number_of_pattern, size_t h_dim,
    # bool display_debug, float value,
    # int64_t pxy_memory_addr);

    epsilon_memory = torch.rand(
        (number_of_pattern, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    epsilon_memory_copy = epsilon_memory.clone()
    value = float(math.pi)

    my_kernels = TestKernel()
    my_kernels.test_kernel_pxy_times_v(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        value,
        epsilon_memory.data_ptr(),
    )

    epsilon_memory_copy = epsilon_memory_copy * value

    print(f"difference: {torch.abs(epsilon_memory - epsilon_memory_copy).max():.4e}")
    print()


def test_kernel_pxy_plus_v(
    h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
):
    print("test_kernel_pxy_plus_v")
    # void test_kernel_pxy_plus_v(size_t dim_x, size_t dim_y,
    # size_t number_of_pattern, size_t h_dim,
    # bool display_debug, float value,
    # int64_t pxy_memory_addr);

    epsilon_memory = torch.rand(
        (number_of_pattern, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    epsilon_memory_copy = epsilon_memory.clone()
    value = float(math.pi)

    my_kernels = TestKernel()
    my_kernels.test_kernel_pxy_plus_v(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        value,
        epsilon_memory.data_ptr(),
    )

    epsilon_memory_copy = epsilon_memory_copy + value

    print(f"difference: {torch.abs(epsilon_memory - epsilon_memory_copy).max():.4e}")
    print()


def test_kernel_pxy_set_to_v(
    h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
):
    print("test_kernel_pxy_set_to_v")
    # void test_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y,
    # size_t number_of_pattern, size_t h_dim,
    # bool display_debug, float value,
    # int64_t pxy_memory_addr);

    set_value = float(math.pi)

    epsilon_memory = torch.rand(
        (number_of_pattern, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    my_kernels = TestKernel()
    my_kernels.test_kernel_pxy_set_to_v(
        dim_x,
        dim_y,
        number_of_pattern,
        h_dim,
        display_debug,
        set_value,
        epsilon_memory.data_ptr(),
    )

    print(f"difference: {torch.abs(epsilon_memory - set_value).max():.4e}")
    print()


def test_kernel_pxy_reciprocal(
    h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
):
    print("test_kernel_pxy_reciprocal")
    # void test_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
    # size_t number_of_pattern, size_t h_dim,
    # bool display_debug, int64_t pxy_memory_addr);

    epsilon_memory = torch.rand(
        (number_of_pattern, dim_x, dim_y),
        dtype=torch.float32,
        device=torch.device("cuda:0"),
    )

    epsilon_memory_copy = epsilon_memory.clone()

    my_kernels = TestKernel()
    my_kernels.test_kernel_pxy_reciprocal(
        dim_x, dim_y, number_of_pattern, h_dim, display_debug, epsilon_memory.data_ptr()
    )

    epsilon_memory_copy = 1.0 / epsilon_memory_copy

    print(f"difference: {torch.abs(epsilon_memory - epsilon_memory_copy).max():.4e}")
    print()


if __name__ == "__main__":
    input_set = 0

    for test_id in range(0, 13):
        print(f"Test-ID: {test_id}")

        number_of_spikes: int = int(1600)
        spike_time: int = int(random.random() * number_of_spikes)

        if input_set == 0:
            h_dim: int = int(32)
            s_dim: int = int(1 * 5 * 5)
            number_of_pattern: int = int(24)
            dim_x: int = int(20)
            dim_y: int = int(20)
            display_debug: bool = bool(False)
        else:
            h_dim = int(10)
            s_dim = int(32 * 20 * 20)
            number_of_pattern = int(24)
            dim_x = int(1)
            dim_y = int(1)
            display_debug = bool(False)

        if test_id == 0:
            test_kernel_pxy_reciprocal(
                h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
            )
        elif test_id == 1:
            test_kernel_pxy_set_to_v(
                h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
            )
        elif test_id == 2:
            test_kernel_pxy_plus_v(
                h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
            )
        elif test_id == 3:
            test_kernel_pxy_times_v(
                h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
            )
        elif test_id == 4:
            test_kernel_pxy_time_pxy(
                h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
            )
        elif test_id == 5:
            test_kernel_phxy_plus_phxy(
                h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
            )
        elif test_id == 6:
            test_kernel_phxy_times_phxy_equals_phxy(
                h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
            )
        elif test_id == 7:
            test_kernel_phxy_times_pxy(
                h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
            )
        elif test_id == 8:
            test_kernel_phxy_plus_pxy(
                h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
            )
        elif test_id == 9:
            test_kernel_phxy_fill_with_h(
                h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
            )
        elif test_id == 10:
            test_kernel_phxy_one_over_sum_into_pxy(
                h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug
            )
        elif test_id == 11:
            test_kernel_phxy_fill_with_spike_selected_w(
                h_dim,
                s_dim,
                number_of_pattern,
                dim_x,
                dim_y,
                display_debug,
                spike_time,
                number_of_spikes,
            )
        elif test_id == 12:
            test_kernel_pxy_times_spike_selected_sxy(
                h_dim,
                s_dim,
                number_of_pattern,
                dim_x,
                dim_y,
                display_debug,
                spike_time,
                number_of_spikes,
            )
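
# Usage note (added): with the compiled PyTestKernel extension importable and a
# CUDA device available as cuda:0, running `python test_kernel.py` executes all
# 13 checks above and prints the maximum absolute difference for each kernel.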