From 09c8b10f234975db6f32497874364dedf7c5cec8 Mon Sep 17 00:00:00 2001 From: David Rotermund <54365609+davrot@users.noreply.github.com> Date: Fri, 13 Jan 2023 21:33:57 +0100 Subject: [PATCH] Add files via upload --- .../CPP_Cuda_new_preview/HDynamicCNNManyIP.cu | 967 ++++++++++++++++++ .../CPP_Cuda_new_preview/HDynamicCNNManyIP.h | 114 +++ network/CPP_Cuda_new_preview/Makefile | 288 ++++++ network/CPP_Cuda_new_preview/MultiApp.cu | 273 +++++ network/CPP_Cuda_new_preview/MultiApp.h | 60 ++ .../PyHDynamicCNNManyIP.cpp | 18 + .../PyHDynamicCNNManyIP.pyi | 20 + network/CPP_Cuda_new_preview/PyMultiApp.cpp | 15 + network/CPP_Cuda_new_preview/PyMultiApp.pyi | 20 + .../PySpikeGeneration2DManyIP.cpp | 19 + .../PySpikeGeneration2DManyIP.pyi | 20 + network/CPP_Cuda_new_preview/PyTestKernel.cpp | 37 + network/CPP_Cuda_new_preview/PyTestKernel.pyi | 30 + .../SpikeGeneration2DManyIP.cu | 261 +++++ .../SpikeGeneration2DManyIP.h | 87 ++ network/CPP_Cuda_new_preview/TestKernel.cu | 329 ++++++ network/CPP_Cuda_new_preview/TestKernel.h | 89 ++ .../approximation_multiplication_function.cpp | 165 +++ .../approximation_multiplication_function.h | 11 + network/CPP_Cuda_new_preview/error_term.cpp | 34 + network/CPP_Cuda_new_preview/error_term.h | 7 + .../kernel_approximation_error_term.cu | 31 + .../kernel_approximation_multiplication.cu | 219 ++++ .../kernel_approximation_multiplication.h | 35 + .../kernel_helper_functions.cu | 17 + .../kernel_helper_functions.h | 7 + .../kernel_phxy_fill_with_h.cu | 64 ++ .../kernel_phxy_fill_with_h.h | 18 + .../kernel_phxy_fill_with_spike_selected_w.cu | 73 ++ .../kernel_phxy_fill_with_spike_selected_w.h | 17 + .../kernel_phxy_one_over_sum_into_pxy.cu | 74 ++ .../kernel_phxy_one_over_sum_into_pxy.h | 17 + .../kernel_phxy_plus_phxy.cu | 51 + .../kernel_phxy_plus_phxy.h | 12 + .../kernel_phxy_plus_pxy.cu | 70 ++ .../kernel_phxy_plus_pxy.h | 16 + .../kernel_phxy_times_phxy_equals_phxy.cu | 53 + .../kernel_phxy_times_phxy_equals_phxy.h | 15 + .../kernel_phxy_times_pxy.cu | 70 ++ .../kernel_phxy_times_pxy.h | 17 + .../CPP_Cuda_new_preview/kernel_pxy_plus_v.cu | 50 + .../CPP_Cuda_new_preview/kernel_pxy_plus_v.h | 12 + .../kernel_pxy_reciprocal.cu | 50 + .../kernel_pxy_reciprocal.h | 12 + .../kernel_pxy_set_to_v.cu | 50 + .../kernel_pxy_set_to_v.h | 13 + .../kernel_pxy_time_pxy.cu | 58 ++ .../kernel_pxy_time_pxy.h | 15 + .../kernel_pxy_times_spike_selected_sxy.cu | 73 ++ .../kernel_pxy_times_spike_selected_sxy.h | 18 + .../kernel_pxy_times_v.cu | 50 + .../CPP_Cuda_new_preview/kernel_pxy_times_v.h | 13 + .../kernel_spike_generation.cu | 97 ++ .../kernel_spike_generation.h | 21 + .../CPP_Cuda_new_preview/pybind11_auto_pyi.py | 23 + network/CPP_Cuda_new_preview/test_kernel.py | 767 ++++++++++++++ 56 files changed, 5062 insertions(+) create mode 100644 network/CPP_Cuda_new_preview/HDynamicCNNManyIP.cu create mode 100644 network/CPP_Cuda_new_preview/HDynamicCNNManyIP.h create mode 100644 network/CPP_Cuda_new_preview/Makefile create mode 100644 network/CPP_Cuda_new_preview/MultiApp.cu create mode 100644 network/CPP_Cuda_new_preview/MultiApp.h create mode 100644 network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.cpp create mode 100644 network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.pyi create mode 100644 network/CPP_Cuda_new_preview/PyMultiApp.cpp create mode 100644 network/CPP_Cuda_new_preview/PyMultiApp.pyi create mode 100644 network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.cpp create mode 100644 network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.pyi create mode 100644 network/CPP_Cuda_new_preview/PyTestKernel.cpp create mode 100644 network/CPP_Cuda_new_preview/PyTestKernel.pyi create mode 100644 network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.cu create mode 100644 network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.h create mode 100644 network/CPP_Cuda_new_preview/TestKernel.cu create mode 100644 network/CPP_Cuda_new_preview/TestKernel.h create mode 100644 network/CPP_Cuda_new_preview/approximation_multiplication_function.cpp create mode 100644 network/CPP_Cuda_new_preview/approximation_multiplication_function.h create mode 100644 network/CPP_Cuda_new_preview/error_term.cpp create mode 100644 network/CPP_Cuda_new_preview/error_term.h create mode 100644 network/CPP_Cuda_new_preview/kernel_approximation_error_term.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_approximation_multiplication.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_approximation_multiplication.h create mode 100644 network/CPP_Cuda_new_preview/kernel_helper_functions.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_helper_functions.h create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_fill_with_h.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_fill_with_h.h create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_fill_with_spike_selected_w.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_fill_with_spike_selected_w.h create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_one_over_sum_into_pxy.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_one_over_sum_into_pxy.h create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_plus_phxy.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_plus_phxy.h create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_plus_pxy.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_plus_pxy.h create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_times_phxy_equals_phxy.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_times_phxy_equals_phxy.h create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_times_pxy.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_phxy_times_pxy.h create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_plus_v.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_plus_v.h create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_reciprocal.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_reciprocal.h create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_set_to_v.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_set_to_v.h create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_time_pxy.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_time_pxy.h create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_times_spike_selected_sxy.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_times_spike_selected_sxy.h create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_times_v.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_pxy_times_v.h create mode 100644 network/CPP_Cuda_new_preview/kernel_spike_generation.cu create mode 100644 network/CPP_Cuda_new_preview/kernel_spike_generation.h create mode 100644 network/CPP_Cuda_new_preview/pybind11_auto_pyi.py create mode 100644 network/CPP_Cuda_new_preview/test_kernel.py diff --git a/network/CPP_Cuda_new_preview/HDynamicCNNManyIP.cu b/network/CPP_Cuda_new_preview/HDynamicCNNManyIP.cu new file mode 100644 index 0000000..ed463ee --- /dev/null +++ b/network/CPP_Cuda_new_preview/HDynamicCNNManyIP.cu @@ -0,0 +1,967 @@ +#include +#include +#include + +#include +#include +#include + +#include "HDynamicCNNManyIP.h" +#include "approximation_multiplication_function.h" +#include "kernel_approximation_multiplication.h" +#include "kernel_phxy_fill_with_h.h" +#include "kernel_phxy_fill_with_spike_selected_w.h" +#include "kernel_phxy_one_over_sum_into_pxy.h" +#include "kernel_phxy_plus_phxy.h" +#include "kernel_phxy_plus_pxy.h" +#include "kernel_phxy_times_phxy_equals_phxy.h" +#include "kernel_phxy_times_pxy.h" +#include "kernel_pxy_plus_v.h" +#include "kernel_pxy_reciprocal.h" +#include "kernel_pxy_set_to_v.h" +#include "kernel_pxy_time_pxy.h" +#include "kernel_pxy_times_spike_selected_sxy.h" +#include "kernel_pxy_times_v.h" + +HDynamicCNNManyIP::HDynamicCNNManyIP(){ + +}; + +HDynamicCNNManyIP::~HDynamicCNNManyIP(){ + +}; + +bool HDynamicCNNManyIP::update_entrypoint( + int64_t h_pointer_addr, int64_t h_dim_0, int64_t h_dim_1, int64_t h_dim_2, + int64_t h_dim_3, int64_t epsilon_xy_pointer_addr, int64_t epsilon_xy_dim_0, + int64_t epsilon_xy_dim_1, int64_t epsilon_xy_dim_2, + int64_t epsilon_t_pointer_addr, int64_t epsilon_t_dim_0, + int64_t weights_pointer_addr, int64_t weights_dim_0, int64_t weights_dim_1, + int64_t input_pointer_addr, int64_t input_dim_0, int64_t input_dim_1, + int64_t input_dim_2, int64_t input_dim_3, int64_t init_vector_pointer_addr, + int64_t init_vector_dim_0, int64_t number_of_processes, + float forgetting_offset, int64_t gpu_tuning_factor + // ,bool approximation_multiplication_enable, uint64_t + // number_of_frac_bits, bool approximation_enable, + // uint64_t number_of_trunc_bits +) { + bool approximation_multiplication_enable = false; + uint64_t number_of_frac_bits = 1; + bool approximation_enable = false; + uint64_t number_of_trunc_bits = false; + + uint32_t ap_mask = static_cast(pow(2, number_of_trunc_bits)) - 1; + + size_t number_of_pattern = input_dim_0; + + size_t h_dim = init_vector_dim_0; + float* h_init_ptr = (float*)init_vector_pointer_addr; + assert((h_init_ptr != nullptr)); + assert((h_dim > 0)); + + float* h_pointer = (float*)h_pointer_addr; + assert((h_pointer != nullptr)); + assert((h_dim_0 > 0)); + assert((h_dim_1 > 0)); + assert((h_dim_2 > 0)); + assert((h_dim_3 > 0)); + + size_t h_dim_c0 = h_dim_1 * h_dim_2 * h_dim_3; + size_t h_dim_c1 = h_dim_2 * h_dim_3; + size_t h_dim_c2 = h_dim_3; + + float* epsilon_xy_pointer = (float*)epsilon_xy_pointer_addr; + assert((epsilon_xy_pointer != nullptr)); + assert((epsilon_xy_dim_0 > 0)); + assert((epsilon_xy_dim_1 > 0)); + + size_t epsilon_xy_dim_c0 = epsilon_xy_dim_2 * epsilon_xy_dim_1; + size_t epsilon_xy_dim_c1 = epsilon_xy_dim_2; + + float* epsilon_t_pointer = (float*)epsilon_t_pointer_addr; + assert((epsilon_t_pointer != nullptr)); + assert((epsilon_t_dim_0 > 0)); + + float* weights_pointer = (float*)weights_pointer_addr; + assert((weights_pointer != nullptr)); + assert((weights_dim_0 > 0)); + assert((weights_dim_1 > 0)); + + size_t weights_dim_c0 = weights_dim_1; + + int64_t* input_pointer = (int64_t*)input_pointer_addr; + assert((input_pointer != nullptr)); + assert((input_dim_0 > 0)); + assert((input_dim_1 > 0)); + assert((input_dim_2 > 0)); + assert((input_dim_3 > 0)); + + size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3; + size_t input_dim_c1 = input_dim_2 * input_dim_3; + size_t input_dim_c2 = input_dim_3; + + assert((h_dim == weights_dim_1)); + size_t number_of_spikes = input_dim_1; + size_t dim_x = input_dim_2; + size_t dim_y = input_dim_3; + + float forgetting_offset_local = forgetting_offset / static_cast(h_dim); + + // -------------------- + if (number_of_processes > 0) { + omp_set_num_threads(number_of_processes); + + size_t pattern_id; +#pragma omp parallel for + for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++) { + update(h_init_ptr, h_pointer, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + epsilon_xy_pointer, epsilon_xy_dim_c0, epsilon_xy_dim_c1, + epsilon_t_pointer, weights_pointer, weights_dim_c0, input_pointer, + input_dim_c0, input_dim_c1, input_dim_c2, number_of_spikes, dim_x, + dim_y, forgetting_offset, forgetting_offset_local, pattern_id, + approximation_multiplication_enable, number_of_frac_bits, + approximation_enable, number_of_trunc_bits, ap_mask); + } + } else { + gpu_update(h_init_ptr, h_pointer, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + epsilon_xy_pointer, epsilon_xy_dim_c0, epsilon_xy_dim_c1, + epsilon_t_pointer, weights_pointer, weights_dim_c0, + input_pointer, input_dim_c0, input_dim_c1, input_dim_c2, + number_of_spikes, dim_x, dim_y, forgetting_offset, + forgetting_offset_local, number_of_pattern, gpu_tuning_factor, + approximation_multiplication_enable, number_of_frac_bits, + approximation_enable, number_of_trunc_bits, ap_mask); + } + return true; +}; + +bool HDynamicCNNManyIP::update( + float* h_init_ptr, float* h_pointer, size_t h_dim_c0, size_t h_dim_c1, + size_t h_dim_c2, size_t h_dim, float* epsilon_xy_pointer, + size_t epsilon_xy_dim_c0, size_t epsilon_xy_dim_c1, + float* epsilon_t_pointer, float* weights_pointer, size_t weights_dim_c0, + int64_t* input_pointer, size_t input_dim_c0, size_t input_dim_c1, + size_t input_dim_c2, size_t number_of_spikes, size_t dim_x, size_t dim_y, + float forgetting_offset, float forgetting_offset_local, size_t pattern_id, + bool approximation_multiplication_enable, uint64_t number_of_frac_bits, + bool approximation_enable, uint64_t number_of_trunc_bits, + uint32_t ap_mask) { + float* h_ptr; + float* epsilon_xy_ptr; + int64_t* input_ptr; + + size_t counter_x; + size_t counter_y; + + for (counter_x = 0; counter_x < dim_x; counter_x++) { + for (counter_y = 0; counter_y < dim_y; counter_y++) { + epsilon_xy_ptr = + epsilon_xy_pointer + counter_x * epsilon_xy_dim_c1 + counter_y; + + h_ptr = + h_pointer + pattern_id * h_dim_c0 + counter_x * h_dim_c2 + counter_y; + + input_ptr = input_pointer + pattern_id * input_dim_c0 + + counter_x * input_dim_c2 + counter_y; + + if (approximation_multiplication_enable == false) { + update_one_ip(h_init_ptr, h_ptr, h_dim_c1, h_dim, weights_pointer, + weights_dim_c0, input_ptr, input_dim_c1, epsilon_xy_ptr, + epsilon_xy_dim_c0, epsilon_t_pointer, number_of_spikes, + forgetting_offset, forgetting_offset_local); + } else { + update_one_ip_approx( + h_init_ptr, h_ptr, h_dim_c1, h_dim, weights_pointer, weights_dim_c0, + input_ptr, input_dim_c1, epsilon_xy_ptr, epsilon_xy_dim_c0, + epsilon_t_pointer, number_of_spikes, forgetting_offset, + forgetting_offset_local, approximation_multiplication_enable, + number_of_frac_bits, approximation_enable, number_of_trunc_bits, + ap_mask); + } + } + } + + return true; +}; + +void HDynamicCNNManyIP::update_one_ip_approx( + float* h_init_ptr, float* h_pointer, size_t h_dim_c1, size_t h_dim, + float* weights_pointer, size_t weights_dim_c0, int64_t* input_pointer, + size_t input_dim_c1, float* epsilon_xy_pointer, size_t epsilon_xy_dim_c0, + float* epsilon_t_pointer, size_t number_of_spikes, float forgetting_offset, + float forgetting_offset_local, bool approximation_multiplication_enable, + uint64_t number_of_frac_bits, bool approximation_enable, + uint64_t number_of_trunc_bits, uint32_t ap_mask) { + float* h_temp = new float[h_dim]; + float* h_subsegment = new float[h_dim]; + + memcpy(h_subsegment, h_init_ptr, sizeof(float) * h_dim); + + size_t counter_spike; + size_t counter; + + float h_temp_sum; + float temp_value; + + float epsilon_subsegment; + float epsilon_scale = 1.0; + + int64_t* spike; + float* w_ptr; + + // --------------- + // Approx... + + uint64_t pattern_size = h_dim; + + std::vector ap_h_vector; + ap_h_vector.resize(pattern_size); + float* ap_h_ptr = ap_h_vector.data(); + + std::vector ap_x_vector; + ap_x_vector.resize(pattern_size); + uint32_t* ap_x_ptr = ap_x_vector.data(); + + std::vector ap_y_vector; + ap_y_vector.resize(pattern_size); + uint32_t* ap_y_ptr = ap_y_vector.data(); + + std::vector ap_x_exponent_vector; + ap_x_exponent_vector.resize(pattern_size); + uint32_t* ap_x_exponent_ptr = ap_x_exponent_vector.data(); + + std::vector ap_y_exponent_vector; + ap_y_exponent_vector.resize(pattern_size); + uint32_t* ap_y_exponent_ptr = ap_y_exponent_vector.data(); + + std::vector ap_h_exponent_vector; + ap_h_exponent_vector.resize(pattern_size); + uint32_t* ap_h_exponent_ptr = ap_h_exponent_vector.data(); + + std::vector ap_res_vector; + ap_res_vector.resize(pattern_size); + uint64_t* ap_res_ptr = ap_res_vector.data(); + + std::vector sign_temp_vector; + sign_temp_vector.resize(pattern_size); + uint32_t* sign_temp_ptr = sign_temp_vector.data(); + + // -------------- + + for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++) { + if (epsilon_scale > 1E10) { + temp_value = 1.0 / epsilon_scale; + +#pragma omp simd + for (counter = 0; counter < h_dim; counter++) { + h_subsegment[counter] *= temp_value; + } + + epsilon_scale = 1.0; + } + + spike = input_pointer + counter_spike * input_dim_c1; + + if (*spike >= 0) { + epsilon_subsegment = epsilon_xy_pointer[*spike * epsilon_xy_dim_c0] * + epsilon_t_pointer[counter_spike]; + + w_ptr = weights_pointer + *spike * weights_dim_c0; + + memcpy(h_temp, h_subsegment, sizeof(float) * h_dim); + + approximation_multiplication_function( + ap_h_ptr, w_ptr, pattern_size, number_of_trunc_bits, + number_of_frac_bits, ap_x_ptr, ap_y_ptr, ap_x_exponent_ptr, + ap_y_exponent_ptr, ap_h_exponent_ptr, ap_mask, ap_res_ptr, + sign_temp_ptr, approximation_enable); + // -------------------------- + + h_temp_sum = 0.0; +#pragma omp simd reduction(+ : h_temp_sum) + for (counter = 0; counter < h_dim; counter++) { + h_temp_sum += h_temp[counter]; + } + + if (h_temp_sum > 1E-10) { + temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum; + +#pragma omp simd + for (counter = 0; counter < h_dim; counter++) { + h_temp[counter] *= temp_value; + } + +#pragma omp simd + for (counter = 0; counter < h_dim; counter++) { + h_subsegment[counter] += h_temp[counter]; + } + + if (forgetting_offset_local > 0.0) { + temp_value = + epsilon_scale * epsilon_subsegment * forgetting_offset_local; + +#pragma omp simd + for (counter = 0; counter < h_dim; counter++) { + h_subsegment[counter] += temp_value; + } + + epsilon_scale *= 1.0 + epsilon_subsegment * (1.0 + forgetting_offset); + } else { + epsilon_scale *= 1.0 + epsilon_subsegment * 1.0; + } + } + } + } + + temp_value = 1.0 / epsilon_scale; +#pragma omp simd + for (counter = 0; counter < h_dim; counter++) { + h_pointer[counter * h_dim_c1] = h_subsegment[counter] * temp_value; + } + + delete[] h_temp; + delete[] h_subsegment; + + return; +}; + +void HDynamicCNNManyIP::update_one_ip( + float* h_init_ptr, float* h_pointer, size_t h_dim_c1, size_t h_dim, + float* weights_pointer, size_t weights_dim_c0, int64_t* input_pointer, + size_t input_dim_c1, float* epsilon_xy_pointer, size_t epsilon_xy_dim_c0, + float* epsilon_t_pointer, size_t number_of_spikes, float forgetting_offset, + float forgetting_offset_local) { + float* h_temp = new float[h_dim]; + float* h_subsegment = new float[h_dim]; + + memcpy(h_subsegment, h_init_ptr, sizeof(float) * h_dim); + + size_t counter_spike; + size_t counter; + + float h_temp_sum; + float temp_value; + + float epsilon_subsegment; + float epsilon_scale = 1.0; + + int64_t* spike; + float* w_ptr; + + // -------------- + + for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++) { + if (epsilon_scale > 1E10) { + temp_value = 1.0 / epsilon_scale; + +#pragma omp simd + for (counter = 0; counter < h_dim; counter++) { + h_subsegment[counter] *= temp_value; + } + + epsilon_scale = 1.0; + } + + spike = input_pointer + counter_spike * input_dim_c1; + + if (*spike >= 0) { + epsilon_subsegment = epsilon_xy_pointer[*spike * epsilon_xy_dim_c0] * + epsilon_t_pointer[counter_spike]; + + w_ptr = weights_pointer + *spike * weights_dim_c0; + + memcpy(h_temp, h_subsegment, sizeof(float) * h_dim); + +#pragma omp simd + for (counter = 0; counter < h_dim; counter++) { + h_temp[counter] *= w_ptr[counter]; + } + + h_temp_sum = 0.0; +#pragma omp simd reduction(+ : h_temp_sum) + for (counter = 0; counter < h_dim; counter++) { + h_temp_sum += h_temp[counter]; + } + + if (h_temp_sum > 1E-10) { + temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum; + +#pragma omp simd + for (counter = 0; counter < h_dim; counter++) { + h_temp[counter] *= temp_value; + } + +#pragma omp simd + for (counter = 0; counter < h_dim; counter++) { + h_subsegment[counter] += h_temp[counter]; + } + + if (forgetting_offset_local > 0.0) { + temp_value = + epsilon_scale * epsilon_subsegment * forgetting_offset_local; + +#pragma omp simd + for (counter = 0; counter < h_dim; counter++) { + h_subsegment[counter] += temp_value; + } + + epsilon_scale *= 1.0 + epsilon_subsegment * (1.0 + forgetting_offset); + } else { + epsilon_scale *= 1.0 + epsilon_subsegment * 1.0; + } + } + } + } + + temp_value = 1.0 / epsilon_scale; +#pragma omp simd + for (counter = 0; counter < h_dim; counter++) { + h_pointer[counter * h_dim_c1] = h_subsegment[counter] * temp_value; + } + + delete[] h_temp; + delete[] h_subsegment; + + return; +}; + +// ------------------------------------------------ + +void HDynamicCNNManyIP::gpu_occupancy_measure(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim) { + grid_and_thread_calculated = false; + assert((dim_x < 65535)); + assert((dim_y < 65535)); + + grid_and_thread_settings.resize(14); + + occupancy_kernel_phxy_plus_phxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY], display_debug); + + occupancy_kernel_pxy_plus_v(dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V], + display_debug); + + occupancy_kernel_pxy_times_v(dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V], + display_debug); + + occupancy_kernel_phxy_fill_with_h( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H], display_debug); + + occupancy_kernel_phxy_plus_pxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY], display_debug); + + occupancy_kernel_pxy_reciprocal( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL], display_debug); + + occupancy_kernel_phxy_fill_with_spike_selected_w( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W], + display_debug); + + occupancy_kernel_phxy_times_phxy_equals_phxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY], + display_debug); + + occupancy_kernel_pxy_set_to_v( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V], display_debug); + + occupancy_kernel_phxy_one_over_sum_into_pxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY], + display_debug); + + occupancy_kernel_phxy_times_pxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY], display_debug); + + occupancy_kernel_pxy_time_pxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY], display_debug); + + occupancy_kernel_approximation_pure_multiplication( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_APPROXIMATION_MULTIPLICATION], + display_debug); + + occupancy_kernel_pxy_times_spike_selected_sxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY], + display_debug); + + grid_and_thread_calculated = true; + return; +}; + +void HDynamicCNNManyIP::gpu_occupancy_export( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + int64_t setting_memory_addr, size_t setting_dim_0, size_t setting_dim_1) { + int64_t* setting_memory = (int64_t*)setting_memory_addr; + + assert((setting_memory != nullptr)); + assert((setting_dim_1 == H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS)); + + gpu_occupancy_measure(dim_x, dim_y, number_of_pattern, h_dim); + assert((grid_and_thread_calculated == true)); + + assert((setting_dim_0 == grid_and_thread_settings.size())); + + for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) { + for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) { + setting_memory[counter_0 * setting_dim_1 + counter_1] = + grid_and_thread_settings[counter_0][counter_1]; + } + } +}; + +void HDynamicCNNManyIP::gpu_occupancy_import(int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1) { + grid_and_thread_calculated = false; + + int64_t* setting_memory = (int64_t*)setting_memory_addr; + + assert((setting_memory != nullptr)); + assert((setting_dim_1 == H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS)); + assert((setting_dim_0 == H_DYNAMIC_NUMBER_OF_KERNELS)); + + grid_and_thread_settings.resize(H_DYNAMIC_NUMBER_OF_KERNELS); + + for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) { + grid_and_thread_settings[counter_0].resize( + H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS); + + for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) { + grid_and_thread_settings[counter_0][counter_1] = + setting_memory[counter_0 * setting_dim_1 + counter_1]; + } + } + + grid_and_thread_calculated = true; +}; + +bool HDynamicCNNManyIP::gpu_update( + float* h_init_ptr, float* h_pointer, size_t h_dim_c0, size_t h_dim_c1, + size_t h_dim_c2, size_t h_dim, float* epsilon_xy_pointer, + size_t epsilon_xy_dim_c0, size_t epsilon_xy_dim_c1, + float* epsilon_t_pointer, float* weights_pointer, size_t weights_dim_c0, + int64_t* input_pointer, size_t input_dim_c0, size_t input_dim_c1, + size_t input_dim_c2, size_t number_of_spikes, size_t dim_x, size_t dim_y, + float forgetting_offset, float forgetting_offset_local, + size_t number_of_pattern, size_t gpu_tuning_factor, + bool approximation_multiplication_enable, uint64_t number_of_frac_bits, + bool approximation_enable, uint64_t number_of_trunc_bits, + uint32_t ap_mask) { + if (grid_and_thread_calculated == false) { + gpu_occupancy_measure(dim_x, dim_y, number_of_pattern, h_dim); + } + assert((grid_and_thread_calculated == true)); + + cudaError_t status; + + size_t h_sum_dim_c0 = dim_x * dim_y; + size_t h_sum_dim_c1 = dim_y; + + size_t phxy_block_dim_c0 = h_dim * dim_x * dim_y; + size_t phxy_block_dim_c1 = dim_x * dim_y; + size_t phxy_block_dim_c2 = dim_y; + + size_t pxy_block_dim_c0 = dim_x * dim_y; + size_t pxy_block_dim_c1 = dim_y; + + float* w_memory = nullptr; + status = cudaMalloc((void**)&w_memory, number_of_pattern * h_dim * dim_x * + dim_y * sizeof(float)); + assert((status == cudaSuccess)); + + float* h_temp_memory = nullptr; + status = + cudaMalloc((void**)&h_temp_memory, + number_of_pattern * h_dim * dim_x * dim_y * sizeof(float)); + assert((status == cudaSuccess)); + + float* h_sum_memory = nullptr; + status = cudaMalloc((void**)&h_sum_memory, + number_of_pattern * dim_x * dim_y * sizeof(float)); + assert((status == cudaSuccess)); + + float* epsilon_subsegment_memory = nullptr; + status = cudaMalloc((void**)&epsilon_subsegment_memory, + number_of_pattern * dim_x * dim_y * sizeof(float)); + assert((status == cudaSuccess)); + + float* epsilon_scale_memory = nullptr; + status = cudaMalloc((void**)&epsilon_scale_memory, + number_of_pattern * dim_x * dim_y * sizeof(float)); + assert((status == cudaSuccess)); + + float* forget_memory = nullptr; + status = cudaMalloc((void**)&forget_memory, + number_of_pattern * dim_x * dim_y * sizeof(float)); + assert((status == cudaSuccess)); + + // --- + + // Initialize h + kernel_phxy_fill_with_h<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][0], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][1], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][3], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][4], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][5])>>>( + h_init_ptr, h_pointer, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + phxy_block_dim_c0, phxy_block_dim_c1, phxy_block_dim_c2, + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // Set epsilon memory scale to 1.0 + kernel_pxy_set_to_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][5])>>>( + epsilon_scale_memory, 1.0, + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + for (size_t counter_spike = 0; counter_spike < number_of_spikes; + counter_spike++) { + // Get epsilon_t from gpu memory + float epsilon_t; + status = cudaMemcpy(&epsilon_t, &epsilon_t_pointer[counter_spike], + sizeof(float), cudaMemcpyDeviceToHost); + assert((status == cudaSuccess)); + // Set epsilon memory subsegment to epsilon(t) + kernel_pxy_set_to_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][5])>>>( + epsilon_subsegment_memory, epsilon_t, + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // Set epsilon memory subsegment to forgetting_offset_local + kernel_pxy_set_to_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][5])>>>( + forget_memory, forgetting_offset_local, + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // if (*spike >= 0) { + // epsilon_subsegment = *epsilon_xy_pointer[*spike * + // epsilon_xy_dim_c0] + kernel_pxy_times_spike_selected_sxy<<< + dim3( + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY] + [2]), + dim3( + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY] + [5])>>>( + epsilon_subsegment_memory, epsilon_xy_pointer, input_pointer, + counter_spike, input_dim_c0, input_dim_c1, input_dim_c2, + epsilon_xy_dim_c0, epsilon_xy_dim_c1, epsilon_xy_dim_c0, + epsilon_xy_dim_c1, pxy_block_dim_c0, pxy_block_dim_c1, + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // Get the weight vectors according the spikes + kernel_phxy_fill_with_spike_selected_w<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [0], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [1], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [3], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [4], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [5])>>>( + w_memory, weights_pointer, input_pointer, counter_spike, weights_dim_c0, + input_dim_c0, input_dim_c1, input_dim_c2, h_dim_c0, h_dim_c1, h_dim_c2, + h_dim, phxy_block_dim_c0, phxy_block_dim_c1, phxy_block_dim_c2, + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // h_temp = h * w + if (approximation_multiplication_enable == false) { + kernel_phxy_times_phxy_equals_phxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [0], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [1], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [3], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [4], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [5])>>>( + h_pointer, w_memory, h_temp_memory, + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY][6]); + + } else { + kernel_approximation_pure_multiplication<<< + dim3(grid_and_thread_settings[ID_KERNEL_APPROXIMATION_MULTIPLICATION] + [0], + grid_and_thread_settings[ID_KERNEL_APPROXIMATION_MULTIPLICATION] + [1], + grid_and_thread_settings[ID_KERNEL_APPROXIMATION_MULTIPLICATION] + [2]), + dim3(grid_and_thread_settings[ID_KERNEL_APPROXIMATION_MULTIPLICATION] + [3], + grid_and_thread_settings[ID_KERNEL_APPROXIMATION_MULTIPLICATION] + [4], + grid_and_thread_settings[ID_KERNEL_APPROXIMATION_MULTIPLICATION] + [5])>>>( + h_pointer, w_memory, h_temp_memory, number_of_frac_bits, + approximation_enable, number_of_trunc_bits, ap_mask, + grid_and_thread_settings[ID_KERNEL_APPROXIMATION_MULTIPLICATION][6]); + } + + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // 1 / sum h_temp + kernel_phxy_one_over_sum_into_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][0], + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][1], + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][3], + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][4], + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY] + [5])>>>( + h_temp_memory, h_sum_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + h_sum_dim_c0, h_sum_dim_c1, pxy_block_dim_c0, pxy_block_dim_c1, + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // epsilon_scale / sum h_temp + kernel_pxy_time_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][5])>>>( + h_sum_memory, epsilon_scale_memory, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // epsilon_subsegment * epsilon_scale / sum h_temp + kernel_pxy_time_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][5])>>>( + h_sum_memory, epsilon_subsegment_memory, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // epsilon_scale * forget_memory which contains forgetting_offset_local + kernel_pxy_time_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][5])>>>( + forget_memory, epsilon_scale_memory, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // delta_forget = epsilon_subsegment * epsilon_scale * forget_memory + kernel_pxy_time_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][5])>>>( + forget_memory, epsilon_subsegment_memory, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // delta_h = h_temp_memory * epsilon_subsegment * epsilon_scale / sum h + kernel_phxy_times_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][0], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][1], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][3], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][4], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][5])>>>( + h_temp_memory, h_sum_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + h_sum_dim_c0, h_sum_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1, + phxy_block_dim_c2, + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // h + delta_h + kernel_phxy_plus_phxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][0], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][1], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][3], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][4], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][5])>>>( + h_pointer, h_temp_memory, + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // h + delta_h + delta_forget + kernel_phxy_plus_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][0], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][1], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][3], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][4], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][5])>>>( + h_pointer, forget_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + h_sum_dim_c0, h_sum_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1, + phxy_block_dim_c2, + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + kernel_pxy_times_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][5])>>>( + epsilon_subsegment_memory, (1.0 + forgetting_offset), + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + kernel_pxy_plus_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][5])>>>( + epsilon_subsegment_memory, 1.0, + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // epsilon_scale * epsilon_subsegment + kernel_pxy_time_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][5])>>>( + epsilon_scale_memory, epsilon_subsegment_memory, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + if (((counter_spike > 0) && (counter_spike % 1000 == 0)) || + (counter_spike + 1 == number_of_spikes)) { + kernel_pxy_reciprocal<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][0], + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][1], + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][3], + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][4], + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][5])>>>( + epsilon_scale_memory, + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + kernel_phxy_times_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][0], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][1], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][3], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][4], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][5])>>>( + h_pointer, epsilon_scale_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + h_sum_dim_c0, h_sum_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1, + phxy_block_dim_c2, + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + // Set epsilon memory scale to 1.0 + kernel_pxy_set_to_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][5])>>>( + epsilon_scale_memory, 1.0, + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + } + } + + // ------------ + + status = cudaFree(w_memory); + assert((status == cudaSuccess)); + + status = cudaFree(h_temp_memory); + assert((status == cudaSuccess)); + + status = cudaFree(h_sum_memory); + assert((status == cudaSuccess)); + + status = cudaFree(epsilon_subsegment_memory); + assert((status == cudaSuccess)); + + status = cudaFree(epsilon_scale_memory); + assert((status == cudaSuccess)); + + status = cudaFree(forget_memory); + assert((status == cudaSuccess)); + + return true; +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/HDynamicCNNManyIP.h b/network/CPP_Cuda_new_preview/HDynamicCNNManyIP.h new file mode 100644 index 0000000..ff7313a --- /dev/null +++ b/network/CPP_Cuda_new_preview/HDynamicCNNManyIP.h @@ -0,0 +1,114 @@ +#ifndef HDYNAMICCNNMANYIP +#define HDYNAMICCNNMANYIP + +#include +#include + +#include +#include +#include + +#define ID_KERNEL_PHXY_PLUS_PHXY 0 +#define ID_KERNEL_PXY_PLUS_V 1 +#define ID_KERNEL_PXY_TIMES_V 2 +#define ID_KERNEL_PHXY_FILL_WITH_H 3 +#define ID_KERNEL_PHXY_PLUS_PXY 4 +#define ID_KERNEL_PXY_RECIPROCAL 5 +#define ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W 6 +#define ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY 7 +#define ID_KERNEL_PXY_SET_TO_V 8 +#define ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY 9 +#define ID_KERNEL_PHXY_TIMES_PXY 10 +#define ID_KERNEL_PXY_TIME_PXY 11 +#define ID_KERNEL_APPROXIMATION_MULTIPLICATION 12 +#define ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY 13 + +#define H_DYNAMIC_NUMBER_OF_KERNELS 14 +#define H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS 7 + +class HDynamicCNNManyIP { + public: + HDynamicCNNManyIP(); + ~HDynamicCNNManyIP(); + + bool update_entrypoint(int64_t h_pointer_addr, int64_t h_dim_0, + int64_t h_dim_1, int64_t h_dim_2, int64_t h_dim_3, + int64_t epsilon_xy_pointer_addr, + int64_t epsilon_xy_dim_0, int64_t epsilon_xy_dim_1, + int64_t epsilon_xy_dim_2, + int64_t epsilon_t_pointer_addr, + int64_t epsilon_t_dim_0, int64_t weights_pointer_addr, + int64_t weights_dim_0, int64_t weights_dim_1, + int64_t input_pointer_addr, int64_t input_dim_0, + int64_t input_dim_1, int64_t input_dim_2, + int64_t input_dim_3, int64_t init_vector_pointer_addr, + int64_t init_vector_dim_0, int64_t number_of_processes, + float forgetting_offset, int64_t gpu_tuning_factor + // ,bool approximation_multiplication_enable, uint64_t + // number_of_frac_bits, bool approximation_enable, + // uint64_t number_of_trunc_bits + ); + + void gpu_occupancy_export(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + int64_t setting_memory_addr, size_t setting_dim_0, + size_t setting_dim_1); + + void gpu_occupancy_import(int64_t setting_memory_addr, size_t setting_dim_0, + size_t setting_dim_1); + + private: + bool update(float* h_init_ptr, float* h_pointer, size_t h_dim_c0, + size_t h_dim_c1, size_t h_dim_c2, size_t h_dim, + float* epsilon_xy_pointer, size_t epsilon_xy_dim_c0, + size_t epsilon_xy_dim_c1, float* epsilon_t_pointer, + float* weights_pointer, size_t weights_dim_c0, + int64_t* input_pointer, size_t input_dim_c0, size_t input_dim_c1, + size_t input_dim_c2, size_t number_of_spikes, size_t dim_x, + size_t dim_y, float forgetting_offset, + float forgetting_offset_local, size_t pattern_id, + bool approximation_multiplication_enable, + uint64_t number_of_frac_bits, bool approximation_enable, + uint64_t number_of_trunc_bits, uint32_t ap_mask); + + void update_one_ip(float* h_init_ptr, float* h_pointer, size_t h_dim_c1, + size_t h_dim, float* weights_pointer, + size_t weights_dim_c0, int64_t* input_pointer, + size_t input_dim_c1, float* epsilon_xy_pointer, + size_t epsilon_xy_dim_c0, float* epsilon_t_pointer, + size_t number_of_spikes, float forgetting_offset, + float forgetting_offset_local); + + void update_one_ip_approx( + float* h_init_ptr, float* h_pointer, size_t h_dim_c1, size_t h_dim, + float* weights_pointer, size_t weights_dim_c0, int64_t* input_pointer, + size_t input_dim_c1, float* epsilon_xy_pointer, size_t epsilon_xy_dim_c0, + float* epsilon_t_pointer, size_t number_of_spikes, + float forgetting_offset, float forgetting_offset_local, + bool approximation_multiplication_enable, uint64_t number_of_frac_bits, + bool approximation_enable, uint64_t number_of_trunc_bits, + uint32_t ap_mask); + + bool gpu_update(float* h_init_ptr, float* h_pointer, size_t h_dim_c0, + size_t h_dim_c1, size_t h_dim_c2, size_t h_dim, + float* epsilon_xy_pointer, size_t epsilon_xy_dim_c0, + size_t epsilon_xy_dim_c1, float* epsilon_t_pointer, + float* weights_pointer, size_t weights_dim_c0, + int64_t* input_pointer, size_t input_dim_c0, + size_t input_dim_c1, size_t input_dim_c2, + size_t number_of_spikes, size_t dim_x, size_t dim_y, + float forgetting_offset, float forgetting_offset_local, + size_t number_of_pattern, size_t gpu_tuning_factor, + bool approximation_multiplication_enable, + uint64_t number_of_frac_bits, bool approximation_enable, + uint64_t number_of_trunc_bits, uint32_t ap_mask); + + void gpu_occupancy_measure(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim); + + bool grid_and_thread_calculated = false; + std::vector> grid_and_thread_settings; + bool display_debug = false; +}; + +#endif /* HDYNAMICCNNMANYIP */ diff --git a/network/CPP_Cuda_new_preview/Makefile b/network/CPP_Cuda_new_preview/Makefile new file mode 100644 index 0000000..9a22dcc --- /dev/null +++ b/network/CPP_Cuda_new_preview/Makefile @@ -0,0 +1,288 @@ +# Change to your python bin directory (tested with Python 3.10.4) +PYBIN=~/P3.10GPU/bin/ +NVCC=/usr/local/cuda-12/bin/nvcc +CC=/usr/lib64/ccache/clang++ + +PYBIND11INCLUDE=`$(PYBIN)python3 -m pybind11 --includes` +PARAMETERS_O=--allow-unsupported-compiler \ + -O3 -std=c++14 \ + $(PYBIND11INCLUDE) \ + -ccbin=$(CC) \ + -Xcompiler "-fPIC -Wall -fopenmp=libomp" \ + --gpu-architecture=sm_86 \ + --generate-line-info + +PARAMETERS_Linker=--allow-unsupported-compiler \ + -Xcompiler "-shared -lm -lomp -lstdc++ -Wall" \ + --gpu-architecture=sm_86 \ + --generate-line-info + +PYPOSTFIX=`$(PYBIN)python3-config --extension-suffix` + + +all: PyHDynamicCNNManyIP \ + PySpikeGeneration2DManyIP \ + PyMultiApp \ + PyTestKernel + +####################### + +HDynamicCNNManyIP.o: \ + HDynamicCNNManyIP.h \ + HDynamicCNNManyIP.cu \ + kernel_helper_functions.h \ + kernel_phxy_plus_pxy.h \ + kernel_pxy_plus_v.h \ + kernel_pxy_time_pxy.h \ + kernel_phxy_fill_with_h.h \ + kernel_phxy_times_phxy_equals_phxy.h \ + kernel_pxy_reciprocal.h \ + kernel_pxy_times_v.h \ + kernel_phxy_plus_phxy.h \ + kernel_phxy_times_pxy.h \ + kernel_pxy_set_to_v.h \ + kernel_phxy_one_over_sum_into_pxy.h \ + approximation_multiplication_function.h \ + kernel_approximation_multiplication.h \ + kernel_pxy_times_spike_selected_sxy.h \ + kernel_phxy_fill_with_spike_selected_w.h + $(NVCC) $(PARAMETERS_O) -c HDynamicCNNManyIP.cu -o HDynamicCNNManyIP.o + +PyHDynamicCNNManyIP.o: HDynamicCNNManyIP.h PyHDynamicCNNManyIP.cpp + $(NVCC) $(PARAMETERS_O) -c PyHDynamicCNNManyIP.cpp -o PyHDynamicCNNManyIP.o + +PyHDynamicCNNManyIP: \ + HDynamicCNNManyIP.o \ + PyHDynamicCNNManyIP.o \ + kernel_pxy_plus_v.o \ + kernel_pxy_set_to_v.o \ + kernel_pxy_reciprocal.o \ + kernel_pxy_time_pxy.o \ + kernel_pxy_times_v.o \ + kernel_phxy_times_phxy_equals_phxy.o \ + kernel_phxy_plus_phxy.o \ + kernel_phxy_plus_pxy.o \ + kernel_phxy_times_pxy.o \ + kernel_phxy_fill_with_h.o\ + kernel_helper_functions.o\ + kernel_phxy_one_over_sum_into_pxy.o \ + approximation_multiplication_function.o \ + error_term.o \ + kernel_approximation_multiplication.o \ + kernel_pxy_times_spike_selected_sxy.o \ + kernel_phxy_fill_with_spike_selected_w.o + $(NVCC) $(PARAMETERS_Linker) -o PyHDynamicCNNManyIP \ + HDynamicCNNManyIP.o \ + PyHDynamicCNNManyIP.o \ + kernel_pxy_plus_v.o \ + kernel_pxy_set_to_v.o \ + kernel_pxy_reciprocal.o \ + kernel_pxy_time_pxy.o \ + kernel_pxy_times_v.o \ + kernel_phxy_times_phxy_equals_phxy.o \ + kernel_phxy_plus_phxy.o \ + kernel_phxy_plus_pxy.o \ + kernel_phxy_times_pxy.o \ + kernel_phxy_fill_with_h.o \ + kernel_helper_functions.o \ + kernel_phxy_one_over_sum_into_pxy.o \ + approximation_multiplication_function.o \ + error_term.o \ + kernel_approximation_multiplication.o \ + kernel_pxy_times_spike_selected_sxy.o \ + kernel_phxy_fill_with_spike_selected_w.o + cp PyHDynamicCNNManyIP PyHDynamicCNNManyIP$(PYPOSTFIX) + $(PYBIN)python3 pybind11_auto_pyi.py + +####################### + +SpikeGeneration2DManyIP.o: \ + SpikeGeneration2DManyIP.h \ + SpikeGeneration2DManyIP.cu \ + kernel_spike_generation.h + $(NVCC) $(PARAMETERS_O) -c SpikeGeneration2DManyIP.cu -o SpikeGeneration2DManyIP.o + +PySpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h PySpikeGeneration2DManyIP.cpp + $(NVCC) $(PARAMETERS_O) -c PySpikeGeneration2DManyIP.cpp -o PySpikeGeneration2DManyIP.o + +PySpikeGeneration2DManyIP: \ + SpikeGeneration2DManyIP.o \ + PySpikeGeneration2DManyIP.o \ + kernel_helper_functions.o \ + kernel_spike_generation.o + $(NVCC) $(PARAMETERS_Linker) -o PySpikeGeneration2DManyIP \ + SpikeGeneration2DManyIP.o \ + PySpikeGeneration2DManyIP.o \ + kernel_helper_functions.o \ + kernel_spike_generation.o + cp PySpikeGeneration2DManyIP PySpikeGeneration2DManyIP$(PYPOSTFIX) + $(PYBIN)python3 pybind11_auto_pyi.py + + +####################### + +MultiApp.o: \ + MultiApp.h \ + MultiApp.cu \ + approximation_multiplication_function.h \ + kernel_approximation_multiplication.h\ + error_term.cpp + $(NVCC) $(PARAMETERS_O) -c MultiApp.cu -o MultiApp.o + +PyMultiApp.o: MultiApp.h PyMultiApp.cpp + $(NVCC) $(PARAMETERS_O) -c PyMultiApp.cpp -o PyMultiApp.o + +PyMultiApp: \ + MultiApp.o\ + PyMultiApp.o \ + approximation_multiplication_function.o \ + error_term.o \ + kernel_approximation_multiplication.o \ + kernel_helper_functions.o + $(NVCC) $(PARAMETERS_Linker) -o PyMultiApp \ + MultiApp.o\ + PyMultiApp.o \ + approximation_multiplication_function.o \ + error_term.o \ + kernel_approximation_multiplication.o \ + kernel_helper_functions.o + cp PyMultiApp PyMultiApp$(PYPOSTFIX) + $(PYBIN)python3 pybind11_auto_pyi.py + +####################### +clean: + rm -f PyHDynamicCNNManyIP + rm -f PySpikeGeneration2DManyIP + rm -f PyMultiApp + rm -f PyTestKernel + rm -f *.o + rm -f *.so + +####################### + +TestKernel.o: \ + TestKernel.cu \ + TestKernel.h \ + kernel_helper_functions.h \ + kernel_phxy_plus_pxy.h \ + kernel_pxy_plus_v.h \ + kernel_pxy_time_pxy.h \ + kernel_phxy_fill_with_h.h \ + kernel_phxy_times_phxy_equals_phxy.h \ + kernel_pxy_reciprocal.h \ + kernel_pxy_times_v.h \ + kernel_phxy_plus_phxy.h \ + kernel_phxy_times_pxy.h \ + kernel_pxy_set_to_v.h\ + kernel_phxy_one_over_sum_into_pxy.h\ + kernel_pxy_times_spike_selected_sxy.h \ + kernel_phxy_fill_with_spike_selected_w.h + $(NVCC) $(PARAMETERS_O) -c TestKernel.cu -o TestKernel.o + +PyTestKernel.o: PyTestKernel.cpp TestKernel.h + $(NVCC) $(PARAMETERS_O) -c PyTestKernel.cpp -o PyTestKernel.o + +PyTestKernel: \ + TestKernel.o \ + PyTestKernel.o \ + kernel_pxy_plus_v.o \ + kernel_pxy_set_to_v.o \ + kernel_pxy_reciprocal.o \ + kernel_pxy_time_pxy.o \ + kernel_pxy_times_v.o \ + kernel_phxy_times_phxy_equals_phxy.o \ + kernel_phxy_plus_phxy.o \ + kernel_phxy_plus_pxy.o \ + kernel_phxy_times_pxy.o \ + kernel_phxy_fill_with_h.o\ + kernel_helper_functions.o\ + kernel_phxy_one_over_sum_into_pxy.o\ + kernel_pxy_times_spike_selected_sxy.o \ + kernel_phxy_fill_with_spike_selected_w.o + + $(NVCC) $(PARAMETERS_Linker) -o PyTestKernel \ + TestKernel.o \ + PyTestKernel.o \ + kernel_pxy_plus_v.o \ + kernel_pxy_set_to_v.o \ + kernel_pxy_reciprocal.o \ + kernel_pxy_time_pxy.o \ + kernel_pxy_times_v.o \ + kernel_phxy_times_phxy_equals_phxy.o \ + kernel_phxy_plus_phxy.o \ + kernel_phxy_plus_pxy.o \ + kernel_phxy_times_pxy.o \ + kernel_phxy_fill_with_h.o \ + kernel_helper_functions.o \ + kernel_phxy_one_over_sum_into_pxy.o \ + kernel_pxy_times_spike_selected_sxy.o \ + kernel_phxy_fill_with_spike_selected_w.o + cp PyTestKernel PyTestKernel$(PYPOSTFIX) + $(PYBIN)python3 pybind11_auto_pyi.py + +kernel_pxy_plus_v.o: kernel_pxy_plus_v.cu kernel_pxy_plus_v.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_plus_v.cu -o kernel_pxy_plus_v.o + +kernel_pxy_set_to_v.o: kernel_pxy_set_to_v.cu kernel_pxy_set_to_v.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_set_to_v.cu -o kernel_pxy_set_to_v.o + +kernel_pxy_reciprocal.o: kernel_pxy_reciprocal.cu kernel_pxy_reciprocal.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_reciprocal.cu -o kernel_pxy_reciprocal.o + +kernel_pxy_time_pxy.o: kernel_pxy_time_pxy.cu kernel_pxy_time_pxy.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_time_pxy.cu -o kernel_pxy_time_pxy.o + +kernel_pxy_times_v.o: kernel_pxy_times_v.cu kernel_pxy_times_v.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_times_v.cu -o kernel_pxy_times_v.o + +kernel_phxy_times_phxy_equals_phxy.o: kernel_phxy_times_phxy_equals_phxy.cu kernel_phxy_times_phxy_equals_phxy.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_times_phxy_equals_phxy.cu -o kernel_phxy_times_phxy_equals_phxy.o + +kernel_phxy_plus_phxy.o: kernel_phxy_plus_phxy.cu kernel_phxy_plus_phxy.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_plus_phxy.cu -o kernel_phxy_plus_phxy.o + +kernel_phxy_plus_pxy.o: kernel_phxy_plus_pxy.cu kernel_phxy_plus_pxy.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_plus_pxy.cu -o kernel_phxy_plus_pxy.o + +kernel_phxy_times_pxy.o: kernel_phxy_times_pxy.cu kernel_phxy_times_pxy.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_times_pxy.cu -o kernel_phxy_times_pxy.o + +kernel_phxy_fill_with_h.o: kernel_phxy_fill_with_h.cu kernel_phxy_fill_with_h.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_fill_with_h.cu -o kernel_phxy_fill_with_h.o + +kernel_helper_functions.o: kernel_helper_functions.cu kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_helper_functions.cu -o kernel_helper_functions.o + +kernel_phxy_one_over_sum_into_pxy.o: kernel_phxy_one_over_sum_into_pxy.cu kernel_phxy_one_over_sum_into_pxy.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_one_over_sum_into_pxy.cu -o kernel_phxy_one_over_sum_into_pxy.o + +kernel_phxy_fill_with_spike_selected_w.o: kernel_phxy_fill_with_spike_selected_w.cu kernel_phxy_fill_with_spike_selected_w.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_fill_with_spike_selected_w.cu -o kernel_phxy_fill_with_spike_selected_w.o + +kernel_spike_generation.o: kernel_spike_generation.cu kernel_spike_generation.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_spike_generation.cu -o kernel_spike_generation.o + +kernel_approximation_multiplication.o: \ + kernel_approximation_multiplication.cu \ + kernel_approximation_multiplication.h \ + kernel_helper_functions.h \ + kernel_approximation_error_term.cu + $(NVCC) $(PARAMETERS_O) -c kernel_approximation_multiplication.cu -o kernel_approximation_multiplication.o + +approximation_multiplication_function.o: \ + approximation_multiplication_function.cpp \ + approximation_multiplication_function.h \ + error_term.h + $(NVCC) $(PARAMETERS_O) -c approximation_multiplication_function.cpp -o approximation_multiplication_function.o + +error_term.o: error_term.cpp error_term.h + $(NVCC) $(PARAMETERS_O) -c error_term.cpp -o error_term.o + +kernel_pxy_times_spike_selected_sxy.o: kernel_pxy_times_spike_selected_sxy.cu kernel_pxy_times_spike_selected_sxy.h kernel_helper_functions.h + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_times_spike_selected_sxy.cu -o kernel_pxy_times_spike_selected_sxy.o + + +# .o: .cu .h kernel_helper_functions.h +# $(NVCC) $(PARAMETERS_O) -c .cu -o .o + + diff --git a/network/CPP_Cuda_new_preview/MultiApp.cu b/network/CPP_Cuda_new_preview/MultiApp.cu new file mode 100644 index 0000000..e39d36b --- /dev/null +++ b/network/CPP_Cuda_new_preview/MultiApp.cu @@ -0,0 +1,273 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "MultiApp.h" +#include "approximation_multiplication_function.h" +#include "kernel_approximation_multiplication.h" + +MultiApp::MultiApp(){ + +}; + +MultiApp::~MultiApp(){ + +}; + +bool MultiApp::update(float* np_input_pointer, float* np_weight_pointer, + float* np_output_pointer, int64_t pattern_dim, + int64_t feature_dim, int64_t x_dim, int64_t y_dim, + int64_t input_channel_dim, int64_t id_pattern, + bool approximation_enable, int64_t number_of_trunc_bits, + int64_t number_of_frac_bits) { + assert((id_pattern >= 0)); + assert((id_pattern < pattern_dim)); + + float* np_input_pointer_pattern; + float* np_output_pointer_pattern; + + float* input_ptr; + float* output_ptr; + float* w_ptr; + + uint64_t pattern_size = input_channel_dim; + + std::vector ap_h_vector; + ap_h_vector.resize(pattern_size); + float* ap_h_ptr = ap_h_vector.data(); + + std::vector ap_x_vector; + ap_x_vector.resize(pattern_size); + uint32_t* ap_x_ptr = ap_x_vector.data(); + + std::vector ap_y_vector; + ap_y_vector.resize(pattern_size); + uint32_t* ap_y_ptr = ap_y_vector.data(); + + std::vector ap_x_exponent_vector; + ap_x_exponent_vector.resize(pattern_size); + uint32_t* ap_x_exponent_ptr = ap_x_exponent_vector.data(); + + std::vector ap_y_exponent_vector; + ap_y_exponent_vector.resize(pattern_size); + uint32_t* ap_y_exponent_ptr = ap_y_exponent_vector.data(); + + std::vector ap_h_exponent_vector; + ap_h_exponent_vector.resize(pattern_size); + uint32_t* ap_h_exponent_ptr = ap_h_exponent_vector.data(); + + std::vector ap_res_vector; + ap_res_vector.resize(pattern_size); + uint64_t* ap_res_ptr = ap_res_vector.data(); + + uint32_t ap_mask = static_cast(pow(2, number_of_trunc_bits)) - 1; + + std::vector sign_temp_vector; + sign_temp_vector.resize(pattern_size); + uint32_t* sign_temp_ptr = sign_temp_vector.data(); + + uint64_t input_pattern_size = input_channel_dim * x_dim * y_dim; + uint64_t output_pattern_size = feature_dim * x_dim * y_dim; + + np_input_pointer_pattern = np_input_pointer + id_pattern * input_pattern_size; + np_output_pointer_pattern = + np_output_pointer + id_pattern * output_pattern_size; + + uint64_t counter; + + uint64_t counter_x; + uint64_t counter_y; + uint64_t counter_feature; + uint64_t pos_xy; + uint64_t pos_xy_if; + + float temp_sum; + + uint64_t pattern_c_2 = x_dim * y_dim; + + for (counter_x = 0; counter_x < x_dim; counter_x++) { + for (counter_y = 0; counter_y < y_dim; counter_y++) { + pos_xy = counter_y + counter_x * y_dim; + for (counter_feature = 0; counter_feature < feature_dim; + counter_feature++) { + pos_xy_if = counter_feature * pattern_c_2 + pos_xy; + + input_ptr = np_input_pointer_pattern + pos_xy; + output_ptr = np_output_pointer_pattern + pos_xy_if; + w_ptr = np_weight_pointer + counter_feature * input_channel_dim; + +#pragma omp simd + for (counter = 0; counter < pattern_size; counter++) { + ap_h_ptr[counter] = input_ptr[counter * pattern_c_2]; + } + + approximation_multiplication_function( + ap_h_ptr, w_ptr, pattern_size, number_of_trunc_bits, + number_of_frac_bits, ap_x_ptr, ap_y_ptr, ap_x_exponent_ptr, + ap_y_exponent_ptr, ap_h_exponent_ptr, ap_mask, ap_res_ptr, + sign_temp_ptr, approximation_enable); + + temp_sum = 0.0; +#pragma omp simd reduction(+ : temp_sum) + for (counter = 0; counter < pattern_size; counter++) { + temp_sum += ap_h_ptr[counter]; + } + + output_ptr[0] = temp_sum; + } + } + } + + return true; +}; + +bool MultiApp::update_entrypoint( + int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr, + int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim, + int64_t x_dim, int64_t y_dim, int64_t input_channel_dim, + int64_t number_of_processes, bool approximation_enable, + int64_t number_of_trunc_bits, int64_t number_of_frac) { + int64_t number_of_pattern = pattern_dim; + int64_t pattern_id; + + float* np_input_pointer = (float*)np_input_pointer_addr; + float* np_weight_pointer = (float*)np_weight_pointer_addr; + float* np_output_pointer = (float*)np_output_pointer_addr; + + assert((np_input_pointer != nullptr)); + assert((np_output_pointer != nullptr)); + assert((np_weight_pointer != nullptr)); + + assert((pattern_dim > 0)); + assert((feature_dim > 0)); + assert((x_dim > 0)); + assert((y_dim > 0)); + assert((input_channel_dim > 0)); + + if (number_of_processes > 0) { + omp_set_num_threads(number_of_processes); + // For debugging: Only one thread + // omp_set_num_threads(1); + +#pragma omp parallel for + for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++) { + update(np_input_pointer, np_weight_pointer, np_output_pointer, + pattern_dim, feature_dim, x_dim, y_dim, input_channel_dim, + pattern_id, approximation_enable, number_of_trunc_bits, + number_of_frac); + } + } else { + update_gpu(np_input_pointer, np_weight_pointer, np_output_pointer, + pattern_dim, feature_dim, x_dim, y_dim, input_channel_dim, + approximation_enable, number_of_trunc_bits, number_of_frac); + } + return true; +}; + +void MultiApp::gpu_occupancy_measure(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim) { + grid_and_thread_calculated = false; + assert((dim_x < 65535)); + assert((dim_y < 65535)); + + grid_and_thread_settings.resize(1); + + occupancy_kernel_approximation_multiplication( + dim_x, dim_y, number_of_pattern, h_dim, grid_and_thread_settings[0], + display_debug); + + grid_and_thread_calculated = true; + return; +}; + +void MultiApp::gpu_occupancy_export(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1) { + int64_t* setting_memory = (int64_t*)setting_memory_addr; + + assert((setting_memory != nullptr)); + assert((setting_dim_1 == APPROXI_MULTI_NUMBER_OF_KERNELS_PARAMETERS)); + + gpu_occupancy_measure(dim_x, dim_y, number_of_pattern, h_dim); + assert((grid_and_thread_calculated == true)); + + assert((setting_dim_0 == grid_and_thread_settings.size())); + + for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) { + for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) { + setting_memory[counter_0 * setting_dim_1 + counter_1] = + grid_and_thread_settings[counter_0][counter_1]; + } + } +}; + +void MultiApp::gpu_occupancy_import(int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1) { + grid_and_thread_calculated = false; + + int64_t* setting_memory = (int64_t*)setting_memory_addr; + + assert((setting_memory != nullptr)); + assert((setting_dim_1 == APPROXI_MULTI_NUMBER_OF_KERNELS_PARAMETERS)); + assert((setting_dim_0 == APPROXI_MULTI_NUMBER_OF_KERNELS)); + + grid_and_thread_settings.resize(APPROXI_MULTI_NUMBER_OF_KERNELS); + + for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) { + grid_and_thread_settings[counter_0].resize( + APPROXI_MULTI_NUMBER_OF_KERNELS_PARAMETERS); + + for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) { + grid_and_thread_settings[counter_0][counter_1] = + setting_memory[counter_0 * setting_dim_1 + counter_1]; + } + } + + grid_and_thread_calculated = true; +}; + +void MultiApp::update_gpu(float* np_input_pointer, float* np_weight_pointer, + float* np_output_pointer, uint64_t pattern_dim, + uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim, + uint64_t input_channel_dim, bool approximation_enable, + uint64_t number_of_trunc_bits, + uint64_t number_of_frac_bits) { + if (grid_and_thread_calculated == false) { + gpu_occupancy_measure(x_dim, y_dim, pattern_dim, feature_dim); + } + assert((grid_and_thread_calculated == true)); + + uint32_t ap_mask = static_cast(pow(2, number_of_trunc_bits)) - 1; + // std::cout << approximation_enable << std::endl; + // std::cout << number_of_trunc_bits << std::endl; + // std::cout << number_of_frac_bits << std::endl; + + cudaError_t status; + + size_t pfxy_block_dim_c0 = feature_dim * x_dim * y_dim; + size_t pfxy_block_dim_c1 = x_dim * y_dim; + size_t pfxy_block_dim_c2 = y_dim; + + kernel_approximation_multiplication<<< + dim3(grid_and_thread_settings[0][0], grid_and_thread_settings[0][1], + grid_and_thread_settings[0][2]), + dim3(grid_and_thread_settings[0][3], grid_and_thread_settings[0][4], + grid_and_thread_settings[0][5])>>>( + np_input_pointer, np_weight_pointer, np_output_pointer, pattern_dim, + feature_dim, x_dim, y_dim, input_channel_dim, + grid_and_thread_settings[0][6], (x_dim * y_dim), number_of_frac_bits, + approximation_enable, number_of_trunc_bits, ap_mask, pfxy_block_dim_c0, + pfxy_block_dim_c1, pfxy_block_dim_c2); + + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; diff --git a/network/CPP_Cuda_new_preview/MultiApp.h b/network/CPP_Cuda_new_preview/MultiApp.h new file mode 100644 index 0000000..777612b --- /dev/null +++ b/network/CPP_Cuda_new_preview/MultiApp.h @@ -0,0 +1,60 @@ +#ifndef MULTIAPP +#define MULTIAPP + +#include + +#include +#include +#include + +#define APPROXI_MULTI_NUMBER_OF_KERNELS 1 +#define APPROXI_MULTI_NUMBER_OF_KERNELS_PARAMETERS 7 + +class MultiApp +{ + public: + MultiApp(); + ~MultiApp(); + + + bool update_entrypoint( + int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr, + int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim, + int64_t x_dim, int64_t y_dim, int64_t input_channel_dim, + int64_t number_of_processes, bool approximation_enable, + int64_t number_of_trunc_bits, int64_t number_of_frac); + + void gpu_occupancy_export(size_t dim_x, size_t dim_y, size_t number_of_pattern, + size_t h_dim, int64_t setting_memory_addr, size_t setting_dim_0, size_t setting_dim_1); + + void gpu_occupancy_import( + int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1); + + private: + + bool update(float* np_input_pointer, float* np_weight_pointer, + float* np_output_pointer, int64_t pattern_dim, + int64_t feature_dim, int64_t x_dim, int64_t y_dim, + int64_t input_channel_dim, int64_t id_pattern, + bool approximation_enable, int64_t number_of_trunc_bits, + int64_t number_of_frac); + + void update_gpu(float* input_pointer, float* weight_pointer, + float* output_pointer, uint64_t pattern_dim, + uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim, + uint64_t input_channel_dim, + bool approximation_enable, uint64_t number_of_trunc_bits, + uint64_t number_of_frac); + + void gpu_occupancy_measure(size_t dim_x, size_t dim_y, size_t number_of_pattern, + size_t h_dim); + + bool grid_and_thread_calculated = false; + std::vector> grid_and_thread_settings; + bool display_debug = false; + +}; + +#endif /* MULTIAPP */ diff --git a/network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.cpp b/network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.cpp new file mode 100644 index 0000000..469a4a8 --- /dev/null +++ b/network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.cpp @@ -0,0 +1,18 @@ +#include + +#include "HDynamicCNNManyIP.h" + +namespace py = pybind11; + +PYBIND11_MODULE(PyHDynamicCNNManyIP, m) +{ + m.doc() = "HDynamicCNNManyIP Module"; + py::class_(m, "HDynamicCNNManyIP") + .def(py::init<>()) + .def("gpu_occupancy_export", + &HDynamicCNNManyIP::gpu_occupancy_export) + .def("gpu_occupancy_import", + &HDynamicCNNManyIP::gpu_occupancy_import) + .def("update", + &HDynamicCNNManyIP::update_entrypoint); +} \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.pyi b/network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.pyi new file mode 100644 index 0000000..e300ba6 --- /dev/null +++ b/network/CPP_Cuda_new_preview/PyHDynamicCNNManyIP.pyi @@ -0,0 +1,20 @@ +# +# AUTOMATICALLY GENERATED FILE, DO NOT EDIT! +# + +"""HDynamicCNNManyIP Module""" +from __future__ import annotations +import PyHDynamicCNNManyIP +import typing + +__all__ = [ + "HDynamicCNNManyIP" +] + + +class HDynamicCNNManyIP(): + def __init__(self) -> None: ... + def gpu_occupancy_export(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int) -> None: ... + def gpu_occupancy_import(self, arg0: int, arg1: int, arg2: int) -> None: ... + def update(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int, arg16: int, arg17: int, arg18: int, arg19: int, arg20: int, arg21: int, arg22: float, arg23: int) -> bool: ... + pass diff --git a/network/CPP_Cuda_new_preview/PyMultiApp.cpp b/network/CPP_Cuda_new_preview/PyMultiApp.cpp new file mode 100644 index 0000000..52fba1e --- /dev/null +++ b/network/CPP_Cuda_new_preview/PyMultiApp.cpp @@ -0,0 +1,15 @@ + +#include + +#include "MultiApp.h" + +namespace py = pybind11; + +PYBIND11_MODULE(PyMultiApp, m) { + m.doc() = "MultiApp Module"; + py::class_(m, "MultiApp") + .def(py::init<>()) + .def("gpu_occupancy_export", &MultiApp::gpu_occupancy_export) + .def("gpu_occupancy_import", &MultiApp::gpu_occupancy_import) + .def("update_entrypoint", &MultiApp::update_entrypoint); +} \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/PyMultiApp.pyi b/network/CPP_Cuda_new_preview/PyMultiApp.pyi new file mode 100644 index 0000000..2850407 --- /dev/null +++ b/network/CPP_Cuda_new_preview/PyMultiApp.pyi @@ -0,0 +1,20 @@ +# +# AUTOMATICALLY GENERATED FILE, DO NOT EDIT! +# + +"""MultiApp Module""" +from __future__ import annotations +import PyMultiApp +import typing + +__all__ = [ + "MultiApp" +] + + +class MultiApp(): + def __init__(self) -> None: ... + def gpu_occupancy_export(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int) -> None: ... + def gpu_occupancy_import(self, arg0: int, arg1: int, arg2: int) -> None: ... + def update_entrypoint(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: bool, arg10: int, arg11: int) -> bool: ... + pass diff --git a/network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.cpp b/network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.cpp new file mode 100644 index 0000000..b338d23 --- /dev/null +++ b/network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.cpp @@ -0,0 +1,19 @@ + +#include + +#include "SpikeGeneration2DManyIP.h" + +namespace py = pybind11; + +PYBIND11_MODULE(PySpikeGeneration2DManyIP, m) +{ + m.doc() = "SpikeGeneration2DManyIP Module"; + py::class_(m, "SpikeGeneration2DManyIP") + .def(py::init<>()) + .def("gpu_occupancy_export", + &SpikeGeneration2DManyIP::gpu_occupancy_export) + .def("gpu_occupancy_import", + &SpikeGeneration2DManyIP::gpu_occupancy_import) + .def("spike_generation", + &SpikeGeneration2DManyIP::spike_generation_entrypoint); +} diff --git a/network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.pyi b/network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.pyi new file mode 100644 index 0000000..1119cb6 --- /dev/null +++ b/network/CPP_Cuda_new_preview/PySpikeGeneration2DManyIP.pyi @@ -0,0 +1,20 @@ +# +# AUTOMATICALLY GENERATED FILE, DO NOT EDIT! +# + +"""SpikeGeneration2DManyIP Module""" +from __future__ import annotations +import PySpikeGeneration2DManyIP +import typing + +__all__ = [ + "SpikeGeneration2DManyIP" +] + + +class SpikeGeneration2DManyIP(): + def __init__(self) -> None: ... + def gpu_occupancy_export(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int) -> None: ... + def gpu_occupancy_import(self, arg0: int, arg1: int, arg2: int) -> None: ... + def spike_generation(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int) -> bool: ... + pass diff --git a/network/CPP_Cuda_new_preview/PyTestKernel.cpp b/network/CPP_Cuda_new_preview/PyTestKernel.cpp new file mode 100644 index 0000000..148709d --- /dev/null +++ b/network/CPP_Cuda_new_preview/PyTestKernel.cpp @@ -0,0 +1,37 @@ + +#include + +#include "TestKernel.h" + +namespace py = pybind11; + +PYBIND11_MODULE(PyTestKernel, m) { + m.doc() = "TestKernel Module"; + py::class_(m, "TestKernel") + .def(py::init<>()) + + .def("test_kernel_pxy_times_spike_selected_sxy", + &TestKernel::test_kernel_pxy_times_spike_selected_sxy) + + .def("test_kernel_phxy_fill_with_spike_selected_w", + &TestKernel::test_kernel_phxy_fill_with_spike_selected_w) + .def("test_kernel_phxy_plus_pxy", &TestKernel::test_kernel_phxy_plus_pxy) + .def("test_kernel_phxy_fill_with_h", + &TestKernel::test_kernel_phxy_fill_with_h) + .def("test_kernel_phxy_times_pxy", + &TestKernel::test_kernel_phxy_times_pxy) + .def("test_kernel_phxy_one_over_sum_into_pxy", + &TestKernel::test_kernel_phxy_one_over_sum_into_pxy) + + .def("test_kernel_phxy_plus_phxy", + &TestKernel::test_kernel_phxy_plus_phxy) + .def("test_kernel_phxy_times_phxy_equals_phxy", + &TestKernel::test_kernel_phxy_times_phxy_equals_phxy) + + .def("test_kernel_pxy_time_pxy", &TestKernel::test_kernel_pxy_time_pxy) + .def("test_kernel_pxy_reciprocal", + &TestKernel::test_kernel_pxy_reciprocal) + .def("test_kernel_pxy_plus_v", &TestKernel::test_kernel_pxy_plus_v) + .def("test_kernel_pxy_times_v", &TestKernel::test_kernel_pxy_times_v) + .def("test_kernel_pxy_set_to_v", &TestKernel::test_kernel_pxy_set_to_v); +} \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/PyTestKernel.pyi b/network/CPP_Cuda_new_preview/PyTestKernel.pyi new file mode 100644 index 0000000..bc4a378 --- /dev/null +++ b/network/CPP_Cuda_new_preview/PyTestKernel.pyi @@ -0,0 +1,30 @@ +# +# AUTOMATICALLY GENERATED FILE, DO NOT EDIT! +# + +"""TestKernel Module""" +from __future__ import annotations +import PyTestKernel +import typing + +__all__ = [ + "TestKernel" +] + + +class TestKernel(): + def __init__(self) -> None: ... + def test_kernel_phxy_fill_with_h(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int) -> None: ... + def test_kernel_phxy_fill_with_spike_selected_w(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int) -> None: ... + def test_kernel_phxy_one_over_sum_into_pxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int) -> None: ... + def test_kernel_phxy_plus_phxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int) -> None: ... + def test_kernel_phxy_plus_pxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int) -> None: ... + def test_kernel_phxy_times_phxy_equals_phxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int) -> None: ... + def test_kernel_phxy_times_pxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int) -> None: ... + def test_kernel_pxy_plus_v(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: float, arg6: int) -> None: ... + def test_kernel_pxy_reciprocal(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int) -> None: ... + def test_kernel_pxy_set_to_v(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: float, arg6: int) -> None: ... + def test_kernel_pxy_time_pxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int) -> None: ... + def test_kernel_pxy_times_spike_selected_sxy(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int) -> None: ... + def test_kernel_pxy_times_v(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: bool, arg5: float, arg6: int) -> None: ... + pass diff --git a/network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.cu b/network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.cu new file mode 100644 index 0000000..ab8e0a0 --- /dev/null +++ b/network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.cu @@ -0,0 +1,261 @@ +#include +#include +#include + +#include +#include +#include + +#include "SpikeGeneration2DManyIP.h" +#include "kernel_spike_generation.h" + +SpikeGeneration2DManyIP::SpikeGeneration2DManyIP(){ + +}; + +SpikeGeneration2DManyIP::~SpikeGeneration2DManyIP(){ + +}; + +bool SpikeGeneration2DManyIP::spike_generation_entrypoint( + int64_t input_pointer_addr, int64_t input_dim_0, int64_t input_dim_1, + int64_t input_dim_2, int64_t input_dim_3, + int64_t random_values_pointer_addr, int64_t random_values_dim_0, + int64_t random_values_dim_1, int64_t random_values_dim_2, + int64_t random_values_dim_3, int64_t output_pointer_addr, + int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2, + int64_t output_dim_3, int64_t number_of_cpu_processes) { + float* input_pointer = (float*)input_pointer_addr; + float* random_values_pointer = (float*)random_values_pointer_addr; + int64_t* output_pointer = (int64_t*)output_pointer_addr; + + // Input + assert((input_pointer != nullptr)); + assert((input_dim_0 > 0)); + assert((input_dim_1 > 0)); + assert((input_dim_2 > 0)); + assert((input_dim_3 > 0)); + + // Random + assert((random_values_pointer != nullptr)); + assert((random_values_dim_0 > 0)); + assert((random_values_dim_1 > 0)); + assert((random_values_dim_2 > 0)); + assert((random_values_dim_3 > 0)); + + // Output + assert((output_pointer != nullptr)); + assert((output_dim_0 > 0)); + assert((output_dim_1 > 0)); + assert((output_dim_2 > 0)); + assert((output_dim_3 > 0)); + + // Input + size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3; + size_t input_dim_c1 = input_dim_2 * input_dim_3; + size_t input_dim_c2 = input_dim_3; + + // Random + size_t random_values_dim_c0 = + random_values_dim_1 * random_values_dim_2 * random_values_dim_3; + size_t random_values_dim_c1 = random_values_dim_2 * random_values_dim_3; + size_t random_values_dim_c2 = random_values_dim_3; + + // Output + size_t output_dim_c0 = output_dim_1 * output_dim_2 * output_dim_3; + size_t output_dim_c1 = output_dim_2 * output_dim_3; + size_t output_dim_c2 = output_dim_3; + + size_t number_of_pattern = input_dim_0; + size_t h_dim = input_dim_1; + size_t spike_dim = output_dim_1; + size_t x_dim = output_dim_2; + size_t y_dim = output_dim_2; + + if (number_of_cpu_processes > 0) { + omp_set_num_threads(number_of_cpu_processes); + // DEBUG: + // omp_set_num_threads(1); + + size_t pattern_id; + +#pragma omp parallel for + for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++) { + spike_generation( + input_pointer, input_dim_c0, input_dim_c1, input_dim_c2, + random_values_pointer, random_values_dim_c0, random_values_dim_c1, + random_values_dim_c2, output_pointer, output_dim_c0, output_dim_c1, + output_dim_c2, x_dim, y_dim, spike_dim, h_dim, pattern_id); + } + } else { + gpu_spike_generation( + input_pointer, input_dim_c0, input_dim_c1, input_dim_c2, + random_values_pointer, random_values_dim_c0, random_values_dim_c1, + random_values_dim_c2, output_pointer, output_dim_c0, output_dim_c1, + output_dim_c2, x_dim, y_dim, spike_dim, h_dim, number_of_pattern); + } + + return true; +}; + +bool SpikeGeneration2DManyIP::spike_generation( + float* input_pointer, size_t input_dim_c0, size_t input_dim_c1, + size_t input_dim_c2, float* random_values_pointer, + size_t random_values_dim_c0, size_t random_values_dim_c1, + size_t random_values_dim_c2, int64_t* output_pointer, size_t output_dim_c0, + size_t output_dim_c1, size_t output_dim_c2, size_t x_dim, size_t y_dim, + size_t spike_dim, size_t h_dim, size_t pattern_id) { + size_t counter; + size_t counter_x = 0; + size_t counter_y = 0; + + float* p_ptr = nullptr; + int64_t* out_ptr = nullptr; + float* rand_ptr = nullptr; + + for (counter_x = 0; counter_x < x_dim; counter_x++) { + for (counter_y = 0; counter_y < y_dim; counter_y++) { + p_ptr = input_pointer + pattern_id * input_dim_c0 + + counter_x * input_dim_c2 + counter_y; + // + counter * input_dim_c1 + + out_ptr = output_pointer + pattern_id * output_dim_c0 + + counter_x * output_dim_c2 + counter_y; + // + counter * output_dim_c1 + + rand_ptr = random_values_pointer + pattern_id * random_values_dim_c0 + + counter_x * random_values_dim_c2 + counter_y; + // + counter * random_values_dim_c1 + + for (counter = 0; counter < spike_dim; counter++) { + out_ptr[counter * output_dim_c1] = + lower_bound(p_ptr, h_dim, input_dim_c1, + rand_ptr[counter * random_values_dim_c1]); + } + } + } + + return true; +}; + +// algorithmic idea stolen from libc++ +size_t SpikeGeneration2DManyIP::lower_bound(float* data_ptr, size_t data_length, + size_t data_ptr_stride, + float compare_to_value) { + size_t start_of_range = 0; + size_t length_of_range = data_length; + + while (length_of_range != 0) { + size_t half_length = length_of_range >> 1; + size_t actual_position = start_of_range + half_length; + + if (data_ptr[actual_position * data_ptr_stride] < compare_to_value) { + start_of_range = ++actual_position; + length_of_range -= half_length + 1; + } else + length_of_range = half_length; + } + return start_of_range; +}; + +void SpikeGeneration2DManyIP::gpu_occupancy_measure(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t spike_dim) { + grid_and_thread_calculated = false; + assert((dim_x < 65535)); + assert((dim_y < 65535)); + + grid_and_thread_settings.resize(1); + + occupancy_kernel_spike_generation(dim_x, dim_y, number_of_pattern, spike_dim, + grid_and_thread_settings[0], display_debug); + + grid_and_thread_calculated = true; + return; +}; + +void SpikeGeneration2DManyIP::gpu_occupancy_export( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t spike_dim, + int64_t setting_memory_addr, size_t setting_dim_0, size_t setting_dim_1) { + int64_t* setting_memory = (int64_t*)setting_memory_addr; + + assert((setting_memory != nullptr)); + assert((setting_dim_1 == SPIKE_GENERATION_NUMBER_OF_KERNELS_PARAMETERS)); + + gpu_occupancy_measure(dim_x, dim_y, number_of_pattern, spike_dim); + assert((grid_and_thread_calculated == true)); + assert( + (grid_and_thread_settings.size() == SPIKE_GENERATION_NUMBER_OF_KERNELS)); + + assert((setting_dim_0 == grid_and_thread_settings.size())); + + for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) { + for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) { + setting_memory[counter_0 * setting_dim_1 + counter_1] = + grid_and_thread_settings[counter_0][counter_1]; + } + } +}; + +void SpikeGeneration2DManyIP::gpu_occupancy_import(int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1) { + grid_and_thread_calculated = false; + + int64_t* setting_memory = (int64_t*)setting_memory_addr; + + assert((setting_memory != nullptr)); + assert((setting_dim_1 == SPIKE_GENERATION_NUMBER_OF_KERNELS_PARAMETERS)); + assert((setting_dim_0 == SPIKE_GENERATION_NUMBER_OF_KERNELS)); + + grid_and_thread_settings.resize(SPIKE_GENERATION_NUMBER_OF_KERNELS); + + for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) { + grid_and_thread_settings[counter_0].resize( + SPIKE_GENERATION_NUMBER_OF_KERNELS_PARAMETERS); + + for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) { + grid_and_thread_settings[counter_0][counter_1] = + setting_memory[counter_0 * setting_dim_1 + counter_1]; + } + } + + grid_and_thread_calculated = true; +}; + +bool SpikeGeneration2DManyIP::gpu_spike_generation( + float* input_pointer, size_t input_dim_c0, size_t input_dim_c1, + size_t input_dim_c2, float* random_values_pointer, + size_t random_values_dim_c0, size_t random_values_dim_c1, + size_t random_values_dim_c2, int64_t* output_pointer, size_t output_dim_c0, + size_t output_dim_c1, size_t output_dim_c2, size_t x_dim, size_t y_dim, + size_t spike_dim, size_t h_dim, size_t number_of_pattern) { + if (grid_and_thread_calculated == false) { + gpu_occupancy_measure(x_dim, y_dim, number_of_pattern, spike_dim); + } + assert((grid_and_thread_calculated == true)); + + cudaError_t status; + assert((x_dim < 65535)); + assert((y_dim < 65535)); + + size_t psxy_block_dim_c0 = spike_dim * x_dim * y_dim; + size_t psxy_block_dim_c1 = x_dim * y_dim; + size_t psxy_block_dim_c2 = y_dim; + + kernel_spike_generation<<< + dim3(grid_and_thread_settings[0][0], grid_and_thread_settings[0][1], + grid_and_thread_settings[0][2]), + dim3(grid_and_thread_settings[0][3], grid_and_thread_settings[0][4], + grid_and_thread_settings[0][5])>>>( + input_pointer, input_dim_c0, input_dim_c1, input_dim_c2, + random_values_pointer, random_values_dim_c0, random_values_dim_c1, + random_values_dim_c2, output_pointer, output_dim_c0, output_dim_c1, + output_dim_c2, x_dim, y_dim, spike_dim, h_dim, psxy_block_dim_c0, + psxy_block_dim_c1, psxy_block_dim_c2, grid_and_thread_settings[0][6]); + + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + return true; +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.h b/network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.h new file mode 100644 index 0000000..b3a8650 --- /dev/null +++ b/network/CPP_Cuda_new_preview/SpikeGeneration2DManyIP.h @@ -0,0 +1,87 @@ +#ifndef SPIKEGENERATION2DMANYIP +#define SPIKEGENERATION2DMANYIP + +#include + +#include +#include +#include + +#define SPIKE_GENERATION_NUMBER_OF_KERNELS 1 +#define SPIKE_GENERATION_NUMBER_OF_KERNELS_PARAMETERS 7 + +class SpikeGeneration2DManyIP +{ + public: + SpikeGeneration2DManyIP(); + ~SpikeGeneration2DManyIP(); + + bool spike_generation_entrypoint( + int64_t input_pointer_addr, int64_t input_dim_0, + int64_t input_dim_1, int64_t input_dim_2, int64_t input_dim_3, + int64_t random_values_pointer_addr, int64_t random_values_dim_0, + int64_t random_values_dim_1, int64_t random_values_dim_2, + int64_t random_values_dim_3, int64_t output_pointer_addr, + int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2, + int64_t output_dim_3, int64_t number_of_cpu_processes); + + bool spike_generation( + float* input_pointer, + size_t input_dim_c0, + size_t input_dim_c1, + size_t input_dim_c2, + float* random_values_pointer, + size_t random_values_dim_c0, + size_t random_values_dim_c1, + size_t random_values_dim_c2, + int64_t* output_pointer, + size_t output_dim_c0, + size_t output_dim_c1, + size_t output_dim_c2, + size_t x_dim, + size_t y_dim, + size_t spike_dim, + size_t h_dim, + size_t pattern_id); + + bool gpu_spike_generation( + float* input_pointer, + size_t input_dim_c0, + size_t input_dim_c1, + size_t input_dim_c2, + float* random_values_pointer, + size_t random_values_dim_c0, + size_t random_values_dim_c1, + size_t random_values_dim_c2, + int64_t* output_pointer, + size_t output_dim_c0, + size_t output_dim_c1, + size_t output_dim_c2, + size_t x_dim, + size_t y_dim, + size_t spike_dim, + size_t h_dim, + size_t number_of_pattern); + + void gpu_occupancy_export(size_t dim_x, size_t dim_y, size_t number_of_pattern, + size_t spike_dim, int64_t setting_memory_addr, size_t setting_dim_0, size_t setting_dim_1); + + void gpu_occupancy_import( + int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1); + + private: + size_t lower_bound(float* data_ptr, size_t data_length, + size_t data_ptr_stride, + float compare_to_value); + + void gpu_occupancy_measure(size_t dim_x, size_t dim_y, size_t number_of_pattern, + size_t spike_dim); + + bool grid_and_thread_calculated = false; + std::vector> grid_and_thread_settings; + bool display_debug = false; +}; + +#endif /* SPIKEGENERATION2DMANYIP */ diff --git a/network/CPP_Cuda_new_preview/TestKernel.cu b/network/CPP_Cuda_new_preview/TestKernel.cu new file mode 100644 index 0000000..0ba16bb --- /dev/null +++ b/network/CPP_Cuda_new_preview/TestKernel.cu @@ -0,0 +1,329 @@ +#include +#include +#include + +#include "TestKernel.h" +#include "kernel_phxy_fill_with_h.h" +#include "kernel_phxy_fill_with_spike_selected_w.h" +#include "kernel_phxy_one_over_sum_into_pxy.h" +#include "kernel_phxy_plus_phxy.h" +#include "kernel_phxy_plus_pxy.h" +#include "kernel_phxy_times_phxy_equals_phxy.h" +#include "kernel_phxy_times_pxy.h" +#include "kernel_pxy_plus_v.h" +#include "kernel_pxy_reciprocal.h" +#include "kernel_pxy_set_to_v.h" +#include "kernel_pxy_time_pxy.h" +#include "kernel_pxy_times_spike_selected_sxy.h" +#include "kernel_pxy_times_v.h" + +TestKernel::TestKernel(){ + +}; + +TestKernel::~TestKernel(){ + +}; + +void TestKernel::test_kernel_pxy_times_spike_selected_sxy( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + bool display_debug, int64_t pxy_memory_addr, int64_t sxy_memory_addr, + int64_t spike_memory_addr, size_t spike_time, size_t spike_dim_c0, + size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0, + size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1) { + float* pxy_memory = (float*)pxy_memory_addr; + float* sxy_memory = (float*)sxy_memory_addr; + int64_t* spike_memory = (int64_t*)spike_memory_addr; + + std::vector setting; + occupancy_kernel_pxy_times_spike_selected_sxy(dim_x, dim_y, number_of_pattern, + h_dim, setting, display_debug); + + size_t pxy_block_dim_c0 = dim_x * dim_y; + size_t pxy_block_dim_c1 = dim_y; + + kernel_pxy_times_spike_selected_sxy<<< + dim3(setting[0], setting[1], setting[2]), + dim3(setting[3], setting[4], setting[5])>>>( + pxy_memory, sxy_memory, spike_memory, spike_time, spike_dim_c0, + spike_dim_c1, spike_dim_c2, pxy_dim_c0, pxy_dim_c1, sxy_dim_c0, + sxy_dim_c1, pxy_block_dim_c0, pxy_block_dim_c1, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, bool display_debug, + int64_t phxy_memory_a_addr, + int64_t phxy_memory_b_addr) { + float* phxy_memory_a = (float*)phxy_memory_a_addr; + float* phxy_memory_b = (float*)phxy_memory_b_addr; + + std::vector setting; + occupancy_kernel_phxy_plus_phxy(dim_x, dim_y, number_of_pattern, h_dim, + setting, display_debug); + + kernel_phxy_plus_phxy<<>>( + phxy_memory_a, phxy_memory_b, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_pxy_times_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + bool display_debug, float value, + int64_t pxy_memory_addr) { + float* pxy_memory = (float*)pxy_memory_addr; + + std::vector setting; + occupancy_kernel_pxy_times_v(dim_x, dim_y, number_of_pattern, h_dim, setting, + display_debug); + + kernel_pxy_times_v<<>>( + pxy_memory, value, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_phxy_fill_with_spike_selected_w( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + bool display_debug, size_t spike_time, size_t weights_dim_c0, + size_t spike_dim_c0, size_t spike_dim_c1, size_t spike_dim_c2, + size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2, + int64_t phxy_memory_addr, int64_t weight_memory_addr, + int64_t spike_memory_addr) { + float* phxy_memory = (float*)phxy_memory_addr; + float* weight_memory = (float*)weight_memory_addr; + int64_t* spike_memory = (int64_t*)spike_memory_addr; + + std::vector setting; + occupancy_kernel_phxy_fill_with_spike_selected_w( + dim_x, dim_y, number_of_pattern, h_dim, setting, display_debug); + + size_t phxy_block_dim_c0 = h_dim * dim_x * dim_y; + size_t phxy_block_dim_c1 = dim_x * dim_y; + size_t phxy_block_dim_c2 = dim_y; + + kernel_phxy_fill_with_spike_selected_w<<< + dim3(setting[0], setting[1], setting[2]), + dim3(setting[3], setting[4], setting[5])>>>( + phxy_memory, weight_memory, spike_memory, spike_time, weights_dim_c0, + spike_dim_c0, spike_dim_c1, spike_dim_c2, phxy_dim_c0, phxy_dim_c1, + phxy_dim_c2, h_dim, phxy_block_dim_c0, phxy_block_dim_c1, + phxy_block_dim_c2, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_phxy_times_phxy_equals_phxy( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + bool display_debug, int64_t phxy_memory_a_addr, int64_t phxy_memory_b_addr, + int64_t phxy_memory_out_addr) { + float* phxy_memory_a = (float*)phxy_memory_a_addr; + float* phxy_memory_b = (float*)phxy_memory_b_addr; + float* phxy_memory_out = (float*)phxy_memory_out_addr; + + std::vector setting; + occupancy_kernel_phxy_times_phxy_equals_phxy(dim_x, dim_y, number_of_pattern, + h_dim, setting, display_debug); + + kernel_phxy_times_phxy_equals_phxy<<>>( + phxy_memory_a, phxy_memory_b, phxy_memory_out, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_pxy_plus_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + bool display_debug, float value, + int64_t pxy_memory_addr) { + float* pxy_memory = (float*)pxy_memory_addr; + + std::vector setting; + occupancy_kernel_pxy_plus_v(dim_x, dim_y, number_of_pattern, h_dim, setting, + display_debug); + + kernel_pxy_plus_v<<>>( + pxy_memory, value, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, bool display_debug, + int64_t pxy_memory_a_addr, + int64_t pxy_memory_b_addr) { + float* pxy_memory_a = (float*)pxy_memory_a_addr; + float* pxy_memory_b = (float*)pxy_memory_b_addr; + + std::vector setting; + occupancy_kernel_pxy_time_pxy(dim_x, dim_y, number_of_pattern, h_dim, setting, + display_debug); + + kernel_pxy_time_pxy<<>>( + pxy_memory_a, pxy_memory_b, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_phxy_plus_pxy( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t pxy_dim_c0, size_t pxy_dim_c1, + int64_t phxy_memory_addr, int64_t pxy_memory_addr) { + float* phxy_memory = (float*)phxy_memory_addr; + float* pxy_memory = (float*)pxy_memory_addr; + std::vector setting; + occupancy_kernel_phxy_plus_pxy(dim_x, dim_y, number_of_pattern, h_dim, + setting, display_debug); + + size_t phxy_block_dim_c0 = h_dim * dim_x * dim_y; + size_t phxy_block_dim_c1 = dim_x * dim_y; + size_t phxy_block_dim_c2 = dim_y; + + kernel_phxy_plus_pxy<<>>( + phxy_memory, pxy_memory, phxy_dim_c0, phxy_dim_c1, phxy_dim_c2, h_dim, + pxy_dim_c0, pxy_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1, + phxy_block_dim_c2, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_phxy_one_over_sum_into_pxy( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t pxy_dim_c0, size_t pxy_dim_c1, + int64_t phxy_memory_addr, int64_t pxy_memory_addr) { + float* phxy_memory = (float*)phxy_memory_addr; + float* pxy_memory = (float*)pxy_memory_addr; + + std::vector setting; + occupancy_kernel_phxy_one_over_sum_into_pxy(dim_x, dim_y, number_of_pattern, + h_dim, setting, display_debug); + size_t pxy_block_dim_c0 = dim_x * dim_y; + size_t pxy_block_dim_c1 = dim_y; + + kernel_phxy_one_over_sum_into_pxy<<>>( + phxy_memory, pxy_memory, phxy_dim_c0, phxy_dim_c1, phxy_dim_c2, h_dim, + pxy_dim_c0, pxy_dim_c1, pxy_block_dim_c0, pxy_block_dim_c1, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, bool display_debug, + int64_t pxy_memory_addr) { + float* pxy_memory = (float*)pxy_memory_addr; + + std::vector setting; + occupancy_kernel_pxy_reciprocal(dim_x, dim_y, number_of_pattern, h_dim, + setting, display_debug); + kernel_pxy_reciprocal<<>>( + pxy_memory, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_phxy_fill_with_h( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, int64_t h_memory_addr, int64_t phxy_memory_addr) { + float* h_memory = (float*)h_memory_addr; + float* phxy_memory = (float*)phxy_memory_addr; + + std::vector setting; + occupancy_kernel_phxy_fill_with_h(dim_x, dim_y, number_of_pattern, h_dim, + setting, display_debug); + + size_t phxy_block_dim_c0 = h_dim * dim_x * dim_y; + size_t phxy_block_dim_c1 = dim_x * dim_y; + size_t phxy_block_dim_c2 = dim_y; + + kernel_phxy_fill_with_h<<>>( + h_memory, phxy_memory, phxy_dim_c0, phxy_dim_c1, phxy_dim_c2, h_dim, + phxy_block_dim_c0, phxy_block_dim_c1, phxy_block_dim_c2, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_phxy_times_pxy( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t pxy_dim_c0, size_t pxy_dim_c1, + int64_t phxy_memory_addr, int64_t pxy_memory_addr) { + float* phxy_memory = (float*)phxy_memory_addr; + float* pxy_memory = (float*)pxy_memory_addr; + + std::vector setting; + occupancy_kernel_phxy_times_pxy(dim_x, dim_y, number_of_pattern, h_dim, + setting, display_debug); + + size_t phxy_block_dim_c0 = h_dim * dim_x * dim_y; + size_t phxy_block_dim_c1 = dim_x * dim_y; + size_t phxy_block_dim_c2 = dim_y; + + kernel_phxy_times_pxy<<>>( + phxy_memory, pxy_memory, phxy_dim_c0, phxy_dim_c1, phxy_dim_c2, h_dim, + pxy_dim_c0, pxy_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1, + phxy_block_dim_c2, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; + +void TestKernel::test_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, bool display_debug, + float set_value, + int64_t pxy_memory_addr) { + float* pxy_memory = (float*)pxy_memory_addr; + + std::vector setting; + occupancy_kernel_pxy_set_to_v(dim_x, dim_y, number_of_pattern, h_dim, setting, + display_debug); + kernel_pxy_set_to_v<<>>( + pxy_memory, set_value, setting[6]); + + cudaError_t status; + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); +}; diff --git a/network/CPP_Cuda_new_preview/TestKernel.h b/network/CPP_Cuda_new_preview/TestKernel.h new file mode 100644 index 0000000..d9ca667 --- /dev/null +++ b/network/CPP_Cuda_new_preview/TestKernel.h @@ -0,0 +1,89 @@ +class TestKernel { + public: + TestKernel(); + ~TestKernel(); + + void test_kernel_pxy_times_spike_selected_sxy( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + bool display_debug, int64_t pxy_memory_addr, int64_t sxy_memory_addr, + int64_t spike_memory_addr, size_t spike_time, size_t spike_dim_c0, + size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0, + size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1); + + // --- phxy + + void test_kernel_phxy_fill_with_spike_selected_w( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + bool display_debug, size_t spike_time, size_t weights_dim_c0, + size_t spike_dim_c0, size_t spike_dim_c1, size_t spike_dim_c2, + size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2, + int64_t phxy_memory_addr, int64_t weight_memory_addr, + int64_t spike_memory_addr); + + void test_kernel_phxy_one_over_sum_into_pxy( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t pxy_dim_c0, size_t pxy_dim_c1, + int64_t phxy_memory_addr, int64_t pxy_memory_addr); + + void test_kernel_phxy_fill_with_h(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + bool display_debug, size_t phxy_dim_c0, + size_t phxy_dim_c1, size_t phxy_dim_c2, + int64_t h_memory_addr, + int64_t phxy_memory_addr); + + void test_kernel_phxy_plus_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + bool display_debug, size_t phxy_dim_c0, + size_t phxy_dim_c1, size_t phxy_dim_c2, + size_t pxy_dim_c0, size_t pxy_dim_c1, + int64_t phxy_memory_addr, + int64_t pxy_memory_addr); + + void test_kernel_phxy_times_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + bool display_debug, size_t phxy_dim_c0, + size_t phxy_dim_c1, size_t phxy_dim_c2, + size_t pxy_dim_c0, size_t pxy_dim_c1, + int64_t phxy_memory_addr, + int64_t pxy_memory_addr); + + void test_kernel_phxy_times_phxy_equals_phxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, bool display_debug, + int64_t phxy_memory_a_addr, + int64_t phxy_memory_b_addr, + int64_t phxy_memory_out_addr); + + void test_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + bool display_debug, + int64_t phxy_memory_a_addr, + int64_t phxy_memory_b_addr); + + // --- pxy + void test_kernel_pxy_plus_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + bool display_debug, float value, + int64_t pxy_memory_addr); + + void test_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + bool display_debug, int64_t pxy_memory_a_addr, + int64_t pxy_memory_b_addr); + + void test_kernel_pxy_times_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + bool display_debug, float value, + int64_t pxy_memory_addr); + + void test_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + bool display_debug, int64_t pxy_memory_addr); + + void test_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + bool display_debug, float value, + int64_t pxy_memory_addr); +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/approximation_multiplication_function.cpp b/network/CPP_Cuda_new_preview/approximation_multiplication_function.cpp new file mode 100644 index 0000000..05ac742 --- /dev/null +++ b/network/CPP_Cuda_new_preview/approximation_multiplication_function.cpp @@ -0,0 +1,165 @@ +#include + +#include +#include +#include + +#include "approximation_multiplication_function.h" +#include "error_term.h" + +// Best way to plot the bits +// std::cout << std::bitset<32>(ap_y_ptr[1]) << "\n"; + +// The result needs to be written back into h_pointer (which contains h) +// Don't write to w_pointer. +void approximation_multiplication_function( + float* h_pointer, float* w_pointer, int64_t pattern_length, + uint64_t number_of_trunc_bits, uint64_t number_of_frac_bits, + uint32_t* ap_x_ptr, uint32_t* ap_y_ptr, uint32_t* ap_x_exponent_ptr, + uint32_t* ap_y_exponent_ptr, uint32_t* ap_h_exponent_ptr, uint32_t ap_mask, + uint64_t* ap_res_ptr, uint32_t* sign_temp_ptr, bool approximation_enable) +{ + uint64_t counter; + + uint32_t* w_pointer_mod = (uint32_t*)w_pointer; + uint32_t* h_pointer_mod = (uint32_t*)h_pointer; + + // Calculate the new sign +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + sign_temp_ptr[counter] = (w_pointer_mod[counter] & 0x80000000) ^ + (h_pointer_mod[counter] & 0x80000000); + } + + // Extract the exponent +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + ap_x_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24; + } +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + ap_y_exponent_ptr[counter] = (w_pointer_mod[counter] << 1) >> 24; + } + + // Cast and "normalize" + uint64_t shift_value = 32 - number_of_frac_bits; +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + ap_x_ptr[counter] = + ((h_pointer_mod[counter] << 8) | 0x80000000) >> shift_value; + } + +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + ap_y_ptr[counter] = + ((w_pointer_mod[counter] << 8) | 0x80000000) >> shift_value; + } + + // Make the zero -g-r-e-a-t- correct again +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + if (h_pointer[counter] == 0) + { + ap_x_ptr[counter] = 0; + } + } + +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + if (w_pointer[counter] == 0) + { + ap_y_ptr[counter] = 0; + } + } + + // res = x*y +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + ap_res_ptr[counter] = static_cast(ap_x_ptr[counter]) * static_cast(ap_y_ptr[counter]); + } + + uint32_t temp; + if (approximation_enable == true) + { + // Go through the vector values + for (counter = 0; counter < pattern_length; counter++) + { + temp = error_term(ap_y_ptr[counter], ap_x_ptr[counter], ap_mask, + number_of_trunc_bits); + if (temp > ap_res_ptr[counter]) + { + ap_res_ptr[counter] = 0; + } + else + { + ap_res_ptr[counter] -= temp; + } + } + } + // Cast from int to float +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + h_pointer[counter] = static_cast(ap_res_ptr[counter]); + } + +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + ap_h_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24; + } + + // devide by the 2^number_of_frac_bits +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + ap_h_exponent_ptr[counter] -= 2 * number_of_frac_bits; + } + +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + temp = ap_x_exponent_ptr[counter] + ap_y_exponent_ptr[counter] + + ap_h_exponent_ptr[counter]; + if (temp > 252) + { + ap_h_exponent_ptr[counter] = temp - 252; + } + else + { + // Here I try to catch the case that the new exponent is too small + ap_h_exponent_ptr[counter] = 0; + } + } + + // Remove the old exponent +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + h_pointer_mod[counter] = (h_pointer_mod[counter] << 9) >> 9; + } + + // Install the new exponent +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + h_pointer_mod[counter] += ap_h_exponent_ptr[counter] << 23; + } + + // Add the sign back +#pragma omp simd + for (counter = 0; counter < pattern_length; counter++) + { + h_pointer_mod[counter] += sign_temp_ptr[counter]; + } + + return; +} diff --git a/network/CPP_Cuda_new_preview/approximation_multiplication_function.h b/network/CPP_Cuda_new_preview/approximation_multiplication_function.h new file mode 100644 index 0000000..0c44d4f --- /dev/null +++ b/network/CPP_Cuda_new_preview/approximation_multiplication_function.h @@ -0,0 +1,11 @@ +#ifndef APPROXIMATION_MULTIPLICATION_FUNCTION +#define APPROXIMATION_MULTIPLICATION_FUNCTION + +void approximation_multiplication_function( + float* h_pointer, float* w_pointer, int64_t pattern_length, + uint64_t number_of_trunc_bits, uint64_t number_of_frac_bits, + uint32_t* ap_x_ptr, uint32_t* ap_y_ptr, uint32_t* ap_x_exponent_ptr, + uint32_t* ap_y_exponent_ptr, uint32_t* ap_h_exponent_ptr, uint32_t ap_mask, + uint64_t* ap_res_ptr, uint32_t* sign_temp_ptr, bool approximation_enable); + +#endif /* APPROXIMATION_MULTIPLICATION_FUNCTION */ diff --git a/network/CPP_Cuda_new_preview/error_term.cpp b/network/CPP_Cuda_new_preview/error_term.cpp new file mode 100644 index 0000000..f7ffb58 --- /dev/null +++ b/network/CPP_Cuda_new_preview/error_term.cpp @@ -0,0 +1,34 @@ +#include + +#include +#include +#include + +#include "error_term.h" + +uint32_t error_term(uint32_t a, uint32_t b, uint32_t ap_mask, + uint32_t number_of_trunc_bits) +{ + uint32_t error_value = 0; + + uint32_t temp_shift_a = a; + uint32_t temp_shift_b = b & ap_mask; + + uint32_t counter_trunc; + uint32_t temp; + + // Go through the bits + for (counter_trunc = 0; counter_trunc < number_of_trunc_bits; + counter_trunc++) + { + temp = temp_shift_a & 1; + if (temp == 1) + { + error_value += temp_shift_b & ap_mask; + } + temp_shift_a >>= 1; + temp_shift_b <<= 1; + } + + return error_value; +}; diff --git a/network/CPP_Cuda_new_preview/error_term.h b/network/CPP_Cuda_new_preview/error_term.h new file mode 100644 index 0000000..ada4413 --- /dev/null +++ b/network/CPP_Cuda_new_preview/error_term.h @@ -0,0 +1,7 @@ +#ifndef ERROR_TERM +#define ERROR_TERM + +uint32_t error_term(uint32_t a, uint32_t b, uint32_t ap_mask, + uint32_t number_of_trunc_bits); + +#endif /* ERROR_TERM */ diff --git a/network/CPP_Cuda_new_preview/kernel_approximation_error_term.cu b/network/CPP_Cuda_new_preview/kernel_approximation_error_term.cu new file mode 100644 index 0000000..1c0fe7d --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_approximation_error_term.cu @@ -0,0 +1,31 @@ +#include +#include + +__device__ uint32_t gpu_error_term(uint32_t ap_weight_mantissa, + uint32_t ap_input_mantissa, + uint32_t ap_mask, + uint32_t number_of_trunc_bits) +{ + uint32_t error_value = 0; + + uint32_t temp_shift_a = ap_weight_mantissa; + uint32_t temp_shift_b = ap_input_mantissa & ap_mask; + + uint32_t counter_trunc; + uint32_t temp; + + // Go through the bits + for (counter_trunc = 0; counter_trunc < number_of_trunc_bits; + counter_trunc++) + { + temp = temp_shift_a & 1; + if (temp == 1) + { + error_value += temp_shift_b & ap_mask; + } + temp_shift_a >>= 1; + temp_shift_b <<= 1; + } + + return error_value; +} \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_approximation_multiplication.cu b/network/CPP_Cuda_new_preview/kernel_approximation_multiplication.cu new file mode 100644 index 0000000..efea85b --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_approximation_multiplication.cu @@ -0,0 +1,219 @@ +#include +#include + +#include "kernel_approximation_error_term.cu" +#include "kernel_approximation_multiplication.h" +#include "kernel_helper_functions.h" + +// Includes accumulation too... +__global__ void kernel_approximation_multiplication( + float* __restrict__ input_pointer, float* __restrict__ weight_pointer, + float* __restrict__ output_pointer, uint64_t pattern_dim, + uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim, + uint64_t input_channel_dim, size_t max_threadable_tasks, + uint64_t input_index_scale, uint64_t number_of_frac_bits, + bool approximation_enable, uint64_t number_of_trunc_bits, uint32_t ap_mask, + size_t block_dim_c0, size_t block_dim_c1, size_t block_dim_c2) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_threadable_tasks) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t feature_id = idx / block_dim_c1; + idx -= feature_id * block_dim_c1; + size_t position_x = idx / block_dim_c2; + idx -= position_x * block_dim_c2; + size_t position_y = idx; + + float* weight_pointer_sub = weight_pointer + feature_id * input_channel_dim; + float* input_pointer_sub = input_pointer + + pattern_id * input_channel_dim * x_dim * y_dim + + position_x * y_dim + position_y; + float* output_pointer_sub = + output_pointer + pattern_id * feature_dim * x_dim * y_dim + + feature_id * x_dim * y_dim + position_x * y_dim + position_y; + *output_pointer_sub = 0.0; + + size_t counter; + for (counter = 0; counter < input_channel_dim; counter++) { + *output_pointer_sub += gpu_approximation_multiplication_function( + weight_pointer_sub[counter], + input_pointer_sub[counter * input_index_scale], number_of_frac_bits, + approximation_enable, number_of_trunc_bits, ap_mask); + } + } +}; + +// Only x = a*b +__global__ void kernel_approximation_pure_multiplication( + float* __restrict__ phxy_memory_a, float* __restrict__ phxy_memory_b, + float* __restrict__ phxy_memory_out, uint64_t number_of_frac_bits, + bool approximation_enable, uint64_t number_of_trunc_bits, uint32_t ap_mask, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + phxy_memory_out[idx] = gpu_approximation_multiplication_function( + phxy_memory_a[idx], phxy_memory_b[idx], number_of_frac_bits, + approximation_enable, number_of_trunc_bits, ap_mask); + } +}; + +__device__ float gpu_approximation_multiplication_function( + float weight, float input, uint64_t number_of_frac_bits, + bool approximation_enable, uint64_t number_of_trunc_bits, + uint32_t ap_mask) { + float weight_copy = weight; + float input_copy = input; + + uint32_t* weight_pointer_mod = (uint32_t*)&weight_copy; + uint32_t* input_pointer_mod = (uint32_t*)&input_copy; + + // Calculate the new sign + uint32_t sign_temp = + (*weight_pointer_mod & 0x80000000) ^ (*input_pointer_mod & 0x80000000); + + // Extract the exponent + uint32_t ap_input_exponent = (*input_pointer_mod << 1) >> 24; + uint32_t ap_weight_exponent = (*weight_pointer_mod << 1) >> 24; + + // Cast and "normalize" + uint64_t shift_value = 32 - number_of_frac_bits; + + uint32_t ap_input_mantissa = + ((*input_pointer_mod << 8) | 0x80000000) >> shift_value; + + uint32_t ap_weight_mantissa = + ((*weight_pointer_mod << 8) | 0x80000000) >> shift_value; + + // Make the zero -g-r-e-a-t- correct again + if (input == 0) { + ap_input_mantissa = 0; + } + + if (weight == 0) { + ap_weight_mantissa = 0; + } + + // res = x*y + uint64_t ap_result = static_cast(ap_input_mantissa) * + static_cast(ap_weight_mantissa); + + uint32_t temp; + // -------------------------------------------- + // Approx + // -------------------------------------------- + + if (approximation_enable == true) { + // Go through the vector values + temp = gpu_error_term(ap_weight_mantissa, ap_input_mantissa, ap_mask, + number_of_trunc_bits); + if (temp > ap_result) { + ap_result = 0; + } else { + ap_result -= temp; + } + } + + // Cast from int to float + float output = static_cast(ap_result); + if (ap_result == 0) { + output = 0.0; + } else { + uint32_t* output_pointer_mod = (uint32_t*)&output; + + uint32_t ap_output_exponent = (*output_pointer_mod << 1) >> 24; + ap_output_exponent -= 2 * number_of_frac_bits; + temp = ap_input_exponent + ap_weight_exponent + ap_output_exponent; + if (temp > 252) { + ap_output_exponent = temp - 252; + } else { + // Here I try to catch the case that the new exponent is too small + ap_output_exponent = 0; + } + + // Remove the old exponent + *output_pointer_mod = (*output_pointer_mod << 9) >> 9; + + // Install the new exponent + *output_pointer_mod += ap_output_exponent << 23; + + // Add the sign back + *output_pointer_mod += sign_temp; + } + return output; +}; + +void occupancy_kernel_approximation_multiplication(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, + (void*)kernel_approximation_multiplication, 0, max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_approximation_multiplication:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; + +// ---------------------------------------------------------------- + +void occupancy_kernel_approximation_pure_multiplication( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + std::vector& output, bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, + (void*)kernel_approximation_pure_multiplication, 0, max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_approximation_multiplication:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; diff --git a/network/CPP_Cuda_new_preview/kernel_approximation_multiplication.h b/network/CPP_Cuda_new_preview/kernel_approximation_multiplication.h new file mode 100644 index 0000000..ac71410 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_approximation_multiplication.h @@ -0,0 +1,35 @@ +#ifndef KERNEL_APPROXIMATION_MULTIPLICATION +#define KERNEL_APPROXIMATION_MULTIPLICATION + +#include + +__global__ void kernel_approximation_multiplication( + float* __restrict__ input_pointer, float* __restrict__ weight_pointer, + float* __restrict__ output_pointer, uint64_t pattern_dim, + uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim, + uint64_t input_channel_dim, size_t max_threadable_tasks, + uint64_t input_index_scale, uint64_t number_of_frac_bits, + bool approximation_enable, uint64_t number_of_trunc_bits, uint32_t ap_mask, + size_t block_dim_c0, size_t block_dim_c1, size_t block_dim_c2); + +__global__ void kernel_approximation_pure_multiplication( + float* __restrict__ phxy_memory_a, float* __restrict__ phxy_memory_b, + float* __restrict__ phxy_memory_out, uint64_t number_of_frac_bits, + bool approximation_enable, uint64_t number_of_trunc_bits, uint32_t ap_mask, + size_t max_idx); + +__device__ float gpu_approximation_multiplication_function( + float weight, float input, uint64_t number_of_frac_bits, + bool approximation_enable, uint64_t number_of_trunc_bits, uint32_t ap_mask); + +void occupancy_kernel_approximation_multiplication(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug); + +void occupancy_kernel_approximation_pure_multiplication( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + std::vector& output, bool display_debug); + +#endif /* KERNEL_APPROXIMATION_MULTIPLICATION */ diff --git a/network/CPP_Cuda_new_preview/kernel_helper_functions.cu b/network/CPP_Cuda_new_preview/kernel_helper_functions.cu new file mode 100644 index 0000000..bd023e9 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_helper_functions.cu @@ -0,0 +1,17 @@ +#include + +#include "kernel_helper_functions.h" + +void kernel_debug_plot(std::vector output, bool display_debug) { + if (display_debug == true) { + std::cout << "grid x: " << output[0] << std::endl; + std::cout << "grid y: " << output[1] << std::endl; + std::cout << "grid z: " << output[2] << std::endl; + std::cout << "thread block x: " << output[3] << std::endl; + std::cout << "thread block y: " << output[4] << std::endl; + std::cout << "thread block z: " << output[5] << std::endl; + std::cout << "max_idx: " << output[6] << std::endl << std::endl; + } + + return; +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_helper_functions.h b/network/CPP_Cuda_new_preview/kernel_helper_functions.h new file mode 100644 index 0000000..819910e --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_helper_functions.h @@ -0,0 +1,7 @@ +#ifndef KERNEL_HELPER_FUNCTIONS +#define KERNEL_HELPER_FUNCTIONS +#include + +void kernel_debug_plot(std::vector output, bool display_debug); + +#endif /* KERNEL_HELPER_FUNCTIONS */ diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_h.cu b/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_h.cu new file mode 100644 index 0000000..d2adf37 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_h.cu @@ -0,0 +1,64 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_fill_with_h.h" + +__global__ void kernel_phxy_fill_with_h(float* __restrict__ h_memory, + float* __restrict__ phxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t block_dim_c0, + size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t idx_h = idx / block_dim_c1; + idx -= idx_h * block_dim_c1; + size_t position_x = idx / block_dim_c2; + idx -= position_x * block_dim_c2; + size_t position_y = idx; + + phxy_memory[pattern_id * phxy_dim_c0 + idx_h * phxy_dim_c1 + + position_x * phxy_dim_c2 + position_y] = h_memory[idx_h]; + } +}; + +void occupancy_kernel_phxy_fill_with_h(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_phxy_fill_with_h, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_fill_with_h:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_h.h b/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_h.h new file mode 100644 index 0000000..cbe8f5f --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_h.h @@ -0,0 +1,18 @@ +#ifndef KERNEL_PHXY_FILL_WITH_H +#define KERNEL_PHXY_FILL_WITH_H +#include + +__global__ void kernel_phxy_fill_with_h(float* __restrict__ h_memory, + float* __restrict__ phxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t block_dim_c0, + size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx); + +void occupancy_kernel_phxy_fill_with_h(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PHXY_FILL_WITH_H */ diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_spike_selected_w.cu b/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_spike_selected_w.cu new file mode 100644 index 0000000..0f721e1 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_spike_selected_w.cu @@ -0,0 +1,73 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_fill_with_spike_selected_w.h" + +__global__ void kernel_phxy_fill_with_spike_selected_w( + float* __restrict__ phxy_memory, float* __restrict__ weights_memory, + int64_t* __restrict__ spike_memory, size_t spike_time, + size_t weights_dim_c0, size_t spike_dim_c0, size_t spike_dim_c1, + size_t spike_dim_c2, size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t idx_h = idx / block_dim_c1; + idx -= idx_h * block_dim_c1; + size_t position_x = idx / block_dim_c2; + idx -= position_x * block_dim_c2; + size_t position_y = idx; + + int64_t* spike = spike_memory + pattern_id * spike_dim_c0 + + spike_time * spike_dim_c1 + position_x * spike_dim_c2 + + position_y; + + if (*spike >= 0) { + phxy_memory[pattern_id * phxy_dim_c0 + idx_h * phxy_dim_c1 + + position_x * phxy_dim_c2 + position_y] = + weights_memory[*spike * weights_dim_c0 + idx_h]; + } else { + phxy_memory[pattern_id * phxy_dim_c0 + idx_h * phxy_dim_c1 + + position_x * phxy_dim_c2 + position_y] = 0.0; + } + } +}; + +void occupancy_kernel_phxy_fill_with_spike_selected_w( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + std::vector& output, bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, + (void*)kernel_phxy_fill_with_spike_selected_w, 0, max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_fill_with_spike_selected_w:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_spike_selected_w.h b/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_spike_selected_w.h new file mode 100644 index 0000000..3055238 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_fill_with_spike_selected_w.h @@ -0,0 +1,17 @@ +#ifndef KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W +#define KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W +#include + +__global__ void kernel_phxy_fill_with_spike_selected_w( + float* __restrict__ phxy_memory, float* __restrict__ weights_memory, + int64_t* __restrict__ spike_memory, size_t spike_time, + size_t weights_dim_c0, size_t spike_dim_c0, size_t spike_dim_c1, + size_t spike_dim_c2, size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx); + +void occupancy_kernel_phxy_fill_with_spike_selected_w( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + std::vector& output, bool display_debug); + +#endif /* KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W */ diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_one_over_sum_into_pxy.cu b/network/CPP_Cuda_new_preview/kernel_phxy_one_over_sum_into_pxy.cu new file mode 100644 index 0000000..475793f --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_one_over_sum_into_pxy.cu @@ -0,0 +1,74 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_one_over_sum_into_pxy.h" + +__global__ void kernel_phxy_one_over_sum_into_pxy( + float* __restrict__ phxy_memory, float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, size_t block_dim_c0, + size_t block_dim_c1, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t position_x = idx / block_dim_c1; + idx -= position_x * block_dim_c1; + size_t position_y = idx; + + size_t offset_phxy_temp = + pattern_id * phxy_dim_c0 + position_x * phxy_dim_c2 + position_y; + + size_t offset_pxy_sum = + pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + position_y; + + float temp = 0.0; + for (size_t idx_h = 0; idx_h < h_dim; idx_h++) { + temp += phxy_memory[offset_phxy_temp + idx_h * phxy_dim_c1]; + } + if (temp > 1E-10) { + pxy_memory[offset_pxy_sum] = 1.0 / temp; + } else { + pxy_memory[offset_pxy_sum] = 0.0; + } + } +}; + +void occupancy_kernel_phxy_one_over_sum_into_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, + (void*)kernel_phxy_one_over_sum_into_pxy, 0, max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_one_over_sum_into_pxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_one_over_sum_into_pxy.h b/network/CPP_Cuda_new_preview/kernel_phxy_one_over_sum_into_pxy.h new file mode 100644 index 0000000..6f91445 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_one_over_sum_into_pxy.h @@ -0,0 +1,17 @@ +#ifndef KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY +#define KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY +#include + +__global__ void kernel_phxy_one_over_sum_into_pxy( + float* __restrict__ phxy_memory, float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, size_t block_dim_c0, + size_t block_dim_c1, size_t max_idx); + +void occupancy_kernel_phxy_one_over_sum_into_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY */ diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_plus_phxy.cu b/network/CPP_Cuda_new_preview/kernel_phxy_plus_phxy.cu new file mode 100644 index 0000000..f9ee9c2 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_plus_phxy.cu @@ -0,0 +1,51 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_plus_phxy.h" + +__global__ void kernel_phxy_plus_phxy(float* __restrict__ phxy_memory_a, + float* __restrict__ phxy_memory_b, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + phxy_memory_a[idx] += phxy_memory_b[idx]; + } +}; + +void occupancy_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_phxy_plus_phxy, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_plus_phxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_plus_phxy.h b/network/CPP_Cuda_new_preview/kernel_phxy_plus_phxy.h new file mode 100644 index 0000000..a247c90 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_plus_phxy.h @@ -0,0 +1,12 @@ +#ifndef KERNEL_PHXY_PLUS_PHXY +#define KERNEL_PHXY_PLUS_PHXY +#include +__global__ void kernel_phxy_plus_phxy(float* __restrict__ phxy_memory_a, + float* __restrict__ phxy_memory_b, + size_t max_idx); + +void occupancy_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); +#endif /* KERNEL_PHXY_PLUS_PHXY */ diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_plus_pxy.cu b/network/CPP_Cuda_new_preview/kernel_phxy_plus_pxy.cu new file mode 100644 index 0000000..56bdac7 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_plus_pxy.cu @@ -0,0 +1,70 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_plus_pxy.h" + +__global__ void kernel_phxy_plus_pxy(float* __restrict__ phxy_memory, + float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t idx_h = idx / block_dim_c1; + idx -= idx_h * block_dim_c1; + size_t position_x = idx / block_dim_c2; + idx -= position_x * block_dim_c2; + size_t position_y = idx; + + size_t offset_h_temp = + pattern_id * phxy_dim_c0 + position_x * phxy_dim_c2 + position_y; + + size_t offset_h_sum = + pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + position_y; + + phxy_memory[offset_h_temp + idx_h * phxy_dim_c1] += + pxy_memory[offset_h_sum]; + } +}; + +void occupancy_kernel_phxy_plus_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_phxy_plus_pxy, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_plus_pxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_plus_pxy.h b/network/CPP_Cuda_new_preview/kernel_phxy_plus_pxy.h new file mode 100644 index 0000000..3bacc94 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_plus_pxy.h @@ -0,0 +1,16 @@ +#ifndef KERNEL_PHXY_PLUS_PXY +#define KERNEL_PHXY_PLUS_PXY +#include +__global__ void kernel_phxy_plus_pxy(float* __restrict__ phxy_memory, + float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx); + +void occupancy_kernel_phxy_plus_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); +#endif /* KERNEL_PHXY_PLUS_PXY */ diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_times_phxy_equals_phxy.cu b/network/CPP_Cuda_new_preview/kernel_phxy_times_phxy_equals_phxy.cu new file mode 100644 index 0000000..4a91337 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_times_phxy_equals_phxy.cu @@ -0,0 +1,53 @@ +#include +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_times_phxy_equals_phxy.h" + +__global__ void kernel_phxy_times_phxy_equals_phxy( + float* __restrict__ phxy_memory_a, float* __restrict__ phxy_memory_b, + float* __restrict__ phxy_memory_out, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + phxy_memory_out[idx] = phxy_memory_a[idx] * phxy_memory_b[idx]; + } +}; + +void occupancy_kernel_phxy_times_phxy_equals_phxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, + (void*)kernel_phxy_times_phxy_equals_phxy, 0, max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_times_phxy_equals_phxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_times_phxy_equals_phxy.h b/network/CPP_Cuda_new_preview/kernel_phxy_times_phxy_equals_phxy.h new file mode 100644 index 0000000..b603c2c --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_times_phxy_equals_phxy.h @@ -0,0 +1,15 @@ +#ifndef KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY +#define KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY +#include + +__global__ void kernel_phxy_times_phxy_equals_phxy( + float* __restrict__ phxy_memory_a, float* __restrict__ phxy_memory_b, + float* __restrict__ phxy_memory_out, size_t max_idx); + +void occupancy_kernel_phxy_times_phxy_equals_phxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY */ diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_times_pxy.cu b/network/CPP_Cuda_new_preview/kernel_phxy_times_pxy.cu new file mode 100644 index 0000000..ed7c64e --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_times_pxy.cu @@ -0,0 +1,70 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_times_pxy.h" + +__global__ void kernel_phxy_times_pxy(float* __restrict__ phxy_memory, + float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t idx_h = idx / block_dim_c1; + idx -= idx_h * block_dim_c1; + size_t position_x = idx / block_dim_c2; + idx -= position_x * block_dim_c2; + size_t position_y = idx; + + size_t offset_h_temp = + pattern_id * phxy_dim_c0 + position_x * phxy_dim_c2 + position_y; + + size_t offset_h_sum = + pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + position_y; + + phxy_memory[offset_h_temp + idx_h * phxy_dim_c1] *= + pxy_memory[offset_h_sum]; + } +}; + +void occupancy_kernel_phxy_times_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_phxy_times_pxy, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_times_pxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_phxy_times_pxy.h b/network/CPP_Cuda_new_preview/kernel_phxy_times_pxy.h new file mode 100644 index 0000000..bad68a9 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_phxy_times_pxy.h @@ -0,0 +1,17 @@ +#ifndef KERNEL_PHXY_TIMES_PXY +#define KERNEL_PHXY_TIMES_PXY +#include + +__global__ void kernel_phxy_times_pxy(float* __restrict__ phxy_memory, + float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx); + +void occupancy_kernel_phxy_times_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); +#endif /* KERNEL_PHXY_TIMES_PXY */ diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_plus_v.cu b/network/CPP_Cuda_new_preview/kernel_pxy_plus_v.cu new file mode 100644 index 0000000..92d5678 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_plus_v.cu @@ -0,0 +1,50 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_plus_v.h" + +__global__ void kernel_pxy_plus_v(float* __restrict__ pxy_memory, float value, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + pxy_memory[idx] += value; + } +}; + +void occupancy_kernel_pxy_plus_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_pxy_plus_v, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_plus_v:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_plus_v.h b/network/CPP_Cuda_new_preview/kernel_pxy_plus_v.h new file mode 100644 index 0000000..853a268 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_plus_v.h @@ -0,0 +1,12 @@ +#ifndef KERNEL_PXY_PLUS_V +#define KERNEL_PXY_PLUS_V +#include + +__global__ void kernel_pxy_plus_v(float* __restrict__ pxy_memory, float value, + size_t max_idx); + +void occupancy_kernel_pxy_plus_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); +#endif /* KERNEL_PXY_PLUS_V */ diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_reciprocal.cu b/network/CPP_Cuda_new_preview/kernel_pxy_reciprocal.cu new file mode 100644 index 0000000..beac382 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_reciprocal.cu @@ -0,0 +1,50 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_reciprocal.h" + +__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + pxy_memory[idx] = 1.0 / pxy_memory[idx]; + } +}; + +void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_pxy_reciprocal, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_reciprocal:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_reciprocal.h b/network/CPP_Cuda_new_preview/kernel_pxy_reciprocal.h new file mode 100644 index 0000000..f943914 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_reciprocal.h @@ -0,0 +1,12 @@ +#ifndef KERNEL_PXY_RECIPROCAL +#define KERNEL_PXY_RECIPROCAL +#include +__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory, + size_t max_idx); + +void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PXY_RECIPROCAL */ diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_set_to_v.cu b/network/CPP_Cuda_new_preview/kernel_pxy_set_to_v.cu new file mode 100644 index 0000000..4f7b640 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_set_to_v.cu @@ -0,0 +1,50 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_set_to_v.h" + +__global__ void kernel_pxy_set_to_v(float* __restrict__ pxy_memory, float value, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + pxy_memory[idx] = value; + } +}; + +void occupancy_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_pxy_set_to_v, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_set_to_v:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_set_to_v.h b/network/CPP_Cuda_new_preview/kernel_pxy_set_to_v.h new file mode 100644 index 0000000..b2c31d2 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_set_to_v.h @@ -0,0 +1,13 @@ +#ifndef KERNEL_PXY_SET_TO_V +#define KERNEL_PXY_SET_TO_V +#include + +__global__ void kernel_pxy_set_to_v(float* __restrict__ pxy_memory, float value, + size_t max_idx); + +void occupancy_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PXY_SET_TO_V */ diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_time_pxy.cu b/network/CPP_Cuda_new_preview/kernel_pxy_time_pxy.cu new file mode 100644 index 0000000..1349a17 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_time_pxy.cu @@ -0,0 +1,58 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_time_pxy.h" + +// a *= b +__global__ void kernel_pxy_time_pxy(float* __restrict__ pxy_memory_a, + float* __restrict__ pxy_memory_b, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + pxy_memory_a[idx] *= pxy_memory_b[idx]; + } +}; + +void occupancy_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_pxy_time_pxy, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + size_t gpu_tuning_factor = 5; + + if ((gpu_tuning_factor > 0) && (gpu_tuning_factor < thread_block_size)) { + thread_block_size = int(gpu_tuning_factor); + } + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_time_pxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_time_pxy.h b/network/CPP_Cuda_new_preview/kernel_pxy_time_pxy.h new file mode 100644 index 0000000..f2c67cb --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_time_pxy.h @@ -0,0 +1,15 @@ +#ifndef KERNEL_PXY_TIME_PXY +#define KERNEL_PXY_TIME_PXY + +#include + +__global__ void kernel_pxy_time_pxy(float* __restrict__ pxy_memory_a, + float* __restrict__ pxy_memory_b, + size_t max_idx); + +void occupancy_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PXY_TIME_PXY */ diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_times_spike_selected_sxy.cu b/network/CPP_Cuda_new_preview/kernel_pxy_times_spike_selected_sxy.cu new file mode 100644 index 0000000..c5bae95 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_times_spike_selected_sxy.cu @@ -0,0 +1,73 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_times_spike_selected_sxy.h" + +__global__ void kernel_pxy_times_spike_selected_sxy( + float* __restrict__ pxy_memory, float* __restrict__ sxy_memory, + int64_t* __restrict__ spike_memory, size_t spike_time, size_t spike_dim_c0, + size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0, + size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t position_x = idx / block_dim_c1; + idx -= position_x * block_dim_c1; + size_t position_y = idx; + + int64_t* spike = spike_memory + pattern_id * spike_dim_c0 + + spike_time * spike_dim_c1 + position_x * spike_dim_c2 + + position_y; + + if (*spike >= 0) { + pxy_memory[pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + + position_y] *= + sxy_memory[*spike * sxy_dim_c0 + position_x * sxy_dim_c1 + + position_y]; + } else { + pxy_memory[pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + + position_y] = 0; + } + } +}; + +void occupancy_kernel_pxy_times_spike_selected_sxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, + (void*)kernel_pxy_times_spike_selected_sxy, 0, max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_times_spike_selected_sxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_times_spike_selected_sxy.h b/network/CPP_Cuda_new_preview/kernel_pxy_times_spike_selected_sxy.h new file mode 100644 index 0000000..4bc60d5 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_times_spike_selected_sxy.h @@ -0,0 +1,18 @@ +#ifndef KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY +#define KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY +#include + +__global__ void kernel_pxy_times_spike_selected_sxy( + float* __restrict__ pxy_memory, float* __restrict__ sxy_memory, + int64_t* __restrict__ spike_memory, size_t spike_time, size_t spike_dim_c0, + size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0, + size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, size_t max_idx); + +void occupancy_kernel_pxy_times_spike_selected_sxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY */ diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_times_v.cu b/network/CPP_Cuda_new_preview/kernel_pxy_times_v.cu new file mode 100644 index 0000000..1823f6f --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_times_v.cu @@ -0,0 +1,50 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_times_v.h" + +__global__ void kernel_pxy_times_v(float* __restrict__ pxy_memory, float value, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + pxy_memory[idx] *= value; + } +}; + +void occupancy_kernel_pxy_times_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_pxy_times_v, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_times_v:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; diff --git a/network/CPP_Cuda_new_preview/kernel_pxy_times_v.h b/network/CPP_Cuda_new_preview/kernel_pxy_times_v.h new file mode 100644 index 0000000..611e0ec --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_pxy_times_v.h @@ -0,0 +1,13 @@ +#ifndef KERNEL_PXY_TIMES_V +#define KERNEL_PXY_TIMES_V +#include + +__global__ void kernel_pxy_times_v(float* __restrict__ pxy_memory, float value, + size_t max_idx); + +void occupancy_kernel_pxy_times_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PXY_TIMES_V */ diff --git a/network/CPP_Cuda_new_preview/kernel_spike_generation.cu b/network/CPP_Cuda_new_preview/kernel_spike_generation.cu new file mode 100644 index 0000000..11068a8 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_spike_generation.cu @@ -0,0 +1,97 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_spike_generation.h" + +__device__ size_t gpu_lower_bound(float* __restrict__ data_ptr, + size_t data_length, size_t data_ptr_stride, + float compare_to_value) { + size_t start_of_range = 0; + size_t length_of_range = data_length; + + while (length_of_range != 0) { + size_t half_length = length_of_range >> 1; + size_t actual_position = start_of_range + half_length; + + if (data_ptr[actual_position * data_ptr_stride] < compare_to_value) { + start_of_range = ++actual_position; + length_of_range -= half_length + 1; + } else + length_of_range = half_length; + } + return start_of_range; +}; + +__global__ void kernel_spike_generation( + float* __restrict__ input_pointer, size_t input_dim_c0, size_t input_dim_c1, + size_t input_dim_c2, float* __restrict__ random_values_pointer, + size_t random_values_dim_c0, size_t random_values_dim_c1, + size_t random_values_dim_c2, int64_t* __restrict__ output_pointer, + size_t output_dim_c0, size_t output_dim_c1, size_t output_dim_c2, + size_t x_dim, size_t y_dim, size_t spike_dim, size_t h_dim, + size_t block_dim_c0, size_t block_dim_c1, size_t block_dim_c2, + size_t max_threadable_tasks) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_threadable_tasks) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t position_spike = idx / block_dim_c1; + idx -= position_spike * block_dim_c1; + size_t position_x = idx / block_dim_c2; + idx -= position_x * block_dim_c2; + size_t position_y = idx; + + float* p_ptr = input_pointer + pattern_id * input_dim_c0 + + position_x * input_dim_c2 + position_y; + + int64_t* out_ptr = output_pointer + pattern_id * output_dim_c0 + + position_x * output_dim_c2 + position_y + + position_spike * output_dim_c1; + + float* rand_ptr = random_values_pointer + + pattern_id * random_values_dim_c0 + + position_x * random_values_dim_c2 + position_y + + position_spike * random_values_dim_c1; + + *out_ptr = gpu_lower_bound(p_ptr, h_dim, input_dim_c1, *rand_ptr); + } +}; + +void occupancy_kernel_spike_generation(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t spike_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * spike_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_spike_generation, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_spike_generation:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/CPP_Cuda_new_preview/kernel_spike_generation.h b/network/CPP_Cuda_new_preview/kernel_spike_generation.h new file mode 100644 index 0000000..e25e599 --- /dev/null +++ b/network/CPP_Cuda_new_preview/kernel_spike_generation.h @@ -0,0 +1,21 @@ +#ifndef KERNEL_SPIKE_GENERATION +#define KERNEL_SPIKE_GENERATION +#include + +__global__ void kernel_spike_generation( + float* __restrict__ input_pointer, size_t input_dim_c0, size_t input_dim_c1, + size_t input_dim_c2, float* __restrict__ random_values_pointer, + size_t random_values_dim_c0, size_t random_values_dim_c1, + size_t random_values_dim_c2, int64_t* __restrict__ output_pointer, + size_t output_dim_c0, size_t output_dim_c1, size_t output_dim_c2, + size_t x_dim, size_t y_dim, size_t spike_dim, size_t h_dim, + size_t block_dim_c0, size_t block_dim_c1, size_t block_dim_c2, + size_t max_threadable_tasks); + +void occupancy_kernel_spike_generation(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t spike_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_SPIKE_GENERATION */ diff --git a/network/CPP_Cuda_new_preview/pybind11_auto_pyi.py b/network/CPP_Cuda_new_preview/pybind11_auto_pyi.py new file mode 100644 index 0000000..73c8bd2 --- /dev/null +++ b/network/CPP_Cuda_new_preview/pybind11_auto_pyi.py @@ -0,0 +1,23 @@ +# %% +# pip install pybind11-stubgen +from pybind11_stubgen import ModuleStubsGenerator # type: ignore +import glob + + +def process(module_name: str) -> None: + module = ModuleStubsGenerator(module_name) + module.parse() + module.write_setup_py = False + + with open(module_name + ".pyi", "w") as fp: + fp.write("#\n# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!\n#\n\n") + fp.write("\n".join(module.to_lines())) + + +Files = glob.glob("*.so") + +for fid in Files: + Idx: int = fid.find(".") + module_name: str = fid[:Idx] + print("Processing: " + module_name) + process(module_name) diff --git a/network/CPP_Cuda_new_preview/test_kernel.py b/network/CPP_Cuda_new_preview/test_kernel.py new file mode 100644 index 0000000..f6295c9 --- /dev/null +++ b/network/CPP_Cuda_new_preview/test_kernel.py @@ -0,0 +1,767 @@ +import torch +import math +import random + +from PyTestKernel import TestKernel + +# TODO: kernel_phxy_plus_pxy, kernel_phxy_times_pxy, +# kernel_phxy_fill_h, kernel_phxy_one_over_sum_into_pxy, +# test_kernel_phxy_fill_with_spike_selected_w => 4D index + +# pxy = number_of_pattern * dim_x * dim_y +# phxy = number_of_pattern * h_dim * dim_x * dim_y +# sxy = s_dim * dim_x * dim_y + + +def test_kernel_pxy_times_spike_selected_sxy( + h_dim, + s_dim, + number_of_pattern, + dim_x, + dim_y, + display_debug, + spike_time, + number_of_spikes, +): + print("test_kernel_pxy_times_spike_selected_sxy") + # void test_kernel_pxy_times_spike_selected_sxy( + # size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + # bool display_debug, int64_t pxy_memory_addr, int64_t sxy_memory_addr, + # int64_t spike_memory_addr, size_t spike_time, size_t spike_dim_c0, + # size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0, + # size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1); + + memory_pxy = torch.rand( + (number_of_pattern, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_sxy = torch.rand( + (s_dim, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_spikes = ( + torch.rand( + (number_of_pattern, number_of_spikes, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + * float(s_dim) + ).type(dtype=torch.int64) + + pxy_dim_c0 = int(dim_x * dim_y) + pxy_dim_c1 = int(dim_y) + + sxy_dim_c0 = int(dim_x * dim_y) + sxy_dim_c1 = int(dim_y) + + spike_dim_c0 = int(number_of_spikes * dim_x * dim_y) + spike_dim_c1 = int(dim_x * dim_y) + spike_dim_c2 = int(dim_y) + + memory_pxy_copy = memory_pxy.clone() + memory_sxy_copy = memory_sxy.clone() + memory_spikes_copy = memory_spikes.clone() + + my_kernels = TestKernel() + my_kernels.test_kernel_pxy_times_spike_selected_sxy( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + memory_pxy.data_ptr(), + memory_sxy.data_ptr(), + memory_spikes.data_ptr(), + spike_time, + spike_dim_c0, + spike_dim_c1, + spike_dim_c2, + pxy_dim_c0, + pxy_dim_c1, + sxy_dim_c0, + sxy_dim_c1, + ) + + for p in range(0, memory_spikes_copy.shape[0]): + for x in range(0, memory_spikes_copy.shape[2]): + for y in range(0, memory_spikes_copy.shape[3]): + spike = memory_spikes_copy[p, spike_time, x, y] + + if spike >= 0: + memory_pxy_copy[p, x, y] *= memory_sxy_copy[spike, x, y] + else: + memory_pxy_copy[p, x, y] = 0.0 + print(f"difference: {torch.abs(memory_pxy - memory_pxy_copy).max():.4e}") + print() + + +def test_kernel_phxy_fill_with_spike_selected_w( + h_dim, + s_dim, + number_of_pattern, + dim_x, + dim_y, + display_debug, + spike_time, + number_of_spikes, +): + print("test_kernel_phxy_fill_with_spike_selected_w") + # void test_kernel_phxy_fill_with_spike_selected_w( + # size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + # bool display_debug, size_t spike_time, size_t weights_dim_c0, + # size_t spike_dim_c0, size_t spike_dim_c1, size_t spike_dim_c2, + # size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2, + # int64_t phxy_memory_addr, int64_t weight_memory_addr, + # int64_t spike_memory_addr); + + memory_phxy = torch.rand( + (number_of_pattern, h_dim, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_w = torch.rand( + (s_dim, h_dim), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_spikes = ( + torch.rand( + (number_of_pattern, number_of_spikes, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + * float(s_dim) + ).type(dtype=torch.int64) + + phxy_dim_c0 = int(h_dim * dim_x * dim_y) + phxy_dim_c1 = int(dim_x * dim_y) + phxy_dim_c2 = int(dim_y) + + spike_dim_c0 = int(number_of_spikes * dim_x * dim_y) + spike_dim_c1 = int(dim_x * dim_y) + spike_dim_c2 = int(dim_y) + + weights_dim_c0 = int(h_dim) + + memory_phxy_copy = memory_phxy.clone() + memory_w_copy = memory_w.clone() + memory_spikes_copy = memory_spikes.clone() + + my_kernels = TestKernel() + my_kernels.test_kernel_phxy_fill_with_spike_selected_w( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + spike_time, + weights_dim_c0, + spike_dim_c0, + spike_dim_c1, + spike_dim_c2, + phxy_dim_c0, + phxy_dim_c1, + phxy_dim_c2, + memory_phxy.data_ptr(), + memory_w.data_ptr(), + memory_spikes.data_ptr(), + ) + + for p in range(0, memory_spikes_copy.shape[0]): + for x in range(0, memory_spikes_copy.shape[2]): + for y in range(0, memory_spikes_copy.shape[3]): + spike = memory_spikes_copy[p, spike_time, x, y] + + if spike >= 0: + memory_phxy_copy[p, :, x, y] = memory_w_copy[spike, :] + else: + memory_phxy_copy[p, :, x, y] = 0.0 + + print(f"difference: {torch.abs(memory_phxy - memory_phxy_copy).max():.4e}") + print() + + +def test_kernel_phxy_one_over_sum_into_pxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug +): + print("test_kernel_phxy_one_over_sum_into_pxy") + # void test_kernel_phxy_one_over_sum_into_pxy( + # size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + # bool display_debug, size_t phxy_dim_c0, size_t phxy_dim_c1, + # size_t phxy_dim_c2, size_t pxy_dim_c0, size_t pxy_dim_c1, + # int64_t phxy_memory_addr, int64_t pxy_memory_addr); + + memory_a = torch.rand( + (number_of_pattern, h_dim, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_b = torch.rand( + (number_of_pattern, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + pxy_dim_c0 = int(dim_x * dim_y) + pxy_dim_c1 = int(dim_y) + + phxy_dim_c0 = int(h_dim * dim_x * dim_y) + phxy_dim_c1 = int(dim_x * dim_y) + phxy_dim_c2 = int(dim_y) + + memory_a_copy = memory_a.clone() + memory_b_copy = memory_b.clone() + + my_kernels = TestKernel() + my_kernels.test_kernel_phxy_one_over_sum_into_pxy( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + phxy_dim_c0, + phxy_dim_c1, + phxy_dim_c2, + pxy_dim_c0, + pxy_dim_c1, + memory_a.data_ptr(), + memory_b.data_ptr(), + ) + memory_temp_copy = memory_a_copy.sum(dim=1) + + memory_b_copy = torch.where(memory_temp_copy > 1e-10, 1.0 / memory_temp_copy, 0.0) + print( + "Remember: \nAn error of 0 is very unlikely due to different \nrandom order of values for the sum." + ) + print(f"difference: {torch.abs(memory_b - memory_b_copy).max():.4e}") + print() + + +def test_kernel_phxy_fill_with_h( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug +): + print("test_kernel_phxy_fill_with_h") + # void test_kernel_phxy_fill_with_h(size_t dim_x, size_t dim_y, + # size_t number_of_pattern, size_t h_dim, + # bool display_debug, size_t phxy_dim_c0, + # size_t phxy_dim_c1, size_t phxy_dim_c2, + # int64_t h_memory_addr, + # int64_t phxy_memory_addr); + + memory_a = torch.rand( + (number_of_pattern, h_dim, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_h = torch.rand( + (h_dim), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + phxy_dim_c0 = int(h_dim * dim_x * dim_y) + phxy_dim_c1 = int(dim_x * dim_y) + phxy_dim_c2 = int(dim_y) + + memory_a_copy = memory_a.clone() + memory_h_copy = memory_h.clone() + + my_kernels = TestKernel() + my_kernels.test_kernel_phxy_fill_with_h( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + phxy_dim_c0, + phxy_dim_c1, + phxy_dim_c2, + memory_h.data_ptr(), + memory_a.data_ptr(), + ) + for p in range(0, memory_a_copy.shape[0]): + for x in range(0, memory_a_copy.shape[2]): + for y in range(0, memory_a_copy.shape[3]): + memory_a_copy[p, :, x, y] = memory_h_copy + + print(f"difference: {torch.abs(memory_a - memory_a_copy).max():.4e}") + print() + + +def test_kernel_phxy_plus_pxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug +): + print("test_kernel_phxy_plus_pxy") + # void test_kernel_phxy_plus_pxy(size_t dim_x, size_t dim_y, + # size_t number_of_pattern, size_t h_dim, + # bool display_debug, size_t phxy_dim_c0, + # size_t phxy_dim_c1, size_t phxy_dim_c2, + # size_t pxy_dim_c0, size_t pxy_dim_c1, + # int64_t phxy_memory_addr, + # int64_t pxy_memory_addr); + + memory_a = torch.rand( + (number_of_pattern, h_dim, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_b = torch.rand( + (number_of_pattern, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + pxy_dim_c0 = int(dim_x * dim_y) + pxy_dim_c1 = int(dim_y) + + phxy_dim_c0 = int(h_dim * dim_x * dim_y) + phxy_dim_c1 = int(dim_x * dim_y) + phxy_dim_c2 = int(dim_y) + + memory_a_copy = memory_a.clone() + memory_b_copy = memory_b.clone() + + my_kernels = TestKernel() + my_kernels.test_kernel_phxy_plus_pxy( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + phxy_dim_c0, + phxy_dim_c1, + phxy_dim_c2, + pxy_dim_c0, + pxy_dim_c1, + memory_a.data_ptr(), + memory_b.data_ptr(), + ) + + memory_a_copy += memory_b_copy.unsqueeze(1) + + print(f"difference: {torch.abs(memory_a - memory_a_copy).max():.4e}") + print() + + +def test_kernel_phxy_times_pxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug +): + print("test_kernel_phxy_times_pxy") + # void test_kernel_phxy_times_pxy(size_t dim_x, size_t dim_y, + # size_t number_of_pattern, size_t h_dim, + # bool display_debug, size_t phxy_dim_c0, + # size_t phxy_dim_c1, size_t phxy_dim_c2, + # size_t pxy_dim_c0, size_t pxy_dim_c1, + # int64_t phxy_memory_addr, + # int64_t pxy_memory_addr); + + memory_a = torch.rand( + (number_of_pattern, h_dim, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_b = torch.rand( + (number_of_pattern, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + pxy_dim_c0 = int(dim_x * dim_y) + pxy_dim_c1 = int(dim_y) + + phxy_dim_c0 = int(h_dim * dim_x * dim_y) + phxy_dim_c1 = int(dim_x * dim_y) + phxy_dim_c2 = int(dim_y) + + memory_a_copy = memory_a.clone() + memory_b_copy = memory_b.clone() + + my_kernels = TestKernel() + my_kernels.test_kernel_phxy_times_pxy( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + phxy_dim_c0, + phxy_dim_c1, + phxy_dim_c2, + pxy_dim_c0, + pxy_dim_c1, + memory_a.data_ptr(), + memory_b.data_ptr(), + ) + + memory_a_copy *= memory_b_copy.unsqueeze(1) + + print(f"difference: {torch.abs(memory_a - memory_a_copy).max():.4e}") + print() + + +def test_kernel_phxy_times_phxy_equals_phxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug +): + print("test_kernel_phxy_times_phxy_equals_phxy") + # void test_kernel_phxy_times_phxy_equals_phxy(size_t dim_x, size_t dim_y, + # size_t number_of_pattern, + # size_t h_dim, bool display_debug, + # int64_t phxy_memory_a_addr, + # int64_t phxy_memory_b_addr, + # int64_t phxy_memory_out_addr); + + memory_a = torch.rand( + (number_of_pattern, h_dim, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_b = torch.rand( + (number_of_pattern, h_dim, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_out = torch.rand( + (number_of_pattern, h_dim, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_a_copy = memory_a.clone() + memory_b_copy = memory_b.clone() + + my_kernels = TestKernel() + my_kernels.test_kernel_phxy_times_phxy_equals_phxy( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + memory_a.data_ptr(), + memory_b.data_ptr(), + memory_out.data_ptr(), + ) + + memory_out_copy = memory_a_copy * memory_b_copy + + print(f"difference: {torch.abs(memory_out - memory_out_copy).max():.4e}") + print() + + +def test_kernel_phxy_plus_phxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug +): + print("test_kernel_pxy_time_pxy") + # void test_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y, + # size_t number_of_pattern, size_t h_dim, + # bool display_debug, + # int64_t phxy_memory_a_addr, + # int64_t phxy_memory_b_addr); + + memory_a = torch.rand( + (number_of_pattern, h_dim, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_b = torch.rand( + (number_of_pattern, h_dim, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + memory_a_copy = memory_a.clone() + memory_b_copy = memory_b.clone() + + my_kernels = TestKernel() + my_kernels.test_kernel_phxy_plus_phxy( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + memory_a.data_ptr(), + memory_b.data_ptr(), + ) + + memory_a_copy += memory_b_copy + + print(f"difference: {torch.abs(memory_a - memory_a_copy).max():.4e}") + print() + + +def test_kernel_pxy_time_pxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug +): + print("test_kernel_pxy_time_pxy") + + # void test_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y, + # size_t number_of_pattern, size_t h_dim, + # bool display_debug, int64_t pxy_memory_a_addr, + # int64_t pxy_memory_b_addr); + + epsilon_memory_a = torch.rand( + (number_of_pattern, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + epsilon_memory_b = torch.rand( + (number_of_pattern, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + epsilon_memory_a_copy = epsilon_memory_a.clone() + epsilon_memory_b_copy = epsilon_memory_b.clone() + + my_kernels = TestKernel() + my_kernels.test_kernel_pxy_time_pxy( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + epsilon_memory_a.data_ptr(), + epsilon_memory_b.data_ptr(), + ) + + epsilon_memory_a_copy *= epsilon_memory_b_copy + + print( + f"difference: {torch.abs(epsilon_memory_a - epsilon_memory_a_copy).max():.4e}" + ) + print() + + +def test_kernel_pxy_times_v( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug +): + print("test_kernel_pxy_times_v") + + # void test_kernel_pxy_times_v(size_t dim_x, size_t dim_y, + # size_t number_of_pattern, size_t h_dim, + # bool display_debug, float value, + # int64_t pxy_memory_addr); + + epsilon_memory = torch.rand( + (number_of_pattern, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + epsilon_memory_copy = epsilon_memory.clone() + value = float(math.pi) + + my_kernels = TestKernel() + my_kernels.test_kernel_pxy_times_v( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + value, + epsilon_memory.data_ptr(), + ) + + epsilon_memory_copy = epsilon_memory_copy * value + + print(f"difference: {torch.abs(epsilon_memory - epsilon_memory_copy).max():.4e}") + print() + + +def test_kernel_pxy_plus_v( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug +): + print("test_kernel_pxy_plus_v") + # void test_kernel_pxy_plus_v(size_t dim_x, size_t dim_y, + # size_t number_of_pattern, size_t h_dim, + # bool display_debug, float value, + # int64_t pxy_memory_addr); + + epsilon_memory = torch.rand( + (number_of_pattern, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + epsilon_memory_copy = epsilon_memory.clone() + value = float(math.pi) + + my_kernels = TestKernel() + my_kernels.test_kernel_pxy_plus_v( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + value, + epsilon_memory.data_ptr(), + ) + + epsilon_memory_copy = epsilon_memory_copy + value + + print(f"difference: {torch.abs(epsilon_memory - epsilon_memory_copy).max():.4e}") + print() + + +def test_kernel_pxy_set_to_v( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug +): + + print("test_kernel_pxy_set_to_v") + # void test_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y, + # size_t number_of_pattern, size_t h_dim, + # bool display_debug, float value, + # int64_t pxy_memory_addr); + + set_value = float(math.pi) + + epsilon_memory = torch.rand( + (number_of_pattern, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + my_kernels = TestKernel() + my_kernels.test_kernel_pxy_set_to_v( + dim_x, + dim_y, + number_of_pattern, + h_dim, + display_debug, + set_value, + epsilon_memory.data_ptr(), + ) + + print(f"difference: {torch.abs(epsilon_memory - set_value).max():.4e}") + print() + + +def test_kernel_pxy_reciprocal( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug +): + print("test_kernel_pxy_reciprocal") + # void test_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y, + # size_t number_of_pattern, size_t h_dim, + # bool display_debug, int64_t pxy_memory_addr); + + epsilon_memory = torch.rand( + (number_of_pattern, dim_x, dim_y), + dtype=torch.float32, + device=torch.device("cuda:0"), + ) + + epsilon_memory_copy = epsilon_memory.clone() + + my_kernels = TestKernel() + my_kernels.test_kernel_pxy_reciprocal( + dim_x, dim_y, number_of_pattern, h_dim, display_debug, epsilon_memory.data_ptr() + ) + + epsilon_memory_copy = 1.0 / epsilon_memory_copy + + print(f"difference: {torch.abs(epsilon_memory - epsilon_memory_copy).max():.4e}") + print() + + +if __name__ == "__main__": + input_set = 0 + + for test_id in range(0, 13): + print(f"Test-ID: {test_id}") + + number_of_spikes: int = int(1600) + spike_time: int = int(random.random() * number_of_spikes) + + if input_set == 0: + h_dim: int = int(32) + s_dim: int = int(1 * 5 * 5) + number_of_pattern: int = int(24) + dim_x: int = int(20) + dim_y: int = int(20) + display_debug: int = bool(False) + else: + h_dim = int(10) + s_dim = int(32 * 20 * 20) + number_of_pattern = int(24) + dim_x = int(1) + dim_y = int(1) + display_debug = bool(False) + + if test_id == 0: + test_kernel_pxy_reciprocal( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug + ) + elif test_id == 1: + test_kernel_pxy_set_to_v( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug + ) + elif test_id == 2: + test_kernel_pxy_plus_v( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug + ) + elif test_id == 3: + test_kernel_pxy_times_v( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug + ) + elif test_id == 4: + test_kernel_pxy_time_pxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug + ) + elif test_id == 5: + test_kernel_phxy_plus_phxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug + ) + elif test_id == 6: + test_kernel_phxy_times_phxy_equals_phxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug + ) + elif test_id == 7: + test_kernel_phxy_times_pxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug + ) + elif test_id == 8: + test_kernel_phxy_plus_pxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug + ) + elif test_id == 9: + test_kernel_phxy_fill_with_h( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug + ) + elif test_id == 10: + test_kernel_phxy_one_over_sum_into_pxy( + h_dim, s_dim, number_of_pattern, dim_x, dim_y, display_debug + ) + elif test_id == 11: + test_kernel_phxy_fill_with_spike_selected_w( + h_dim, + s_dim, + number_of_pattern, + dim_x, + dim_y, + display_debug, + spike_time, + number_of_spikes, + ) + elif test_id == 12: + test_kernel_pxy_times_spike_selected_sxy( + h_dim, + s_dim, + number_of_pattern, + dim_x, + dim_y, + display_debug, + spike_time, + number_of_spikes, + )