From b44e66d461a91480ed240832c02cc2b89828d714 Mon Sep 17 00:00:00 2001
From: David Rotermund <54365609+davrot@users.noreply.github.com>
Date: Thu, 2 Feb 2023 19:15:39 +0100
Subject: [PATCH] Delete network/CPP_Cuda directory

---
 network/CPP_Cuda/HDynamicCNNManyIP.cu          | 736 ------------------
 network/CPP_Cuda/HDynamicCNNManyIP.h           | 125 ---
 network/CPP_Cuda/Makefile                      |  67 --
 network/CPP_Cuda/MultiApp.cu                   | 313 --------
 network/CPP_Cuda/MultiApp.h                    |  39 -
 network/CPP_Cuda/PyHDynamicCNNManyIP.cpp       |  18 -
 network/CPP_Cuda/PyHDynamicCNNManyIP.pyi       |  18 -
 network/CPP_Cuda/PyMultiApp.cpp                |  14 -
 network/CPP_Cuda/PyMultiApp.pyi                |  18 -
 .../CPP_Cuda/PySpikeGeneration2DManyIP.cpp     |  19 -
 .../CPP_Cuda/PySpikeGeneration2DManyIP.pyi     |  18 -
 network/CPP_Cuda/SpikeGeneration2DManyIP.cu    | 411 ----------
 network/CPP_Cuda/SpikeGeneration2DManyIP.h     |  91 ---
 .../approximation_multiplication_function.cpp  | 138 ----
 network/CPP_Cuda/error_term.cpp                |  28 -
 ...u_approximation_multiplication_function.cu  | 102 ---
 network/CPP_Cuda/gpu_error_term.cu             |  28 -
 network/CPP_Cuda/pybind11_auto_pyi.py          |  23 -
 18 files changed, 2206 deletions(-)
 delete mode 100644 network/CPP_Cuda/HDynamicCNNManyIP.cu
 delete mode 100644 network/CPP_Cuda/HDynamicCNNManyIP.h
 delete mode 100644 network/CPP_Cuda/Makefile
 delete mode 100644 network/CPP_Cuda/MultiApp.cu
 delete mode 100644 network/CPP_Cuda/MultiApp.h
 delete mode 100644 network/CPP_Cuda/PyHDynamicCNNManyIP.cpp
 delete mode 100644 network/CPP_Cuda/PyHDynamicCNNManyIP.pyi
 delete mode 100644 network/CPP_Cuda/PyMultiApp.cpp
 delete mode 100644 network/CPP_Cuda/PyMultiApp.pyi
 delete mode 100644 network/CPP_Cuda/PySpikeGeneration2DManyIP.cpp
 delete mode 100644 network/CPP_Cuda/PySpikeGeneration2DManyIP.pyi
 delete mode 100644 network/CPP_Cuda/SpikeGeneration2DManyIP.cu
 delete mode 100644 network/CPP_Cuda/SpikeGeneration2DManyIP.h
 delete mode 100644 network/CPP_Cuda/approximation_multiplication_function.cpp
 delete mode 100644 network/CPP_Cuda/error_term.cpp
 delete mode 100644 network/CPP_Cuda/gpu_approximation_multiplication_function.cu
 delete mode 100644 network/CPP_Cuda/gpu_error_term.cu
 delete mode 100644 network/CPP_Cuda/pybind11_auto_pyi.py

diff --git a/network/CPP_Cuda/HDynamicCNNManyIP.cu b/network/CPP_Cuda/HDynamicCNNManyIP.cu
deleted file mode 100644
index 723b66f..0000000
--- a/network/CPP_Cuda/HDynamicCNNManyIP.cu
+++ /dev/null
@@ -1,736 +0,0 @@
-#include "HDynamicCNNManyIP.h"
-
-#include <omp.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <algorithm>
-#include <cassert>
-#include <iostream>
-
-
-HDynamicCNNManyIP::HDynamicCNNManyIP()
-{
-
-};
-
-HDynamicCNNManyIP::~HDynamicCNNManyIP()
-{
-
-};
-
-bool HDynamicCNNManyIP::update_entrypoint(
-    int64_t h_pointer_addr,
-    int64_t h_dim_0,
-    int64_t h_dim_1,
-    int64_t h_dim_2,
-    int64_t h_dim_3,
-    int64_t epsilon_xy_pointer_addr,
-    int64_t epsilon_xy_dim_0,
-    int64_t epsilon_xy_dim_1,
-    int64_t epsilon_xy_dim_2,
-    int64_t epsilon_t_pointer_addr,
-    int64_t epsilon_t_dim_0,
-    int64_t weights_pointer_addr,
-    int64_t weights_dim_0,
-    int64_t weights_dim_1,
-    int64_t input_pointer_addr,
-    int64_t input_dim_0,
-    int64_t input_dim_1,
-    int64_t input_dim_2,
-    int64_t input_dim_3,
-    int64_t init_vector_pointer_addr,
-    int64_t init_vector_dim_0,
-    int64_t number_of_processes,
-    float forgetting_offset,
-    int64_t gpu_tuning_factor)
-{
-
-    size_t number_of_pattern = input_dim_0;
-
-    size_t h_dim = init_vector_dim_0;
-    float* h_init_ptr = (float*)init_vector_pointer_addr;
-    assert((h_init_ptr != nullptr));
-    assert((h_dim > 0));
-
-    float* h_pointer = (float*)h_pointer_addr;
-    assert((h_pointer != nullptr));
-    assert((h_dim_0 > 0));
-    assert((h_dim_1 > 0));
-    assert((h_dim_2 > 0));
-    assert((h_dim_3 > 0));
-
-    size_t h_dim_c0 = h_dim_1 * h_dim_2 * h_dim_3;
-    size_t h_dim_c1 = h_dim_2 * h_dim_3;
-    size_t h_dim_c2 = h_dim_3;
-
-    float* epsilon_xy_pointer = (float*)epsilon_xy_pointer_addr;
-    assert((epsilon_xy_pointer != nullptr));
-    assert((epsilon_xy_dim_0 > 0));
-    assert((epsilon_xy_dim_1 > 0));
-
-    size_t epsilon_xy_dim_c0 = epsilon_xy_dim_2 * epsilon_xy_dim_1;
-    size_t epsilon_xy_dim_c1 = epsilon_xy_dim_2;
-
-    float* epsilon_t_pointer = (float*)epsilon_t_pointer_addr;
-    assert((epsilon_t_pointer != nullptr));
-    assert((epsilon_t_dim_0 > 0));
-
-    float* weights_pointer = (float*)weights_pointer_addr;
-    assert((weights_pointer != nullptr));
-    assert((weights_dim_0 > 0));
-    assert((weights_dim_1 > 0));
-
-    size_t weights_dim_c0 = weights_dim_1;
-
-    int64_t* input_pointer = (int64_t*)input_pointer_addr;
-    assert((input_pointer != nullptr));
-    assert((input_dim_0 > 0));
-    assert((input_dim_1 > 0));
-    assert((input_dim_2 > 0));
-    assert((input_dim_3 > 0));
-
-    size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
-    size_t input_dim_c1 = input_dim_2 * input_dim_3;
-    size_t input_dim_c2 = input_dim_3;
-
-    assert((h_dim == weights_dim_1));
-    size_t number_of_spikes = input_dim_1;
-    size_t dim_x = input_dim_2;
-    size_t dim_y = input_dim_3;
-
-    float forgetting_offset_local = forgetting_offset / static_cast<float>(h_dim);
-
-
-    // --------------------
-    if (number_of_processes > 0)
-    {
-        omp_set_num_threads(number_of_processes);
-
-        size_t pattern_id;
-#pragma omp parallel for
-        for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
-        {
-            update(
-                h_init_ptr,
-                h_pointer,
-                h_dim_c0,
-                h_dim_c1,
-                h_dim_c2,
-                h_dim,
-                epsilon_xy_pointer,
-                epsilon_xy_dim_c0,
-                epsilon_xy_dim_c1,
-                epsilon_t_pointer,
-                weights_pointer,
-                weights_dim_c0,
-                input_pointer,
-                input_dim_c0,
-                input_dim_c1,
-                input_dim_c2,
-                number_of_spikes,
-                dim_x,
-                dim_y,
-                forgetting_offset,
-                forgetting_offset_local,
-                pattern_id);
-        }
-    }
-    else
-    {
-        gpu_update(
-            h_init_ptr,
-            h_pointer,
-            h_dim_c0,
-            h_dim_c1,
-            h_dim_c2,
-            h_dim,
-            epsilon_xy_pointer,
-            epsilon_xy_dim_c0,
-            epsilon_xy_dim_c1,
-            epsilon_t_pointer,
-            weights_pointer,
-            weights_dim_c0,
-            input_pointer,
-            input_dim_c0,
-            input_dim_c1,
-            input_dim_c2,
-            number_of_spikes,
-            dim_x,
-            dim_y,
-            forgetting_offset,
-            forgetting_offset_local,
-            number_of_pattern,
-            gpu_tuning_factor);
-
-    }
-    return true;
-};
-
-
-bool HDynamicCNNManyIP::update(
-    float* h_init_ptr,
-    float* h_pointer,
-    size_t h_dim_c0,
-    size_t h_dim_c1,
-    size_t h_dim_c2,
-    size_t h_dim,
-    float* epsilon_xy_pointer,
-    size_t epsilon_xy_dim_c0,
-    size_t epsilon_xy_dim_c1,
-    float* epsilon_t_pointer,
-    float* weights_pointer,
-    size_t weights_dim_c0,
-    int64_t* input_pointer,
-    size_t input_dim_c0,
-    size_t input_dim_c1,
-    size_t input_dim_c2,
-    size_t number_of_spikes,
-    size_t dim_x,
-    size_t dim_y,
-    float forgetting_offset,
-    float forgetting_offset_local,
-    size_t pattern_id)
-{
-
-    float* h_ptr;
-    float* epsilon_xy_ptr;
-    int64_t* input_ptr;
-
-    size_t counter_x;
-    size_t counter_y;
-
-    for (counter_x = 0; counter_x < dim_x; counter_x++)
-    {
-        for (counter_y = 0; counter_y < dim_y; counter_y++)
-        {
-            epsilon_xy_ptr = epsilon_xy_pointer +
-                counter_x * epsilon_xy_dim_c1 + counter_y;
-
-            h_ptr = h_pointer +
-                pattern_id * h_dim_c0 + counter_x * h_dim_c2 + counter_y;
-
-            input_ptr = input_pointer +
-                pattern_id * input_dim_c0 + counter_x * input_dim_c2 + counter_y;
-
-            update_one_ip(
-                h_init_ptr,
-                h_ptr,
-                h_dim_c1,
-                h_dim,
-                weights_pointer,
-                weights_dim_c0,
-                input_ptr,
-                input_dim_c1,
-                epsilon_xy_ptr,
-                epsilon_xy_dim_c0,
-                epsilon_t_pointer,
-                number_of_spikes,
-                forgetting_offset,
-                forgetting_offset_local);
-
-        }
-    }
-
-    return true;
-};
-
-void HDynamicCNNManyIP::update_one_ip(
-    float* h_init_ptr,
-    float* h_pointer,
-    size_t h_dim_c1,
-    size_t h_dim,
-    float* weights_pointer,
-    size_t weights_dim_c0,
-    int64_t* input_pointer,
-    size_t input_dim_c1,
-    float* epsilon_xy_pointer,
-    size_t epsilon_xy_dim_c0,
-    float* epsilon_t_pointer,
-    size_t number_of_spikes,
-    float forgetting_offset,
-    float forgetting_offset_local)
-{
-
-    float* h_temp = new float[h_dim];
-    float* h_subsegment = new float[h_dim];
-
-    memcpy(h_subsegment, h_init_ptr, sizeof(float) * h_dim);
-
-    size_t counter_spike;
-    size_t counter;
-
-    float h_temp_sum;
-    float temp_value;
-
-    float epsilon_subsegment;
-    float epsilon_scale = 1.0;
-
-    int64_t* spike;
-    float* w_ptr;
-
-    for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++)
-    {
-        if (epsilon_scale > 1E10)
-        {
-            temp_value = 1.0 / epsilon_scale;
-
-#pragma omp simd
-            for (counter = 0; counter < h_dim; counter++)
-            {
-                h_subsegment[counter] *= temp_value;
-            }
-
-            epsilon_scale = 1.0;
-        }
-
-        spike = input_pointer + counter_spike * input_dim_c1;
-
-        if (*spike >= 0)
-        {
-            epsilon_subsegment =
-                epsilon_xy_pointer[*spike * epsilon_xy_dim_c0] * epsilon_t_pointer[counter_spike];
-
-            w_ptr = weights_pointer + *spike * weights_dim_c0;
-
-            memcpy(h_temp, h_subsegment, sizeof(float) * h_dim);
-
-#pragma omp simd
-            for (counter = 0; counter < h_dim; counter++)
-            {
-                h_temp[counter] *= w_ptr[counter];
-            }
-
-            h_temp_sum = 0.0;
-#pragma omp simd reduction(+ : h_temp_sum)
-            for (counter = 0; counter < h_dim; counter++)
-            {
-                h_temp_sum += h_temp[counter];
-            }
-
-            if (h_temp_sum > 1E-10)
-            {
-                temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum;
-
-#pragma omp simd
-                for (counter = 0; counter < h_dim; counter++)
-                {
-                    h_temp[counter] *= temp_value;
-                }
-
-#pragma omp simd
-                for (counter = 0; counter < h_dim; counter++)
-                {
-                    h_subsegment[counter] += h_temp[counter];
-                }
-
-                if (forgetting_offset_local > 0.0)
-                {
-                    temp_value =
-                        epsilon_scale * epsilon_subsegment * forgetting_offset_local;
-
-#pragma omp simd
-                    for (counter = 0; counter < h_dim; counter++)
-                    {
-                        h_subsegment[counter] += temp_value;
-                    }
-
-                    epsilon_scale *=
-                        1.0 + epsilon_subsegment * (1.0 + forgetting_offset);
-                }
-                else
-                {
-                    epsilon_scale *= 1.0 + epsilon_subsegment * 1.0;
-                }
-            }
-        }
-    }
-
-    temp_value = 1.0 / epsilon_scale;
-#pragma omp simd
-    for (counter = 0; counter < h_dim; counter++)
-    {
-        h_pointer[counter * h_dim_c1] =
-            h_subsegment[counter] * temp_value;
-    }
-
-    delete[] h_temp;
-    delete[] h_subsegment;
-
-    return;
-};
-
-__device__ void gpu_update_one_ip(
-    float* __restrict__ h_init_ptr,
-    float* __restrict__ h_pointer,
-    size_t h_dim_c1,
-    size_t h_dim,
-    float* __restrict__ weights_pointer,
-    size_t weights_dim_c0,
-    int64_t* input_pointer,
-    size_t input_dim_c1,
-    float* __restrict__ epsilon_xy_pointer,
-    size_t epsilon_xy_dim_c0,
-    float* __restrict__ epsilon_t_pointer,
-    size_t number_of_spikes,
-    float forgetting_offset,
-    float forgetting_offset_local,
-    float* __restrict__ h_temp,
-    float* __restrict__ h_subsegment
-)
-{
-
-    size_t counter_spike;
-    size_t counter;
-
-    float h_temp_sum;
-    float temp_value;
-
-    float epsilon_subsegment;
-    float epsilon_scale = 1.0;
-
-    int64_t* spike;
-    float* w_ptr;
-
-    // float* h_temp = new float[h_dim];
-    // float* h_subsegment = new float[h_dim];
-
-    // Initialize the sub-segment
-    for (counter = 0; counter < h_dim; counter++)
-    {
-        h_subsegment[counter] = h_init_ptr[counter];
-    }
-
-    for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++)
-    {
-        if (epsilon_scale > 1E10)
-        {
-            temp_value = 1.0 / epsilon_scale;
-
-            for (counter = 0; counter < h_dim; counter++)
-            {
-                h_subsegment[counter] *= temp_value;
-            }
-
-            epsilon_scale = 1.0;
-        }
-
-        spike = input_pointer + counter_spike * input_dim_c1;
-
-        if (*spike >= 0)
-        {
-            epsilon_subsegment =
-                epsilon_xy_pointer[*spike * epsilon_xy_dim_c0] * epsilon_t_pointer[counter_spike];
-
-            w_ptr = weights_pointer + *spike * weights_dim_c0;
-
-            for (counter = 0; counter < h_dim; counter++)
-            {
-                h_temp[counter] = h_subsegment[counter] * w_ptr[counter];
-            }
-
-            h_temp_sum = 0.0;
-
-            for (counter = 0; counter < h_dim; counter++)
-            {
-                h_temp_sum += h_temp[counter];
-            }
-
-            if (h_temp_sum > 1E-10)
-            {
-                temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum;
-
-                for (counter = 0; counter < h_dim; counter++)
-                {
-                    h_temp[counter] *= temp_value;
-                }
-
-                for (counter = 0; counter < h_dim; counter++)
-                {
-                    h_subsegment[counter] += h_temp[counter];
-                }
-
-                if (forgetting_offset_local > 0.0)
-                {
-                    temp_value =
-                        epsilon_scale * epsilon_subsegment * forgetting_offset_local;
-
-                    for (counter = 0; counter < h_dim; counter++)
-                    {
-                        h_subsegment[counter] += temp_value;
-                    }
-
-                    epsilon_scale *=
-                        1.0 + epsilon_subsegment * (1.0 + forgetting_offset);
-                }
-                else
-                {
-                    epsilon_scale *= 1.0 + epsilon_subsegment * 1.0;
-                }
-            }
-        }
-    }
-
-    temp_value = 1.0 / epsilon_scale;
-
-    for (counter = 0; counter < h_dim; counter++)
-    {
-        h_pointer[counter * h_dim_c1] =
-            h_subsegment[counter] * temp_value;
-    }
-
-    // delete[] h_temp;
-    // delete[] h_subsegment;
-
-    return;
-};
-
-__global__ void kernel_spike_generation(
-    float* __restrict__ h_init_ptr,
-    float* __restrict__ h_pointer,
-    size_t h_dim_c0,
-    size_t h_dim_c1,
-    size_t h_dim_c2,
-    size_t h_dim,
-    float* __restrict__ weights_pointer,
-    size_t weights_dim_c0,
-    int64_t* __restrict__ input_pointer,
-    size_t input_dim_c0,
-    size_t input_dim_c1,
-    size_t input_dim_c2,
-    float* __restrict__ epsilon_xy_pointer,
-    size_t epsilon_xy_dim_c0,
-    size_t epsilon_xy_dim_c1,
-    float* __restrict__ epsilon_t_pointer,
-    size_t number_of_spikes,
-    float forgetting_offset,
-    float forgetting_offset_local,
-    size_t dim_x,
-    size_t dim_y,
-    size_t dim_xy,
-    size_t max_threadable_tasks,
-    float* __restrict__ temp_memory_a,
-    float* __restrict__ temp_memory_b
-)
-{
-
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (idx < max_threadable_tasks)
-    {
-        float* h_ptr;
-        float* epsilon_xy_ptr;
-        int64_t* input_ptr;
-
-        float* temp_memory_ptr_a = temp_memory_a + idx * h_dim;
-        float* temp_memory_ptr_b = temp_memory_b + idx * h_dim;
-
-        // int pattern_id = idx;
-        int pattern_id = idx / dim_xy;
-        int position_xy = idx - (pattern_id * dim_xy);
-
-        // size_t position_x = blockIdx.y;
-        // size_t position_y = blockIdx.z;
-        size_t position_x = position_xy / dim_y;
-        size_t position_y = position_xy - (position_x * dim_y);
-
-        epsilon_xy_ptr = epsilon_xy_pointer +
-            position_x * epsilon_xy_dim_c1 + position_y;
-
-        h_ptr = h_pointer +
-            pattern_id * h_dim_c0 + position_x * h_dim_c2 + position_y;
-
-        input_ptr = input_pointer +
-            pattern_id * input_dim_c0 + position_x * input_dim_c2 + position_y;
-
-        gpu_update_one_ip(
-            h_init_ptr,
-            h_ptr,
-            h_dim_c1,
-            h_dim,
-            weights_pointer,
-            weights_dim_c0,
-            input_ptr,
-            input_dim_c1,
-            epsilon_xy_ptr,
-            epsilon_xy_dim_c0,
-            epsilon_t_pointer,
-            number_of_spikes,
-            forgetting_offset,
-            forgetting_offset_local,
-            temp_memory_ptr_a,
-            temp_memory_ptr_b
-        );
-
-    }
-
-};
-
-// Let's face it... We need a better way to parallelize it...
-bool HDynamicCNNManyIP::gpu_update(
-    float* h_init_ptr,
-    float* h_pointer,
-    size_t h_dim_c0,
-    size_t h_dim_c1,
-    size_t h_dim_c2,
-    size_t h_dim,
-    float* epsilon_xy_pointer,
-    size_t epsilon_xy_dim_c0,
-    size_t epsilon_xy_dim_c1,
-    float* epsilon_t_pointer,
-    float* weights_pointer,
-    size_t weights_dim_c0,
-    int64_t* input_pointer,
-    size_t input_dim_c0,
-    size_t input_dim_c1,
-    size_t input_dim_c2,
-    size_t number_of_spikes,
-    size_t dim_x,
-    size_t dim_y,
-    float forgetting_offset,
-    float forgetting_offset_local,
-    size_t number_of_pattern,
-    size_t gpu_tuning_factor)
-{
-
-    cudaError_t status;
-    assert((dim_x < 65535));
-    assert((dim_y < 65535));
-
-    // // //////////////////////////////////////
-    // // Get info about the device
-    // // //////////////////////////////////////
-
-    // int device;
-    // cudaDeviceProp prop;
-
-    // status = cudaGetDevice(&device);
-    // assert((status == cudaSuccess));
-    // // std::cout << "Device ID: " << device << std::endl;
-
-    // status = cudaGetDeviceProperties(&prop, device);
-    // assert((status == cudaSuccess));
-    // // std::cout << "Device name: " << prop.name << std::endl;
-
-    // int _cuda_heap_size_in_mb = 16;
-    // status = cudaDeviceSetLimit(cudaLimitMallocHeapSize, _cuda_heap_size_in_mb * (1 << 20));
-    // assert((status == cudaSuccess));
-
-    // size_t pValue;
-    // cudaDeviceGetLimit(&pValue, cudaLimitMallocHeapSize);
-    // std::cout << pValue << " " << (pValue/(2*4*h_dim)) << std::endl;
-    // exit(1);
-
-
-    // //////////////////////////////////////
-    // Calculate the distribution on the GPU
-    // //////////////////////////////////////
-
-    int min_grid_size;
-    int block_size;
-    int grid_size;
-
-    size_t dynamic_s_mem_size = 0;
-    size_t max_threadable_tasks = number_of_pattern * dim_x * dim_y;
-
-    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=blocksize#occupancy-calculator
-    status = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
-        (void*)kernel_spike_generation,
-        dynamic_s_mem_size, max_threadable_tasks);
-    assert((status == cudaSuccess));
-
-    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
-    // Maximum dimensionality of grid of thread blocks: 3
-    // Maximum x -dimension of a grid of thread blocks: (2^31)-1
-    // Maximum y- or z-dimension of a grid of thread blocks: 65535
-
-    // Reduce the automatic block size with our guess
-    if ((gpu_tuning_factor > 0) && (gpu_tuning_factor < block_size))
-    {
-        block_size = int(gpu_tuning_factor);
-    }
-    // Round up according to array size
-    // (I will separate x and y into other grid dimensions soon)
-    // grid_size = (number_of_pattern + block_size - 1) / block_size;
-    grid_size = (max_threadable_tasks + block_size - 1) / block_size;
-
-    // std::cout << min_grid_size << std::endl;
-    // std::cout << grid_size << std::endl;
-    // std::cout << block_size << std::endl;
-    // std::cout << max_threadable_tasks << std::endl;
-
-    //dim3 grid(grid_size, dim_x, dim_y);
-
-    float* temp_memory_a = nullptr;
-    status = cudaMalloc((void**)&temp_memory_a, h_dim * max_threadable_tasks * sizeof(float));
-    assert((status == cudaSuccess));
-
-    float* temp_memory_b = nullptr;
-    status = cudaMalloc((void**)&temp_memory_b, h_dim * max_threadable_tasks * sizeof(float));
-    assert((status == cudaSuccess));
-
-
-    //kernel_spike_generation<<<grid, block_size>>>(
-    kernel_spike_generation<<<grid_size, block_size>>>(
-        h_init_ptr,
-        h_pointer,
-        h_dim_c0,
-        h_dim_c1,
-        h_dim_c2,
-        h_dim,
-        weights_pointer,
-        weights_dim_c0,
-        input_pointer,
-        input_dim_c0,
-        input_dim_c1,
-        input_dim_c2,
-        epsilon_xy_pointer,
-        epsilon_xy_dim_c0,
-        epsilon_xy_dim_c1,
-        epsilon_t_pointer,
-        number_of_spikes,
-        forgetting_offset,
-        forgetting_offset_local,
-        dim_x,
-        dim_y,
-        (dim_x * dim_y),
-        //number_of_pattern
-        max_threadable_tasks,
-        temp_memory_a,
-        temp_memory_b
-    );
-
-    status = cudaDeviceSynchronize();
-    assert((status == cudaSuccess));
-
-    status = cudaFree(temp_memory_a);
-    assert((status == cudaSuccess));
-
-    status = cudaFree(temp_memory_b);
-    assert((status == cudaSuccess));
-
-
-    return true;
-};
-
-
-void HDynamicCNNManyIP::gpu_occupancy_export(
-    size_t dim_x,
-    size_t dim_y,
-    size_t number_of_pattern,
-    size_t h_dim,
-    int64_t setting_memory_addr,
-    size_t setting_dim_0,
-    size_t setting_dim_1)
-{
-    return;
-};
-
-void HDynamicCNNManyIP::gpu_occupancy_import(
-    int64_t setting_memory_addr,
-    size_t setting_dim_0,
-    size_t setting_dim_1
-)
-{
-    return;
-};
\ No newline at end of file
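For reference, the per-spike update that update_one_ip and gpu_update_one_ip implement can be stated compactly. This is a reading of the loops above, not a formula taken from the deleted sources: with eps_s = epsilon_xy[spike] * epsilon_t[spike], lambda = forgetting_offset, H = h_dim, and w_s the weight column selected by the spiking input (spikes with <h, w_s> <= 1e-10 are skipped),

    h \leftarrow \frac{h + \epsilon_s \left( \frac{h \odot w_s}{\langle h,\, w_s \rangle} + \frac{\lambda}{H}\,\mathbf{1} \right)}{1 + \epsilon_s\,(1 + \lambda)}

The running factor epsilon_scale only accumulates the denominators so that the division happens once per input position (with an occasional rescale when it exceeds 1E10) instead of once per spike.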
diff --git a/network/CPP_Cuda/HDynamicCNNManyIP.h b/network/CPP_Cuda/HDynamicCNNManyIP.h
deleted file mode 100644
index 2244942..0000000
--- a/network/CPP_Cuda/HDynamicCNNManyIP.h
+++ /dev/null
@@ -1,125 +0,0 @@
-#ifndef SRC_HDYNAMICCNNMANYIP_H_
-#define SRC_HDYNAMICCNNMANYIP_H_
-
-#include <stdio.h>
-
-#include <cstddef>
-#include <cstdint>
-
-class HDynamicCNNManyIP
-{
-    public:
-    HDynamicCNNManyIP();
-    ~HDynamicCNNManyIP();
-
-    bool update_entrypoint(
-        int64_t h_pointer_addr,
-        int64_t h_dim_0,
-        int64_t h_dim_1,
-        int64_t h_dim_2,
-        int64_t h_dim_3,
-        int64_t epsilon_xy_pointer_addr,
-        int64_t epsilon_xy_dim_0,
-        int64_t epsilon_xy_dim_1,
-        int64_t epsilon_xy_dim_2,
-        int64_t epsilon_t_pointer_addr,
-        int64_t epsilon_t_dim_0,
-        int64_t weights_pointer_addr,
-        int64_t weights_dim_0,
-        int64_t weights_dim_1,
-        int64_t input_pointer_addr,
-        int64_t input_dim_0,
-        int64_t input_dim_1,
-        int64_t input_dim_2,
-        int64_t input_dim_3,
-        int64_t init_vector_pointer_addr,
-        int64_t init_vector_dim_0,
-        int64_t number_of_processes,
-        float forgetting_offset,
-        int64_t gpu_tuning_factor);
-
-    void gpu_occupancy_export(
-        size_t dim_x,
-        size_t dim_y,
-        size_t number_of_pattern,
-        size_t h_dim,
-        int64_t setting_memory_addr,
-        size_t setting_dim_0,
-        size_t setting_dim_1);
-
-    void gpu_occupancy_import(
-        int64_t setting_memory_addr,
-        size_t setting_dim_0,
-        size_t setting_dim_1);
-
-    private:
-
-    bool update(
-        float* h_init_ptr,
-        float* h_pointer,
-        size_t h_dim_c0,
-        size_t h_dim_c1,
-        size_t h_dim_c2,
-        size_t h_dim,
-        float* epsilon_xy_pointer,
-        size_t epsilon_xy_dim_c0,
-        size_t epsilon_xy_dim_c1,
-        float* epsilon_t_pointer,
-        float* weights_pointer,
-        size_t weights_dim_c0,
-        int64_t* input_pointer,
-        size_t input_dim_c0,
-        size_t input_dim_c1,
-        size_t input_dim_c2,
-        size_t number_of_spikes,
-        size_t dim_x,
-        size_t dim_y,
-        float forgetting_offset,
-        float forgetting_offset_local,
-        size_t pattern_id);
-
-    void update_one_ip(
-        float* h_init_ptr,
-        float* h_pointer,
-        size_t h_dim_c1,
-        size_t h_dim,
-        float* weights_pointer,
-        size_t weights_dim_c0,
-        int64_t* input_pointer,
-        size_t input_dim_c1,
-        float* epsilon_xy_pointer,
-        size_t epsilon_xy_dim_c0,
-        float* epsilon_t_pointer,
-        size_t number_of_spikes,
-        float forgetting_offset,
-        float forgetting_offset_local);
-
-    bool gpu_update(
-        float* h_init_ptr,
-        float* h_pointer,
-        size_t h_dim_c0,
-        size_t h_dim_c1,
-        size_t h_dim_c2,
-        size_t h_dim,
-        float* epsilon_xy_pointer,
-        size_t epsilon_xy_dim_c0,
-        size_t epsilon_xy_dim_c1,
-        float* epsilon_t_pointer,
-        float* weights_pointer,
-        size_t weights_dim_c0,
-        int64_t* input_pointer,
-        size_t input_dim_c0,
-        size_t input_dim_c1,
-        size_t input_dim_c2,
-        size_t number_of_spikes,
-        size_t dim_x,
-        size_t dim_y,
-        float forgetting_offset,
-        float forgetting_offset_local,
-        size_t number_of_pattern,
-        size_t gpu_tuning_factor);
-
-
-};
-
-#endif /* SRC_HDYNAMICCNNMANYIP_H_ */
\ No newline at end of file
diff --git a/network/CPP_Cuda/Makefile b/network/CPP_Cuda/Makefile
deleted file mode 100644
index 28b0600..0000000
--- a/network/CPP_Cuda/Makefile
+++ /dev/null
@@ -1,67 +0,0 @@
-# Change to your python bin directory (tested with Python 3.10.4)
-PYBIN=~/P3.10GPU/bin/
-NVCC=/usr/local/cuda-12/bin/nvcc -allow-unsupported-compiler
-CC=/usr/lib64/ccache/clang++
-
-PYBIND11INCLUDE=`$(PYBIN)python3 -m pybind11 --includes`
-PARAMETERS_O= -O3 -std=c++14 $(PYBIND11INCLUDE) -ccbin=$(CC) \
-	-Xcompiler "-fPIC -Wall -fopenmp=libomp"
-
-PARAMETERS_Linker=-Xcompiler "-shared -lm -lomp -lstdc++ -Wall"
-
-PYPOSTFIX=`$(PYBIN)python3-config --extension-suffix`
-
-
-all: PyHDynamicCNNManyIP \
-	PySpikeGeneration2DManyIP \
-	PyMultiApp
-
-#######################
-
-HDynamicCNNManyIP.o: HDynamicCNNManyIP.h HDynamicCNNManyIP.cu
-	$(NVCC) $(PARAMETERS_O) -c HDynamicCNNManyIP.cu -o HDynamicCNNManyIP.o
-
-PyHDynamicCNNManyIP.o: HDynamicCNNManyIP.h PyHDynamicCNNManyIP.cpp
-	$(NVCC) $(PARAMETERS_O) -c PyHDynamicCNNManyIP.cpp -o PyHDynamicCNNManyIP.o
-
-PyHDynamicCNNManyIP: HDynamicCNNManyIP.o PyHDynamicCNNManyIP.o
-	$(NVCC) $(PARAMETERS_Linker) -o PyHDynamicCNNManyIP HDynamicCNNManyIP.o PyHDynamicCNNManyIP.o
-	cp PyHDynamicCNNManyIP PyHDynamicCNNManyIP$(PYPOSTFIX)
-	$(PYBIN)python3 pybind11_auto_pyi.py
-
-#######################
-
-SpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h SpikeGeneration2DManyIP.cu
-	$(NVCC) $(PARAMETERS_O) -c SpikeGeneration2DManyIP.cu -o SpikeGeneration2DManyIP.o
-
-PySpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h PySpikeGeneration2DManyIP.cpp
-	$(NVCC) $(PARAMETERS_O) -c PySpikeGeneration2DManyIP.cpp -o PySpikeGeneration2DManyIP.o
-
-PySpikeGeneration2DManyIP: SpikeGeneration2DManyIP.o PySpikeGeneration2DManyIP.o
-	$(NVCC) $(PARAMETERS_Linker) -o PySpikeGeneration2DManyIP SpikeGeneration2DManyIP.o PySpikeGeneration2DManyIP.o
-	cp PySpikeGeneration2DManyIP PySpikeGeneration2DManyIP$(PYPOSTFIX)
-	$(PYBIN)python3 pybind11_auto_pyi.py
-
-
-#######################
-
-MultiApp.o: MultiApp.h MultiApp.cu approximation_multiplication_function.cpp \
-	gpu_approximation_multiplication_function.cu error_term.cpp gpu_error_term.cu
-	$(NVCC) $(PARAMETERS_O) -c MultiApp.cu -o MultiApp.o
-
-PyMultiApp.o: MultiApp.h PyMultiApp.cpp
-	$(NVCC) $(PARAMETERS_O) -c PyMultiApp.cpp -o PyMultiApp.o
-
-PyMultiApp: MultiApp.o PyMultiApp.o
-	$(NVCC) $(PARAMETERS_Linker) -o PyMultiApp MultiApp.o PyMultiApp.o
-	cp PyMultiApp PyMultiApp$(PYPOSTFIX)
-	$(PYBIN)python3 pybind11_auto_pyi.py
-
-#######################
-clean:
-	rm -f PyHDynamicCNNManyIP
-	rm -f PySpikeGeneration2DManyIP
-	rm -f PyMultiApp
-	rm -f *.o
-	rm -f *.so
-
diff --git a/network/CPP_Cuda/MultiApp.cu b/network/CPP_Cuda/MultiApp.cu
deleted file mode 100644
index b7a3cec..0000000
--- a/network/CPP_Cuda/MultiApp.cu
+++ /dev/null
@@ -1,313 +0,0 @@
-#include "MultiApp.h"
-
-#include <omp.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-#include <iostream>
-#include <vector>
-
-#include "approximation_multiplication_function.cpp"
"approximation_multiplication_function.cpp" -#include "gpu_approximation_multiplication_function.cu" - -MultiApp::MultiApp() -{ - -}; - -MultiApp::~MultiApp() -{ - -}; - -bool MultiApp::update(float* np_input_pointer, - float* np_weight_pointer, - float* np_output_pointer, int64_t pattern_dim, - int64_t feature_dim, int64_t x_dim, int64_t y_dim, - int64_t input_channel_dim, int64_t id_pattern, - bool approximation_enable, int64_t number_of_trunc_bits, - int64_t number_of_frac_bits) -{ - - assert((id_pattern >= 0)); - assert((id_pattern < pattern_dim)); - - float* np_input_pointer_pattern; - float* np_output_pointer_pattern; - - float* input_ptr; - float* output_ptr; - float* w_ptr; - - uint64_t pattern_size = input_channel_dim; - - std::vector ap_h_vector; - ap_h_vector.resize(pattern_size); - float* ap_h_ptr = ap_h_vector.data(); - - std::vector ap_x_vector; - ap_x_vector.resize(pattern_size); - uint32_t* ap_x_ptr = ap_x_vector.data(); - - std::vector ap_y_vector; - ap_y_vector.resize(pattern_size); - uint32_t* ap_y_ptr = ap_y_vector.data(); - - std::vector ap_x_exponent_vector; - ap_x_exponent_vector.resize(pattern_size); - uint32_t* ap_x_exponent_ptr = ap_x_exponent_vector.data(); - - std::vector ap_y_exponent_vector; - ap_y_exponent_vector.resize(pattern_size); - uint32_t* ap_y_exponent_ptr = ap_y_exponent_vector.data(); - - std::vector ap_h_exponent_vector; - ap_h_exponent_vector.resize(pattern_size); - uint32_t* ap_h_exponent_ptr = ap_h_exponent_vector.data(); - - std::vector ap_res_vector; - ap_res_vector.resize(pattern_size); - uint64_t* ap_res_ptr = ap_res_vector.data(); - - uint32_t ap_mask = static_cast(pow(2, number_of_trunc_bits)) - 1; - - std::vector sign_temp_vector; - sign_temp_vector.resize(pattern_size); - uint32_t* sign_temp_ptr = sign_temp_vector.data(); - - uint64_t input_pattern_size = input_channel_dim * x_dim * y_dim; - uint64_t output_pattern_size = feature_dim * x_dim * y_dim; - - np_input_pointer_pattern = np_input_pointer + id_pattern * input_pattern_size; - np_output_pointer_pattern = - np_output_pointer + id_pattern * output_pattern_size; - - uint64_t counter; - - uint64_t counter_x; - uint64_t counter_y; - uint64_t counter_feature; - uint64_t pos_xy; - uint64_t pos_xy_if; - - float temp_sum; - - uint64_t pattern_c_2 = x_dim * y_dim; - - for (counter_x = 0; counter_x < x_dim; counter_x++) - { - for (counter_y = 0; counter_y < y_dim; counter_y++) - { - pos_xy = counter_y + counter_x * y_dim; - for (counter_feature = 0; counter_feature < feature_dim; - counter_feature++) - { - pos_xy_if = counter_feature * pattern_c_2 + pos_xy; - - input_ptr = np_input_pointer_pattern + pos_xy; - output_ptr = np_output_pointer_pattern + pos_xy_if; - w_ptr = np_weight_pointer + counter_feature * input_channel_dim; - -#pragma omp simd - for (counter = 0; counter < pattern_size; counter++) - { - ap_h_ptr[counter] = input_ptr[counter * pattern_c_2]; - } - - approximation_multiplication_function( - ap_h_ptr, w_ptr, pattern_size, number_of_trunc_bits, - number_of_frac_bits, ap_x_ptr, ap_y_ptr, ap_x_exponent_ptr, - ap_y_exponent_ptr, ap_h_exponent_ptr, ap_mask, ap_res_ptr, - sign_temp_ptr, approximation_enable); - - temp_sum = 0.0; -#pragma omp simd reduction(+ \ - : temp_sum) - for (counter = 0; counter < pattern_size; counter++) - { - temp_sum += ap_h_ptr[counter]; - } - - output_ptr[0] = temp_sum; - } - } - } - - return true; -}; - -bool MultiApp::update_with_init_vector_multi_pattern( - int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr, - int64_t np_output_pointer_addr, 
-    int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
-    int64_t number_of_processes, bool approximation_enable,
-    int64_t number_of_trunc_bits, int64_t number_of_frac)
-{
-
-
-
-    int64_t number_of_pattern = pattern_dim;
-    int64_t pattern_id;
-
-    float* np_input_pointer = (float*)np_input_pointer_addr;
-    float* np_weight_pointer = (float*)np_weight_pointer_addr;
-    float* np_output_pointer = (float*)np_output_pointer_addr;
-
-    assert((np_input_pointer != nullptr));
-    assert((np_output_pointer != nullptr));
-    assert((np_weight_pointer != nullptr));
-
-    assert((pattern_dim > 0));
-    assert((feature_dim > 0));
-    assert((x_dim > 0));
-    assert((y_dim > 0));
-    assert((input_channel_dim > 0));
-
-    if (number_of_processes > 0)
-    {
-        omp_set_num_threads(number_of_processes);
-        // For debugging: Only one thread
-        // omp_set_num_threads(1);
-
-#pragma omp parallel for
-        for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
-        {
-
-            update(np_input_pointer, np_weight_pointer,
-                np_output_pointer, pattern_dim, feature_dim, x_dim, y_dim,
-                input_channel_dim, pattern_id, approximation_enable,
-                number_of_trunc_bits, number_of_frac);
-        }
-    }
-    else
-    {
-        update_gpu(np_input_pointer, np_weight_pointer,
-            np_output_pointer, pattern_dim, feature_dim, x_dim, y_dim,
-            input_channel_dim, approximation_enable,
-            number_of_trunc_bits, number_of_frac);
-    }
-    return true;
-};
-
-__global__ void kernel_approx_multiplication(float* __restrict__ input_pointer, float* __restrict__ weight_pointer,
-    float* __restrict__ output_pointer, uint64_t pattern_dim,
-    uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
-    uint64_t input_channel_dim, size_t max_threadable_tasks,
-    uint64_t input_index_scale, uint64_t number_of_frac_bits,
-    bool approximation_enable, uint64_t number_of_trunc_bits,
-    uint32_t ap_mask)
-{
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (idx < max_threadable_tasks)
-    {
-        int pattern_id = idx / feature_dim;
-        int feature_id = idx - (pattern_id * feature_dim);
-        int x_id = blockIdx.y;
-        int y_id = blockIdx.z;
-
-        float* weight_pointer_sub = weight_pointer + feature_id * input_channel_dim;
-        float* input_pointer_sub = input_pointer + pattern_id * input_channel_dim * x_dim * y_dim + x_id * y_dim + y_id;
-        float* output_pointer_sub = output_pointer +
-            pattern_id * feature_dim * x_dim * y_dim +
-            feature_id * x_dim * y_dim + x_id * y_dim + y_id;
-        *output_pointer_sub = 0.0;
-
-        size_t counter;
-        for (counter = 0; counter < input_channel_dim; counter++)
-        {
-            *output_pointer_sub += gpu_approximation_multiplication_function(
-                weight_pointer_sub[counter],
-                input_pointer_sub[counter * input_index_scale],
-                number_of_frac_bits, approximation_enable,
-                number_of_trunc_bits, ap_mask);
-        }
-    }
-};
-
-bool MultiApp::update_gpu(float* np_input_pointer,
-    float* np_weight_pointer,
-    float* np_output_pointer, uint64_t pattern_dim,
-    uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
-    uint64_t input_channel_dim,
-    bool approximation_enable, uint64_t number_of_trunc_bits,
-    uint64_t number_of_frac_bits)
-{
-
-    uint32_t ap_mask = static_cast<uint32_t>(pow(2, number_of_trunc_bits)) - 1;
-    // std::cout << approximation_enable << std::endl;
-    // std::cout << number_of_trunc_bits << std::endl;
-    // std::cout << number_of_frac_bits << std::endl;
-
-    cudaError_t status;
-    assert((x_dim < 65535));
-    assert((y_dim < 65535));
-
-    // //////////////////////////////////////
-    // Get info about the device
-    // //////////////////////////////////////
-
-    int device;
-    cudaDeviceProp prop;
-
-    status = cudaGetDevice(&device);
-    assert((status == cudaSuccess));
-    // std::cout << "Device ID: " << device << std::endl;
-
-    status = cudaGetDeviceProperties(&prop, device);
-    assert((status == cudaSuccess));
-    // std::cout << "Device name: " << prop.name << std::endl;
-
-    // //////////////////////////////////////
-    // Calculate the distribution on the GPU
-    // //////////////////////////////////////
-
-    int min_grid_size;
-    int block_size;
-    int grid_size;
-
-    size_t dynamic_s_mem_size = 0;
-    size_t max_threadable_tasks = pattern_dim * feature_dim * x_dim * y_dim;
-
-    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=blocksize#occupancy-calculator
-    status = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
-        (void*)kernel_approx_multiplication,
-        dynamic_s_mem_size, max_threadable_tasks);
-    assert((status == cudaSuccess));
-
-    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
-    // Maximum dimensionality of grid of thread blocks: 3
-    // Maximum x -dimension of a grid of thread blocks: (2^31)-1
-    // Maximum y- or z-dimension of a grid of thread blocks: 65535
-
-    // Round up according to array size
-    grid_size = ((pattern_dim * feature_dim) + block_size - 1) / block_size;
-
-    // std::cout << min_grid_size << std::endl;
-    // std::cout << grid_size << std::endl;
-    // std::cout << block_size << std::endl;
-    // std::cout << max_threadable_tasks << std::endl;
-
-    dim3 grid(grid_size, x_dim, y_dim);
-
-    kernel_approx_multiplication<<<grid, block_size>>>(np_input_pointer,
-        np_weight_pointer,
-        np_output_pointer,
-        pattern_dim,
-        feature_dim,
-        x_dim,
-        y_dim,
-        input_channel_dim,
-        (pattern_dim * feature_dim),
-        (x_dim * y_dim),
-        number_of_frac_bits,
-        approximation_enable,
-        number_of_trunc_bits,
-        ap_mask);
-
-    cudaDeviceSynchronize();
-    return true;
-};
diff --git a/network/CPP_Cuda/MultiApp.h b/network/CPP_Cuda/MultiApp.h
deleted file mode 100644
index 8506918..0000000
--- a/network/CPP_Cuda/MultiApp.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef SRC_MultiApp_H_
-#define SRC_MultiApp_H_
-
-#include <stdio.h>
-
-#include <cstddef>
-#include <cstdint>
-
-class MultiApp
-{
-public:
-    MultiApp();
-    ~MultiApp();
-
-    bool update(float *np_input_pointer, float *np_weight_pointer,
-        float *np_output_pointer, int64_t pattern_dim,
-        int64_t feature_dim, int64_t x_dim, int64_t y_dim,
-        int64_t input_channel_dim, int64_t id_pattern,
-        bool approximation_enable, int64_t number_of_trunc_bits,
-        int64_t number_of_frac);
-
-    bool update_gpu(float *input_pointer, float *weight_pointer,
-        float *output_pointer, uint64_t pattern_dim,
-        uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
-        uint64_t input_channel_dim,
-        bool approximation_enable, uint64_t number_of_trunc_bits,
-        uint64_t number_of_frac);
-
-    bool update_with_init_vector_multi_pattern(
-        int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr,
-        int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim,
-        int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
-        int64_t number_of_processes, bool approximation_enable,
-        int64_t number_of_trunc_bits, int64_t number_of_frac);
-
-private:
-};
-
-#endif /* SRC_MultiApp_H_ */
\ No newline at end of file
diff --git a/network/CPP_Cuda/PyHDynamicCNNManyIP.cpp b/network/CPP_Cuda/PyHDynamicCNNManyIP.cpp
deleted file mode 100644
index 469a4a8..0000000
--- a/network/CPP_Cuda/PyHDynamicCNNManyIP.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#include <pybind11/pybind11.h>
-
-#include "HDynamicCNNManyIP.h"
-
-namespace py = pybind11;
-
-PYBIND11_MODULE(PyHDynamicCNNManyIP, m)
-{
-    m.doc() = "HDynamicCNNManyIP Module";
-    py::class_<HDynamicCNNManyIP>(m, "HDynamicCNNManyIP")
-        .def(py::init<>())
-        .def("gpu_occupancy_export",
-            &HDynamicCNNManyIP::gpu_occupancy_export)
-        .def("gpu_occupancy_import",
-            &HDynamicCNNManyIP::gpu_occupancy_import)
-        .def("update",
-            &HDynamicCNNManyIP::update_entrypoint);
-}
\ No newline at end of file
diff --git a/network/CPP_Cuda/PyHDynamicCNNManyIP.pyi b/network/CPP_Cuda/PyHDynamicCNNManyIP.pyi
deleted file mode 100644
index 7032d50..0000000
--- a/network/CPP_Cuda/PyHDynamicCNNManyIP.pyi
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
-#
-
-"""HDynamicCNNManyIP Module"""
-from __future__ import annotations
-import PyHDynamicCNNManyIP
-import typing
-
-__all__ = [
-    "HDynamicCNNManyIP"
-]
-
-
-class HDynamicCNNManyIP():
-    def __init__(self) -> None: ...
-    def update(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int, arg16: int, arg17: int, arg18: int, arg19: int, arg20: int, arg21: int, arg22: float, arg23: int) -> bool: ...
-    pass
diff --git a/network/CPP_Cuda/PyMultiApp.cpp b/network/CPP_Cuda/PyMultiApp.cpp
deleted file mode 100644
index f1d8798..0000000
--- a/network/CPP_Cuda/PyMultiApp.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-
-#include <pybind11/pybind11.h>
-
-#include "MultiApp.h"
-
-namespace py = pybind11;
-
-PYBIND11_MODULE(PyMultiApp, m) {
-    m.doc() = "MultiApp Module";
-    py::class_<MultiApp>(m, "MultiApp")
-        .def(py::init<>())
-        .def("update_entrypoint",
-            &MultiApp::update_with_init_vector_multi_pattern);
-}
\ No newline at end of file
diff --git a/network/CPP_Cuda/PyMultiApp.pyi b/network/CPP_Cuda/PyMultiApp.pyi
deleted file mode 100644
index 54768cc..0000000
--- a/network/CPP_Cuda/PyMultiApp.pyi
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
-#
-
-"""MultiApp Module"""
-from __future__ import annotations
-import PyMultiApp
-import typing
-
-__all__ = [
-    "MultiApp"
-]
-
-
-class MultiApp():
-    def __init__(self) -> None: ...
-    def update_with_init_vector_multi_pattern(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: bool, arg10: int, arg11: int) -> bool: ...
-    pass
diff --git a/network/CPP_Cuda/PySpikeGeneration2DManyIP.cpp b/network/CPP_Cuda/PySpikeGeneration2DManyIP.cpp
deleted file mode 100644
index b338d23..0000000
--- a/network/CPP_Cuda/PySpikeGeneration2DManyIP.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#include <pybind11/pybind11.h>
-
-#include "SpikeGeneration2DManyIP.h"
-
-namespace py = pybind11;
-
-PYBIND11_MODULE(PySpikeGeneration2DManyIP, m)
-{
-    m.doc() = "SpikeGeneration2DManyIP Module";
-    py::class_<SpikeGeneration2DManyIP>(m, "SpikeGeneration2DManyIP")
-        .def(py::init<>())
-        .def("gpu_occupancy_export",
-            &SpikeGeneration2DManyIP::gpu_occupancy_export)
-        .def("gpu_occupancy_import",
-            &SpikeGeneration2DManyIP::gpu_occupancy_import)
-        .def("spike_generation",
-            &SpikeGeneration2DManyIP::spike_generation_entrypoint);
-}
diff --git a/network/CPP_Cuda/PySpikeGeneration2DManyIP.pyi b/network/CPP_Cuda/PySpikeGeneration2DManyIP.pyi
deleted file mode 100644
index 7c2257d..0000000
--- a/network/CPP_Cuda/PySpikeGeneration2DManyIP.pyi
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
-#
-
-"""SpikeGeneration2DManyIP Module"""
-from __future__ import annotations
-import PySpikeGeneration2DManyIP
-import typing
-
-__all__ = [
-    "SpikeGeneration2DManyIP"
-]
-
-
-class SpikeGeneration2DManyIP():
-    def __init__(self) -> None: ...
-    def spike_generation(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int) -> bool: ...
-    pass
diff --git a/network/CPP_Cuda/SpikeGeneration2DManyIP.cu b/network/CPP_Cuda/SpikeGeneration2DManyIP.cu
deleted file mode 100644
index 102448d..0000000
--- a/network/CPP_Cuda/SpikeGeneration2DManyIP.cu
+++ /dev/null
@@ -1,411 +0,0 @@
-#include "SpikeGeneration2DManyIP.h"
-
-#include <omp.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <algorithm>
-#include <cassert>
-#include <iostream>
-
-
-SpikeGeneration2DManyIP::SpikeGeneration2DManyIP()
-{
-
-};
-
-SpikeGeneration2DManyIP::~SpikeGeneration2DManyIP()
-{
-
-};
-
-bool SpikeGeneration2DManyIP::spike_generation_entrypoint(
-    int64_t input_pointer_addr, int64_t input_dim_0,
-    int64_t input_dim_1, int64_t input_dim_2, int64_t input_dim_3,
-    int64_t random_values_pointer_addr, int64_t random_values_dim_0,
-    int64_t random_values_dim_1, int64_t random_values_dim_2,
-    int64_t random_values_dim_3, int64_t output_pointer_addr,
-    int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2,
-    int64_t output_dim_3, int64_t number_of_cpu_processes)
-{
-
-    float* input_pointer = (float*)input_pointer_addr;
-    float* random_values_pointer = (float*)random_values_pointer_addr;
-    int64_t* output_pointer = (int64_t*)output_pointer_addr;
-
-    // Input
-    assert((input_pointer != nullptr));
-    assert((input_dim_0 > 0));
-    assert((input_dim_1 > 0));
-    assert((input_dim_2 > 0));
-    assert((input_dim_3 > 0));
-
-    // Random
-    assert((random_values_pointer != nullptr));
-    assert((random_values_dim_0 > 0));
-    assert((random_values_dim_1 > 0));
-    assert((random_values_dim_2 > 0));
-    assert((random_values_dim_3 > 0));
-
-    // Output
-    assert((output_pointer != nullptr));
-    assert((output_dim_0 > 0));
-    assert((output_dim_1 > 0));
-    assert((output_dim_2 > 0));
-    assert((output_dim_3 > 0));
-
-    // Input
-    size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
-    size_t input_dim_c1 = input_dim_2 * input_dim_3;
-    size_t input_dim_c2 = input_dim_3;
-
-    // Random
-    size_t random_values_dim_c0 =
-        random_values_dim_1 * random_values_dim_2 * random_values_dim_3;
-    size_t random_values_dim_c1 =
-        random_values_dim_2 * random_values_dim_3;
-    size_t random_values_dim_c2 = random_values_dim_3;
-
-    // Output
-    size_t output_dim_c0 =
-        output_dim_1 * output_dim_2 * output_dim_3;
-    size_t output_dim_c1 = output_dim_2 * output_dim_3;
-    size_t output_dim_c2 = output_dim_3;
-
-    size_t number_of_pattern = input_dim_0;
-    size_t h_dim = input_dim_1;
-    size_t spike_dim = output_dim_1;
-    size_t x_dim = output_dim_2;
-    size_t y_dim = output_dim_3;
-
-    if (number_of_cpu_processes > 0)
-    {
-
-        omp_set_num_threads(number_of_cpu_processes);
-        // DEBUG:
-        // omp_set_num_threads(1);
-
-        size_t pattern_id;
-
-#pragma omp parallel for
-        for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
-        {
-            spike_generation(
-                input_pointer,
-                input_dim_c0,
-                input_dim_c1,
-                input_dim_c2,
-                random_values_pointer,
-                random_values_dim_c0,
-                random_values_dim_c1,
-                random_values_dim_c2,
-                output_pointer,
-                output_dim_c0,
-                output_dim_c1,
-                output_dim_c2,
-                x_dim,
-                y_dim,
-                spike_dim,
-                h_dim,
-                pattern_id);
-        }
-    }
-    else
-    {
-        gpu_spike_generation(
-            input_pointer,
-            input_dim_c0,
-            input_dim_c1,
-            input_dim_c2,
-            random_values_pointer,
-            random_values_dim_c0,
-            random_values_dim_c1,
-            random_values_dim_c2,
-            output_pointer,
-            output_dim_c0,
-            output_dim_c1,
-            output_dim_c2,
-            x_dim,
-            y_dim,
-            spike_dim,
-            h_dim,
-            number_of_pattern);
-    }
-
-    return true;
-};
-
-bool SpikeGeneration2DManyIP::spike_generation(
-    float* input_pointer,
-    size_t input_dim_c0,
-    size_t input_dim_c1,
-    size_t input_dim_c2,
-    float* random_values_pointer,
-    size_t random_values_dim_c0,
-    size_t random_values_dim_c1,
-    size_t random_values_dim_c2,
-    int64_t* output_pointer,
-    size_t output_dim_c0,
-    size_t output_dim_c1,
-    size_t output_dim_c2,
-    size_t x_dim,
-    size_t y_dim,
-    size_t spike_dim,
-    size_t h_dim,
-    size_t pattern_id)
-{
-
-    size_t counter;
-    size_t counter_x = 0;
-    size_t counter_y = 0;
-
-    float* p_ptr = nullptr;
-    int64_t* out_ptr = nullptr;
-    float* rand_ptr = nullptr;
-
-    for (counter_x = 0; counter_x < x_dim; counter_x++)
-    {
-        for (counter_y = 0; counter_y < y_dim; counter_y++)
-        {
-            p_ptr = input_pointer + pattern_id * input_dim_c0 +
-                counter_x * input_dim_c2 + counter_y;
-            // + counter * input_dim_c1
-
-            out_ptr = output_pointer + pattern_id * output_dim_c0 +
-                counter_x * output_dim_c2 + counter_y;
-            // + counter * output_dim_c1
-
-            rand_ptr = random_values_pointer +
-                pattern_id * random_values_dim_c0 +
-                counter_x * random_values_dim_c2 + counter_y;
-            // + counter * random_values_dim_c1
-
-            for (counter = 0; counter < spike_dim; counter++)
-            {
-                out_ptr[counter * output_dim_c1] = lower_bound(p_ptr,
-                    h_dim,
-                    input_dim_c1,
-                    rand_ptr[counter * random_values_dim_c1]);
-            }
-        }
-    }
-
-    return true;
-};
-
-// algorithmic idea stolen from libc++
-size_t SpikeGeneration2DManyIP::lower_bound(float* data_ptr,
-    size_t data_length,
-    size_t data_ptr_stride,
-    float compare_to_value)
-{
-
-    size_t start_of_range = 0;
-    size_t length_of_range = data_length;
-
-    while (length_of_range != 0)
-    {
-        size_t half_length = length_of_range >> 1;
-        size_t actual_position = start_of_range + half_length;
-
-        if (data_ptr[actual_position * data_ptr_stride] < compare_to_value)
-        {
-            start_of_range = ++actual_position;
-            length_of_range -= half_length + 1;
-        }
-        else
-            length_of_range = half_length;
-    }
-    return start_of_range;
-};
-
-__device__ size_t gpu_lower_bound(float* __restrict__ data_ptr,
-    size_t data_length,
-    size_t data_ptr_stride,
-    float compare_to_value)
-{
-
-    size_t start_of_range = 0;
-    size_t length_of_range = data_length;
-
-    while (length_of_range != 0)
-    {
-        size_t half_length = length_of_range >> 1;
-        size_t actual_position = start_of_range + half_length;
-
-        if (data_ptr[actual_position * data_ptr_stride] < compare_to_value)
-        {
-            start_of_range = ++actual_position;
-            length_of_range -= half_length + 1;
-        }
-        else
-            length_of_range = half_length;
-    }
-    return start_of_range;
-};
-
-__global__ void kernel_spike_generation(
-    float* __restrict__ input_pointer,
-    size_t input_dim_c0,
-    size_t input_dim_c1,
-    size_t input_dim_c2,
-    float* __restrict__ random_values_pointer,
-    size_t random_values_dim_c0,
-    size_t random_values_dim_c1,
-    size_t random_values_dim_c2,
-    int64_t* __restrict__ output_pointer,
-    size_t output_dim_c0,
-    size_t output_dim_c1,
-    size_t output_dim_c2,
-    size_t x_dim,
-    size_t y_dim,
-    size_t spike_dim,
-    size_t h_dim,
-    size_t max_threadable_tasks)
-{
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if (idx < max_threadable_tasks)
-    {
-
-        size_t pattern_id = idx / spike_dim;
-        size_t position_spike = idx - (pattern_id * spike_dim);
-
-        size_t position_x = blockIdx.y;
-        size_t position_y = blockIdx.z;
-
-        float* p_ptr = input_pointer + pattern_id * input_dim_c0 +
-            position_x * input_dim_c2 + position_y;
-
-        int64_t* out_ptr = output_pointer + pattern_id * output_dim_c0 +
-            position_x * output_dim_c2 + position_y
-            + position_spike * output_dim_c1;
-
-        float* rand_ptr = random_values_pointer +
-            pattern_id * random_values_dim_c0 +
-            position_x * random_values_dim_c2 + position_y
-            + position_spike * random_values_dim_c1;
-
-        *out_ptr = gpu_lower_bound(p_ptr,
-            h_dim,
-            input_dim_c1,
-            *rand_ptr);
-    }
-};
-
-bool SpikeGeneration2DManyIP::gpu_spike_generation(
-    float* input_pointer,
-    size_t input_dim_c0,
-    size_t input_dim_c1,
-    size_t input_dim_c2,
-    float* random_values_pointer,
-    size_t random_values_dim_c0,
-    size_t random_values_dim_c1,
-    size_t random_values_dim_c2,
-    int64_t* output_pointer,
-    size_t output_dim_c0,
-    size_t output_dim_c1,
-    size_t output_dim_c2,
-    size_t x_dim,
-    size_t y_dim,
-    size_t spike_dim,
-    size_t h_dim,
-    size_t number_of_pattern)
-{
-    cudaError_t status;
-    assert((x_dim < 65535));
-    assert((y_dim < 65535));
-
-    // // //////////////////////////////////////
-    // // Get info about the device
-    // // //////////////////////////////////////
-
-    // int device;
-    // cudaDeviceProp prop;
-
-    // status = cudaGetDevice(&device);
-    // assert((status == cudaSuccess));
-    // // std::cout << "Device ID: " << device << std::endl;
-
-    // status = cudaGetDeviceProperties(&prop, device);
-    // assert((status == cudaSuccess));
-    // // std::cout << "Device name: " << prop.name << std::endl;
-
-    // //////////////////////////////////////
-    // Calculate the distribution on the GPU
-    // //////////////////////////////////////
-
-    int min_grid_size;
-    int block_size;
-    int grid_size;
-
-    size_t dynamic_s_mem_size = 0;
-    size_t max_threadable_tasks = number_of_pattern * spike_dim * x_dim * y_dim;
-
-    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=blocksize#occupancy-calculator
-    status = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
-        (void*)kernel_spike_generation,
-        dynamic_s_mem_size, max_threadable_tasks);
-    assert((status == cudaSuccess));
-
-    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
-    // Maximum dimensionality of grid of thread blocks: 3
-    // Maximum x -dimension of a grid of thread blocks: (2^31)-1
-    // Maximum y- or z-dimension of a grid of thread blocks: 65535
-
-    // Round up according to array size
-    // (I will separate x and y into other grid dimensions soon)
-    grid_size = ((number_of_pattern * spike_dim) + block_size - 1) / block_size;
-
-    // std::cout << min_grid_size << std::endl;
-    // std::cout << grid_size << std::endl;
-    // std::cout << block_size << std::endl;
-    // std::cout << max_threadable_tasks << std::endl;
-
-    dim3 grid(grid_size, x_dim, y_dim);
-
-
-    kernel_spike_generation<<<grid, block_size>>>(
-        input_pointer,
-        input_dim_c0,
-        input_dim_c1,
-        input_dim_c2,
-        random_values_pointer,
-        random_values_dim_c0,
-        random_values_dim_c1,
-        random_values_dim_c2,
-        output_pointer,
-        output_dim_c0,
-        output_dim_c1,
-        output_dim_c2,
-        x_dim,
-        y_dim,
-        spike_dim,
-        h_dim,
-        (number_of_pattern * spike_dim));
-
-    cudaDeviceSynchronize();
-
-    return true;
-};
-
-
-void SpikeGeneration2DManyIP::gpu_occupancy_export(
-    size_t dim_x,
-    size_t dim_y,
-    size_t number_of_pattern,
-    size_t spike_dim,
-    int64_t setting_memory_addr,
-    size_t setting_dim_0,
-    size_t setting_dim_1)
-{
-    return;
-};
-
-void SpikeGeneration2DManyIP::gpu_occupancy_import(
-    int64_t setting_memory_addr,
-    size_t setting_dim_0,
-    size_t setting_dim_1)
-{
-    return;
-};
\ No newline at end of file
diff --git a/network/CPP_Cuda/SpikeGeneration2DManyIP.h b/network/CPP_Cuda/SpikeGeneration2DManyIP.h
deleted file mode 100644
index 62e8430..0000000
--- a/network/CPP_Cuda/SpikeGeneration2DManyIP.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef SRC_SPIKEGENERATION2DMANYIP_H_
-#define SRC_SPIKEGENERATION2DMANYIP_H_
-
-#include <stdio.h>
-
-#include <cstddef>
-#include <cstdint>
-
-class SpikeGeneration2DManyIP
-{
-    public:
-    SpikeGeneration2DManyIP();
-    ~SpikeGeneration2DManyIP();
-
-    bool spike_generation_entrypoint(
-        int64_t input_pointer_addr,
-        int64_t input_dim_0,
-        int64_t input_dim_1,
-        int64_t input_dim_2,
-        int64_t input_dim_3,
-        int64_t random_values_pointer_addr,
-        int64_t random_values_dim_0,
-        int64_t random_values_dim_1,
-        int64_t random_values_dim_2,
-        int64_t random_values_dim_3,
-        int64_t output_pointer_addr,
-        int64_t output_dim_0,
-        int64_t output_dim_1,
-        int64_t output_dim_2,
-        int64_t output_dim_3,
-        int64_t number_of_cpu_processes);
-
-    bool spike_generation(
-        float* input_pointer,
-        size_t input_dim_c0,
-        size_t input_dim_c1,
-        size_t input_dim_c2,
-        float* random_values_pointer,
-        size_t random_values_dim_c0,
-        size_t random_values_dim_c1,
-        size_t random_values_dim_c2,
-        int64_t* output_pointer,
-        size_t output_dim_c0,
-        size_t output_dim_c1,
-        size_t output_dim_c2,
-        size_t x_dim,
-        size_t y_dim,
-        size_t spike_dim,
-        size_t h_dim,
-        size_t pattern_id);
-
-    bool gpu_spike_generation(
-        float* input_pointer,
-        size_t input_dim_c0,
-        size_t input_dim_c1,
-        size_t input_dim_c2,
-        float* random_values_pointer,
-        size_t random_values_dim_c0,
-        size_t random_values_dim_c1,
-        size_t random_values_dim_c2,
-        int64_t* output_pointer,
-        size_t output_dim_c0,
-        size_t output_dim_c1,
-        size_t output_dim_c2,
-        size_t x_dim,
-        size_t y_dim,
-        size_t spike_dim,
-        size_t h_dim,
-        size_t number_of_pattern);
-
-    void gpu_occupancy_export(
-        size_t dim_x,
-        size_t dim_y,
-        size_t number_of_pattern,
-        size_t spike_dim,
-        int64_t setting_memory_addr,
-        size_t setting_dim_0,
-        size_t setting_dim_1);
-
-    void gpu_occupancy_import(
-        int64_t setting_memory_addr,
-        size_t setting_dim_0,
-        size_t setting_dim_1);
-
-    private:
-    size_t lower_bound(float* data_ptr, size_t data_length,
-        size_t data_ptr_stride,
-        float compare_to_value);
-};
-
-#endif /* SRC_SPIKEGENERATION2DMANYIP_H_ */
\ No newline at end of file
diff --git a/network/CPP_Cuda/approximation_multiplication_function.cpp b/network/CPP_Cuda/approximation_multiplication_function.cpp
deleted file mode 100644
index 6b54c98..0000000
--- a/network/CPP_Cuda/approximation_multiplication_function.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-#include <stdio.h>
-
-#include <bitset>
-#include <cstdint>
-#include <iostream>
-
-#include "error_term.cpp"
-
-// Best way to plot the bits
-// std::cout << std::bitset<32>(ap_y_ptr[1]) << "\n";
-
-// The result needs to be written back into h_pointer (which contains h)
-// Don't write to w_pointer.
-void approximation_multiplication_function(
-    float *h_pointer, float *w_pointer, int64_t pattern_length,
-    uint64_t number_of_trunc_bits, uint64_t number_of_frac_bits,
-    uint32_t *ap_x_ptr, uint32_t *ap_y_ptr, uint32_t *ap_x_exponent_ptr,
-    uint32_t *ap_y_exponent_ptr, uint32_t *ap_h_exponent_ptr, uint32_t ap_mask,
-    uint64_t *ap_res_ptr, uint32_t *sign_temp_ptr, bool approximation_enable) {
-  uint64_t counter;
-
-  uint32_t *w_pointer_mod = (uint32_t *)w_pointer;
-  uint32_t *h_pointer_mod = (uint32_t *)h_pointer;
-
-// Calculate the new sign
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    sign_temp_ptr[counter] = (w_pointer_mod[counter] & 0x80000000) ^
-                             (h_pointer_mod[counter] & 0x80000000);
-  }
-
-// Extract the exponent
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    ap_x_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
-  }
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    ap_y_exponent_ptr[counter] = (w_pointer_mod[counter] << 1) >> 24;
-  }
-
-  // Cast and "normalize"
-  uint64_t shift_value = 32 - number_of_frac_bits;
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    ap_x_ptr[counter] =
-        ((h_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
-  }
-
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    ap_y_ptr[counter] =
-        ((w_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
-  }
-
-// Make the zero -g-r-e-a-t- correct again
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    if (h_pointer[counter] == 0) {
-      ap_x_ptr[counter] = 0;
-    }
-  }
-
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    if (w_pointer[counter] == 0) {
-      ap_y_ptr[counter] = 0;
-    }
-  }
-
-// res = x*y
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    ap_res_ptr[counter] = static_cast<uint64_t>(ap_x_ptr[counter]) * static_cast<uint64_t>(ap_y_ptr[counter]);
-  }
-
-  uint32_t temp;
-  if (approximation_enable == true) {
-    // Go through the vector values
-    for (counter = 0; counter < pattern_length; counter++) {
-      temp = error_term(ap_y_ptr[counter], ap_x_ptr[counter], ap_mask,
-                        number_of_trunc_bits);
-      if (temp > ap_res_ptr[counter]) {
-        ap_res_ptr[counter] = 0;
-      } else {
-        ap_res_ptr[counter] -= temp;
-      }
-    }
-  }
-// Cast from int to float
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    h_pointer[counter] = static_cast<float>(ap_res_ptr[counter]);
-  }
-
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    ap_h_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
-  }
-
-// divide by the 2^number_of_frac_bits
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    ap_h_exponent_ptr[counter] -= 2 * number_of_frac_bits;
-  }
-
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    temp = ap_x_exponent_ptr[counter] + ap_y_exponent_ptr[counter] +
-           ap_h_exponent_ptr[counter];
-    if (temp > 252) {
-      ap_h_exponent_ptr[counter] = temp - 252;
-    } else {
-      // Here I try to catch the case that the new exponent is too small
-      ap_h_exponent_ptr[counter] = 0;
-    }
-  }
-
-// Remove the old exponent
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    h_pointer_mod[counter] = (h_pointer_mod[counter] << 9) >> 9;
-  }
-
-// Install the new exponent
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    h_pointer_mod[counter] += ap_h_exponent_ptr[counter] << 23;
-  }
-
-// Add the sign back
-#pragma omp simd
-  for (counter = 0; counter < pattern_length; counter++) {
-    h_pointer_mod[counter] += sign_temp_ptr[counter];
-  }
-
-  return;
-}
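The truncation scheme above is easier to check on toy numbers. The following self-contained sketch re-implements the same shift-and-mask loop as error_term (the 8-bit "mantissas" and the choice of 4 truncated bits are made-up values for illustration, not from the deleted sources) and compares the exact integer product with the approximated one:

    # Toy check of the truncated-multiplier error term (re-implementation,
    # mirroring error_term() in error_term.cpp below).
    def error_term(a: int, b: int, ap_mask: int, number_of_trunc_bits: int) -> int:
        error_value = 0
        temp_shift_a = a
        temp_shift_b = b & ap_mask
        for _ in range(number_of_trunc_bits):
            if temp_shift_a & 1:              # partial product the truncated
                error_value += temp_shift_b & ap_mask  # multiplier drops
            temp_shift_a >>= 1
            temp_shift_b = (temp_shift_b << 1) & 0xFFFFFFFF  # emulate uint32_t
        return error_value

    trunc_bits = 4
    mask = (1 << trunc_bits) - 1              # same as pow(2, n) - 1 in the C++
    x, y = 0b10110111, 0b11010011             # two toy mantissas
    exact = x * y
    approx = exact - error_term(y, x, mask, trunc_bits)
    print(exact, approx)                      # the approximate product is <= exact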
diff --git a/network/CPP_Cuda/error_term.cpp b/network/CPP_Cuda/error_term.cpp
deleted file mode 100644
index f683d1b..0000000
--- a/network/CPP_Cuda/error_term.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <stdio.h>
-
-#include <cstdint>
-#include <iostream>
-
-uint32_t error_term(uint32_t a, uint32_t b, uint32_t ap_mask,
-                    uint32_t number_of_trunc_bits) {
-  uint32_t error_value = 0;
-
-  uint32_t temp_shift_a = a;
-  uint32_t temp_shift_b = b & ap_mask;
-
-  uint32_t counter_trunc;
-  uint32_t temp;
-
-  // Go through the bits
-  for (counter_trunc = 0; counter_trunc < number_of_trunc_bits;
-       counter_trunc++) {
-    temp = temp_shift_a & 1;
-    if (temp == 1) {
-      error_value += temp_shift_b & ap_mask;
-    }
-    temp_shift_a >>= 1;
-    temp_shift_b <<= 1;
-  }
-
-  return error_value;
-}
diff --git a/network/CPP_Cuda/gpu_approximation_multiplication_function.cu b/network/CPP_Cuda/gpu_approximation_multiplication_function.cu
deleted file mode 100644
index 7e145e8..0000000
--- a/network/CPP_Cuda/gpu_approximation_multiplication_function.cu
+++ /dev/null
@@ -1,102 +0,0 @@
-#include "gpu_error_term.cu"
-
-__device__ float gpu_approximation_multiplication_function(
-    float weight,
-    float input,
-    uint64_t number_of_frac_bits,
-    bool approximation_enable,
-    uint64_t number_of_trunc_bits,
-    uint32_t ap_mask)
-{
-
-    float weight_copy = weight;
-    float input_copy = input;
-
-    uint32_t *weight_pointer_mod = (uint32_t *)&weight_copy;
-    uint32_t *input_pointer_mod = (uint32_t *)&input_copy;
-
-    // Calculate the new sign
-    uint32_t sign_temp = (*weight_pointer_mod & 0x80000000) ^
-        (*input_pointer_mod & 0x80000000);
-
-    // Extract the exponent
-    uint32_t ap_input_exponent = (*input_pointer_mod << 1) >> 24;
-    uint32_t ap_weight_exponent = (*weight_pointer_mod << 1) >> 24;
-
-    // Cast and "normalize"
-    uint64_t shift_value = 32 - number_of_frac_bits;
-
-    uint32_t ap_input_mantissa =
-        ((*input_pointer_mod << 8) | 0x80000000) >> shift_value;
-
-    uint32_t ap_weight_mantissa =
-        ((*weight_pointer_mod << 8) | 0x80000000) >> shift_value;
-
-    // Make the zero -g-r-e-a-t- correct again
-    if (input == 0)
-    {
-        ap_input_mantissa = 0;
-    }
-
-    if (weight == 0)
-    {
-        ap_weight_mantissa = 0;
-    }
-
-    // res = x*y
-    uint64_t ap_result = static_cast<uint64_t>(ap_input_mantissa) * static_cast<uint64_t>(ap_weight_mantissa);
-
-    uint32_t temp;
-    // --------------------------------------------
-    // Approx
-    // --------------------------------------------
-
-    if (approximation_enable == true)
-    {
-        // Go through the vector values
-        temp = gpu_error_term(ap_weight_mantissa, ap_input_mantissa, ap_mask,
-            number_of_trunc_bits);
-        if (temp > ap_result)
-        {
-            ap_result = 0;
-        }
-        else
-        {
-            ap_result -= temp;
-        }
-    }
-
-    // Cast from int to float
-    float output = static_cast<float>(ap_result);
-    if (ap_result == 0)
-    {
-        output = 0.0;
-    }
-    else
-    {
-        uint32_t *output_pointer_mod = (uint32_t *)&output;
-
-        uint32_t ap_output_exponent = (*output_pointer_mod << 1) >> 24;
-        ap_output_exponent -= 2 * number_of_frac_bits;
-        temp = ap_input_exponent + ap_weight_exponent + ap_output_exponent;
-        if (temp > 252)
-        {
-            ap_output_exponent = temp - 252;
-        }
-        else
-        {
-            // Here I try to catch the case that the new exponent is too small
-            ap_output_exponent = 0;
-        }
-
-        // Remove the old exponent
-        *output_pointer_mod = (*output_pointer_mod << 9) >> 9;
-
-        // Install the new exponent
-        *output_pointer_mod += ap_output_exponent << 23;
-
-        // Add the sign back
-        *output_pointer_mod += sign_temp;
-    }
-    return output;
-};
\ No newline at end of file
diff --git a/network/CPP_Cuda/gpu_error_term.cu b/network/CPP_Cuda/gpu_error_term.cu
deleted file mode 100644
index 23bdc1b..0000000
--- a/network/CPP_Cuda/gpu_error_term.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-__device__ uint32_t gpu_error_term(uint32_t ap_weight_mantissa,
-    uint32_t ap_input_mantissa,
-    uint32_t ap_mask,
-    uint32_t number_of_trunc_bits)
-{
-    uint32_t error_value = 0;
-
-    uint32_t temp_shift_a = ap_weight_mantissa;
-    uint32_t temp_shift_b = ap_input_mantissa & ap_mask;
-
-    uint32_t counter_trunc;
-    uint32_t temp;
-
-    // Go through the bits
-    for (counter_trunc = 0; counter_trunc < number_of_trunc_bits;
-        counter_trunc++)
-    {
-        temp = temp_shift_a & 1;
-        if (temp == 1)
-        {
-            error_value += temp_shift_b & ap_mask;
-        }
-        temp_shift_a >>= 1;
-        temp_shift_b <<= 1;
-    }
-
-    return error_value;
-}
\ No newline at end of file
diff --git a/network/CPP_Cuda/pybind11_auto_pyi.py b/network/CPP_Cuda/pybind11_auto_pyi.py
deleted file mode 100644
index 73c8bd2..0000000
--- a/network/CPP_Cuda/pybind11_auto_pyi.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# %%
-# pip install pybind11-stubgen
-from pybind11_stubgen import ModuleStubsGenerator  # type: ignore
-import glob
-
-
-def process(module_name: str) -> None:
-    module = ModuleStubsGenerator(module_name)
-    module.parse()
-    module.write_setup_py = False
-
-    with open(module_name + ".pyi", "w") as fp:
-        fp.write("#\n# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!\n#\n\n")
-        fp.write("\n".join(module.to_lines()))
-
-
-Files = glob.glob("*.so")
-
-for fid in Files:
-    Idx: int = fid.find(".")
-    module_name: str = fid[:Idx]
-    print("Processing: " + module_name)
-    process(module_name)
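The deleted bindings are driven from Python by passing raw data pointers and dimensions as plain integers (see the .pyi stubs above). As a usage illustration only — the tensor shapes, variable names, and the use of torch.Tensor.data_ptr() are assumptions for this sketch, not part of the deleted code — calling the spike-generation module built by the Makefile might have looked like this:

    # Hypothetical driver for the deleted PySpikeGeneration2DManyIP module.
    # Assumes contiguous float32/int64 torch tensors back the raw addresses.
    import torch
    from PySpikeGeneration2DManyIP import SpikeGeneration2DManyIP

    pattern, h_dim, spike_dim, x_dim, y_dim = 2, 32, 4, 5, 5

    # Cumulative probabilities along the h dimension; lower_bound() then
    # performs inverse-CDF sampling against uniform random draws.
    input_cumsum = torch.rand(pattern, h_dim, x_dim, y_dim).cumsum(dim=1)
    input_cumsum /= input_cumsum[:, -1:, :, :]  # normalize so the last entry is 1
    random_values = torch.rand(pattern, spike_dim, x_dim, y_dim)
    output = torch.empty(pattern, spike_dim, x_dim, y_dim, dtype=torch.int64)

    sg = SpikeGeneration2DManyIP()
    sg.spike_generation(
        input_cumsum.data_ptr(), *input_cumsum.shape,
        random_values.data_ptr(), *random_values.shape,
        output.data_ptr(), *output.shape,
        2,  # number_of_cpu_processes; a value <= 0 selects the CUDA path
    )

PyHDynamicCNNManyIP.update and PyMultiApp.update_entrypoint follow the same address-plus-dimensions convention.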