diff --git a/network/h_dynamic_cnn_cpu_cpp/HDynamicCNNCPU.cpp b/network/h_dynamic_cnn_cpu_cpp/HDynamicCNNCPU.cpp index 3722929..af30f32 100644 --- a/network/h_dynamic_cnn_cpu_cpp/HDynamicCNNCPU.cpp +++ b/network/h_dynamic_cnn_cpu_cpp/HDynamicCNNCPU.cpp @@ -64,13 +64,20 @@ void HDynamicCNNCPU::entrypoint( size_t h_dim_c1 = h_dim_2 * h_dim_3; size_t h_dim_c2 = h_dim_3; - float* epsilon_xy_pointer = (float*)epsilon_xy_pointer_addr; - assert((epsilon_xy_pointer != nullptr)); - assert((epsilon_xy_dim_0 > 0)); - assert((epsilon_xy_dim_1 > 0)); + float* epsilon_xy_pointer = nullptr; + size_t epsilon_xy_dim_c0 = 0; + size_t epsilon_xy_dim_c1 = 0; + if (epsilon_xy_pointer_addr != 0) + { + epsilon_xy_pointer = (float*)epsilon_xy_pointer_addr; + assert((epsilon_xy_pointer != nullptr)); + assert((epsilon_xy_dim_0 > 0)); + assert((epsilon_xy_dim_1 > 0)); + assert((epsilon_xy_dim_2 > 0)); - size_t epsilon_xy_dim_c0 = epsilon_xy_dim_2 * epsilon_xy_dim_1; - size_t epsilon_xy_dim_c1 = epsilon_xy_dim_2; + epsilon_xy_dim_c0 = epsilon_xy_dim_2 * epsilon_xy_dim_1; + epsilon_xy_dim_c1 = epsilon_xy_dim_2; + } float* epsilon_t_pointer = (float*)epsilon_t_pointer_addr; assert((epsilon_t_pointer != nullptr)); @@ -164,16 +171,18 @@ void HDynamicCNNCPU::update( { float* h_ptr; - float* epsilon_xy_ptr; + float* epsilon_xy_ptr = nullptr; int64_t* input_ptr; for (size_t counter_x = 0; counter_x < dim_x; counter_x++) { for (size_t counter_y = 0; counter_y < dim_y; counter_y++) { - epsilon_xy_ptr = epsilon_xy_pointer + - counter_x * epsilon_xy_dim_c1 + counter_y; - + if (epsilon_xy_dim_c1 != 0) + { + epsilon_xy_ptr = epsilon_xy_pointer + + counter_x * epsilon_xy_dim_c1 + counter_y; + } h_ptr = h_pointer + pattern_id * h_dim_c0 + counter_x * h_dim_c2 + counter_y; @@ -252,8 +261,15 @@ void HDynamicCNNCPU::update_one_ip( if (*spike >= 0) { - epsilon_subsegment = - epsilon_xy_pointer[*spike * epsilon_xy_dim_c0] * epsilon_t_pointer[counter_spike]; + if (epsilon_xy_dim_c0 != 0) + { + epsilon_subsegment = + epsilon_xy_pointer[*spike * epsilon_xy_dim_c0] * epsilon_t_pointer[counter_spike]; + } + else + { + epsilon_subsegment = epsilon_t_pointer[counter_spike]; + } w_ptr = weights_pointer + *spike * weights_dim_c0; diff --git a/network/h_dynamic_cnn_gpu_cpp_v1/HDynamicCNNGPU.cu b/network/h_dynamic_cnn_gpu_cpp_v1/HDynamicCNNGPU.cu new file mode 100644 index 0000000..0eb402f --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v1/HDynamicCNNGPU.cu @@ -0,0 +1,542 @@ +#include "HDynamicCNNGPU.h" + +#include +#include +#include + +#include +#include +#include + + +HDynamicCNNGPU::HDynamicCNNGPU() +{ + +}; + +HDynamicCNNGPU::~HDynamicCNNGPU() +{ + +}; + +void HDynamicCNNGPU::entrypoint( + int64_t h_pointer_addr, + int64_t h_dim_0, + int64_t h_dim_1, + int64_t h_dim_2, + int64_t h_dim_3, + int64_t epsilon_xy_pointer_addr, + int64_t epsilon_xy_dim_0, + int64_t epsilon_xy_dim_1, + int64_t epsilon_xy_dim_2, + int64_t epsilon_t_pointer_addr, + int64_t epsilon_t_dim_0, + int64_t weights_pointer_addr, + int64_t weights_dim_0, + int64_t weights_dim_1, + int64_t input_pointer_addr, + int64_t input_dim_0, + int64_t input_dim_1, + int64_t input_dim_2, + int64_t input_dim_3, + int64_t init_vector_pointer_addr, + int64_t init_vector_dim_0, + int64_t number_of_processes, + float forgetting_offset, + int64_t gpu_tuning_factor) +{ + + size_t number_of_pattern = input_dim_0; + + size_t h_dim = init_vector_dim_0; + float* h_init_ptr = (float*)init_vector_pointer_addr; + assert((h_init_ptr != nullptr)); + assert((h_dim > 0)); + + float* h_pointer = (float*)h_pointer_addr; + assert((h_pointer != nullptr)); + assert((h_dim_0 > 0)); + assert((h_dim_1 > 0)); + assert((h_dim_2 > 0)); + assert((h_dim_3 > 0)); + + size_t h_dim_c0 = h_dim_1 * h_dim_2 * h_dim_3; + size_t h_dim_c1 = h_dim_2 * h_dim_3; + size_t h_dim_c2 = h_dim_3; + + float* epsilon_xy_pointer = nullptr; + size_t epsilon_xy_dim_c0 = 0; + size_t epsilon_xy_dim_c1 = 0; + if (epsilon_xy_pointer_addr != 0) + { + epsilon_xy_pointer = (float*)epsilon_xy_pointer_addr; + assert((epsilon_xy_pointer != nullptr)); + assert((epsilon_xy_dim_0 > 0)); + assert((epsilon_xy_dim_1 > 0)); + assert((epsilon_xy_dim_2 > 0)); + + epsilon_xy_dim_c0 = epsilon_xy_dim_2 * epsilon_xy_dim_1; + epsilon_xy_dim_c1 = epsilon_xy_dim_2; + } + + float* epsilon_t_pointer = (float*)epsilon_t_pointer_addr; + assert((epsilon_t_pointer != nullptr)); + assert((epsilon_t_dim_0 > 0)); + + float* weights_pointer = (float*)weights_pointer_addr; + assert((weights_pointer != nullptr)); + assert((weights_dim_0 > 0)); + assert((weights_dim_1 > 0)); + + size_t weights_dim_c0 = weights_dim_1; + + int64_t* input_pointer = (int64_t*)input_pointer_addr; + assert((input_pointer != nullptr)); + assert((input_dim_0 > 0)); + assert((input_dim_1 > 0)); + assert((input_dim_2 > 0)); + assert((input_dim_3 > 0)); + + size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3; + size_t input_dim_c1 = input_dim_2 * input_dim_3; + size_t input_dim_c2 = input_dim_3; + + assert((h_dim == weights_dim_1)); + size_t number_of_spikes = input_dim_1; + size_t dim_x = input_dim_2; + size_t dim_y = input_dim_3; + + float forgetting_offset_local = forgetting_offset / static_cast(h_dim); + + // -------------------- + assert((number_of_processes <= 0)); + + gpu_update( + h_init_ptr, + h_pointer, + h_dim_c0, + h_dim_c1, + h_dim_c2, + h_dim, + epsilon_xy_pointer, + epsilon_xy_dim_c0, + epsilon_xy_dim_c1, + epsilon_t_pointer, + weights_pointer, + weights_dim_c0, + input_pointer, + input_dim_c0, + input_dim_c1, + input_dim_c2, + number_of_spikes, + dim_x, + dim_y, + forgetting_offset, + forgetting_offset_local, + number_of_pattern, + gpu_tuning_factor); + + return; +}; + +__device__ void gpu_update_one_ip( + float* __restrict__ h_init_ptr, + float* __restrict__ h_pointer, + size_t h_dim_c1, + size_t h_dim, + float* __restrict__ weights_pointer, + size_t weights_dim_c0, + int64_t* input_pointer, + size_t input_dim_c1, + float* __restrict__ epsilon_xy_pointer, + size_t epsilon_xy_dim_c0, + float* __restrict__ epsilon_t_pointer, + size_t number_of_spikes, + float forgetting_offset, + float forgetting_offset_local, + float* __restrict__ h_temp, + float* __restrict__ h_subsegment +) +{ + + float h_temp_sum; + float temp_value; + + float epsilon_subsegment; + float epsilon_scale = 1.0; + + int64_t* spike; + float* w_ptr; + + // float* h_temp = new float[h_dim]; + // float* h_subsegment = new float[h_dim]; + + // Initialize the sub-segement + for (size_t counter = 0; counter < h_dim; counter++) + { + h_subsegment[counter] = h_init_ptr[counter]; + } + + for (size_t counter_spike = 0; counter_spike < number_of_spikes; counter_spike++) + { + if (epsilon_scale > 1E10) + { + temp_value = 1.0 / epsilon_scale; + + for (size_t counter = 0; counter < h_dim; counter++) + { + h_subsegment[counter] *= temp_value; + } + + epsilon_scale = 1.0; + } + + spike = input_pointer + counter_spike * input_dim_c1; + + if (*spike >= 0) + { + if (epsilon_xy_dim_c0 != 0) + { + epsilon_subsegment = + epsilon_xy_pointer[*spike *epsilon_xy_dim_c0] * epsilon_t_pointer[counter_spike]; + } + else + { + epsilon_subsegment = epsilon_t_pointer[counter_spike]; + } + w_ptr = weights_pointer + *spike * weights_dim_c0; + + for (size_t counter = 0; counter < h_dim; counter++) + { + h_temp[counter] = h_subsegment[counter] * w_ptr[counter]; + } + + h_temp_sum = 0.0; + + for (size_t counter = 0; counter < h_dim; counter++) + { + h_temp_sum += h_temp[counter]; + } + + if (h_temp_sum > 1E-10) + { + temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum; + + for (size_t counter = 0; counter < h_dim; counter++) + { + h_temp[counter] *= temp_value; + } + + for (size_t counter = 0; counter < h_dim; counter++) + { + h_subsegment[counter] += h_temp[counter]; + } + + if (forgetting_offset_local > 0.0) + { + temp_value = + epsilon_scale * epsilon_subsegment * forgetting_offset_local; + + for (size_t counter = 0; counter < h_dim; counter++) + { + h_subsegment[counter] += temp_value; + } + + epsilon_scale *= + 1.0 + epsilon_subsegment * (1.0 + forgetting_offset); + } + else + { + epsilon_scale *= 1.0 + epsilon_subsegment * 1.0; + } + } + } + } + + temp_value = 1.0 / epsilon_scale; + + for (size_t counter = 0; counter < h_dim; counter++) + { + h_pointer[counter * h_dim_c1] = + h_subsegment[counter] * temp_value; + } + + // delete[] h_temp; + // delete[] h_subsegment; + + return; +}; + +__global__ void kernel_spike_generation( + float* __restrict__ h_init_ptr, + float* __restrict__ h_pointer, + size_t h_dim_c0, + size_t h_dim_c1, + size_t h_dim_c2, + size_t h_dim, + float* __restrict__ weights_pointer, + size_t weights_dim_c0, + int64_t* __restrict__ input_pointer, + size_t input_dim_c0, + size_t input_dim_c1, + size_t input_dim_c2, + float* __restrict__ epsilon_xy_pointer, + size_t epsilon_xy_dim_c0, + size_t epsilon_xy_dim_c1, + float* __restrict__ epsilon_t_pointer, + size_t number_of_spikes, + float forgetting_offset, + float forgetting_offset_local, + size_t dim_x, + size_t dim_y, + size_t dim_xy, + size_t max_threadable_tasks, + float* __restrict__ temp_memory_a, + float* __restrict__ temp_memory_b +) +{ + + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_threadable_tasks) + { + float* h_ptr; + float* epsilon_xy_ptr = nullptr; + int64_t* input_ptr; + + float* temp_memory_ptr_a = temp_memory_a + idx * h_dim; + float* temp_memory_ptr_b = temp_memory_b + idx * h_dim; + + // int pattern_id = idx; + int pattern_id = idx / dim_xy; + int position_xy = idx - (pattern_id * dim_xy); + + // size_t position_x = blockIdx.y; + // size_t position_y = blockIdx.z; + size_t position_x = position_xy / dim_y; + size_t position_y = position_xy - (position_x * dim_y); + + if (epsilon_xy_dim_c1 != 0) + { + epsilon_xy_ptr = epsilon_xy_pointer + + position_x * epsilon_xy_dim_c1 + position_y; + } + + h_ptr = h_pointer + + pattern_id * h_dim_c0 + position_x * h_dim_c2 + position_y; + + input_ptr = input_pointer + + pattern_id * input_dim_c0 + position_x * input_dim_c2 + position_y; + + gpu_update_one_ip( + h_init_ptr, + h_ptr, + h_dim_c1, + h_dim, + weights_pointer, + weights_dim_c0, + input_ptr, + input_dim_c1, + epsilon_xy_ptr, + epsilon_xy_dim_c0, + epsilon_t_pointer, + number_of_spikes, + forgetting_offset, + forgetting_offset_local, + temp_memory_ptr_a, + temp_memory_ptr_b + ); + + } + +}; + +// Let's face it... We need a better way to paralelize it... +void HDynamicCNNGPU::gpu_update( + float* h_init_ptr, + float* h_pointer, + size_t h_dim_c0, + size_t h_dim_c1, + size_t h_dim_c2, + size_t h_dim, + float* epsilon_xy_pointer, + size_t epsilon_xy_dim_c0, + size_t epsilon_xy_dim_c1, + float* epsilon_t_pointer, + float* weights_pointer, + size_t weights_dim_c0, + int64_t* input_pointer, + size_t input_dim_c0, + size_t input_dim_c1, + size_t input_dim_c2, + size_t number_of_spikes, + size_t dim_x, + size_t dim_y, + float forgetting_offset, + float forgetting_offset_local, + size_t number_of_pattern, + size_t gpu_tuning_factor) +{ + + cudaError_t status; + assert((dim_x < 65535)); + assert((dim_y < 65535)); + + // ////////////////////////////////////// + // Calculate the distribution on the GPU + // ////////////////////////////////////// + + int min_grid_size; + int block_size; + int grid_size; + + size_t dynamic_s_mem_size = 0; + size_t max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=blocksize#occupancy-calculator + status = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, + (void*)kernel_spike_generation, + dynamic_s_mem_size, max_threadable_tasks); + if (status != cudaSuccess) + { + std::cerr << "CUDA Runtime Error at: " + << __FILE__ + << ":" + << __LINE__ + << std::endl; + std::cerr << cudaGetErrorString(status) << std::endl; + } + assert((status == cudaSuccess)); + + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications + // Maximum dimensionality of grid of thread blocks: 3 + // Maximum x -dimension of a grid of thread blocks: (2^31)-1 + // Maximum y- or z-dimension of a grid of thread blocks: 65535 + + // Reduce the automatic block size with our guess + if ((gpu_tuning_factor > 0) && (gpu_tuning_factor < block_size)) + { + block_size = int(gpu_tuning_factor); + } + // Round up according to array size + // (I will separate x and y into other grid dimentsions soon) + // grid_size = (number_of_pattern + block_size - 1) / block_size; + grid_size = (max_threadable_tasks + block_size - 1) / block_size; + + float* temp_memory_a = nullptr; + status = cudaMalloc((void**)&temp_memory_a, h_dim * max_threadable_tasks * sizeof(float)); + if (status != cudaSuccess) + { + std::cerr << "CUDA Runtime Error at: " + << __FILE__ + << ":" + << __LINE__ + << std::endl; + std::cerr << cudaGetErrorString(status) << std::endl; + } + assert((status == cudaSuccess)); + + float* temp_memory_b = nullptr; + status = cudaMalloc((void**)&temp_memory_b, h_dim * max_threadable_tasks * sizeof(float)); + if (status != cudaSuccess) + { + std::cerr << "CUDA Runtime Error at: " + << __FILE__ + << ":" + << __LINE__ + << std::endl; + std::cerr << cudaGetErrorString(status) << std::endl; + } + assert((status == cudaSuccess)); + + + //kernel_spike_generation<<>>( + kernel_spike_generation<<>>( + h_init_ptr, + h_pointer, + h_dim_c0, + h_dim_c1, + h_dim_c2, + h_dim, + weights_pointer, + weights_dim_c0, + input_pointer, + input_dim_c0, + input_dim_c1, + input_dim_c2, + epsilon_xy_pointer, + epsilon_xy_dim_c0, + epsilon_xy_dim_c1, + epsilon_t_pointer, + number_of_spikes, + forgetting_offset, + forgetting_offset_local, + dim_x, + dim_y, + (dim_x * dim_y), + //number_of_pattern + max_threadable_tasks, + temp_memory_a, + temp_memory_b + ); + + status = cudaDeviceSynchronize(); + if (status != cudaSuccess) + { + std::cerr << "CUDA Runtime Error at: " + << __FILE__ + << ":" + << __LINE__ + << std::endl; + std::cerr << cudaGetErrorString(status) << std::endl; + } + assert((status == cudaSuccess)); + + status = cudaFree(temp_memory_a); + if (status != cudaSuccess) + { + std::cerr << "CUDA Runtime Error at: " + << __FILE__ + << ":" + << __LINE__ + << std::endl; + std::cerr << cudaGetErrorString(status) << std::endl; + } + assert((status == cudaSuccess)); + + status = cudaFree(temp_memory_b); + if (status != cudaSuccess) + { + std::cerr << "CUDA Runtime Error at: " + << __FILE__ + << ":" + << __LINE__ + << std::endl; + std::cerr << cudaGetErrorString(status) << std::endl; + } + assert((status == cudaSuccess)); + + return; +}; + + +void HDynamicCNNGPU::gpu_occupancy_export( + size_t dim_x, + size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1) +{ + return; +}; + +void HDynamicCNNGPU::gpu_occupancy_import( + int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1 +) +{ + return; +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v1/HDynamicCNNGPU.h b/network/h_dynamic_cnn_gpu_cpp_v1/HDynamicCNNGPU.h new file mode 100644 index 0000000..235bad4 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v1/HDynamicCNNGPU.h @@ -0,0 +1,84 @@ +#ifndef HDYNAMICCNNGPU +#define HDYNAMICCNNGPU + +#include + +#include +#include + +class HDynamicCNNGPU +{ +public: + HDynamicCNNGPU(); + ~HDynamicCNNGPU(); + + void entrypoint( + int64_t h_pointer_addr, + int64_t h_dim_0, + int64_t h_dim_1, + int64_t h_dim_2, + int64_t h_dim_3, + int64_t epsilon_xy_pointer_addr, + int64_t epsilon_xy_dim_0, + int64_t epsilon_xy_dim_1, + int64_t epsilon_xy_dim_2, + int64_t epsilon_t_pointer_addr, + int64_t epsilon_t_dim_0, + int64_t weights_pointer_addr, + int64_t weights_dim_0, + int64_t weights_dim_1, + int64_t input_pointer_addr, + int64_t input_dim_0, + int64_t input_dim_1, + int64_t input_dim_2, + int64_t input_dim_3, + int64_t init_vector_pointer_addr, + int64_t init_vector_dim_0, + int64_t number_of_processes, + float forgetting_offset, + int64_t gpu_tuning_factor); + + void gpu_occupancy_export( + size_t dim_x, + size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1); + + void gpu_occupancy_import( + int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1); + +private: + + void gpu_update( + float* h_init_ptr, + float* h_pointer, + size_t h_dim_c0, + size_t h_dim_c1, + size_t h_dim_c2, + size_t h_dim, + float* epsilon_xy_pointer, + size_t epsilon_xy_dim_c0, + size_t epsilon_xy_dim_c1, + float* epsilon_t_pointer, + float* weights_pointer, + size_t weights_dim_c0, + int64_t* input_pointer, + size_t input_dim_c0, + size_t input_dim_c1, + size_t input_dim_c2, + size_t number_of_spikes, + size_t dim_x, + size_t dim_y, + float forgetting_offset, + float forgetting_offset_local, + size_t number_of_pattern, + size_t gpu_tuning_factor); + +}; + +#endif /* HDYNAMICCNNGPU */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v1/Makefile b/network/h_dynamic_cnn_gpu_cpp_v1/Makefile new file mode 100644 index 0000000..acdc799 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v1/Makefile @@ -0,0 +1,33 @@ +include ../.env +export + +name = HDynamicCNN +type = GPU + +PYPOSTFIX := $(shell $(PYBIN)python3-config --extension-suffix) +PYBIND11INCLUDE := $(shell $(PYBIN)python3 -m pybind11 --includes) +PARAMETERS_O = $(PARAMETERS_O_GPU) $(PYBIND11INCLUDE) +PARAMETERS_Linker = $(PARAMETERS_Linker_GPU) + +so_file = Py$(name)$(type)$(PYPOSTFIX) +pyi_file = Py$(name)$(type).pyi +all: ../$(so_file) + +$(O_DIRS)$(name)$(type).o: $(name)$(type).h $(name)$(type).cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c $(name)$(type).cu -o $(O_DIRS)$(name)$(type).o + +$(O_DIRS)Py$(name)$(type).o: $(name)$(type).h Py$(name)$(type).cpp + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c Py$(name)$(type).cpp -o $(O_DIRS)Py$(name)$(type).o + +../$(so_file): $(O_DIRS)$(name)$(type).o $(O_DIRS)Py$(name)$(type).o + $(NVCC) $(PARAMETERS_Linker) -o ../$(so_file) $(O_DIRS)$(name)$(type).o $(O_DIRS)Py$(name)$(type).o + + +####################### +clean: + rm -rf $(O_DIRS) + rm -f ../$(so_file) + rm -f ../$(pyi_file) + diff --git a/network/h_dynamic_cnn_gpu_cpp_v1/PyHDynamicCNNGPU.cpp b/network/h_dynamic_cnn_gpu_cpp_v1/PyHDynamicCNNGPU.cpp new file mode 100644 index 0000000..999b6a6 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v1/PyHDynamicCNNGPU.cpp @@ -0,0 +1,18 @@ +#include + +#include "HDynamicCNNGPU.h" + +namespace py = pybind11; + +PYBIND11_MODULE(PyHDynamicCNNGPU, m) +{ + m.doc() = "HDynamicCNNGPU Module"; + py::class_(m, "HDynamicCNNGPU") + .def(py::init<>()) + .def("gpu_occupancy_export", + &HDynamicCNNGPU::gpu_occupancy_export) + .def("gpu_occupancy_import", + &HDynamicCNNGPU::gpu_occupancy_import) + .def("update", + &HDynamicCNNGPU::entrypoint); +} \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/HDynamicCNNGPU.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/HDynamicCNNGPU.cu new file mode 100644 index 0000000..c344f1e --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/HDynamicCNNGPU.cu @@ -0,0 +1,727 @@ +#include +#include +#include +#include + +#include +#include +#include + +#include "HDynamicCNNGPU.h" +#include "kernel_phxy_fill_with_h.h" +#include "kernel_phxy_fill_with_spike_selected_w.h" +#include "kernel_phxy_one_over_sum_into_pxy.h" +#include "kernel_phxy_plus_phxy.h" +#include "kernel_phxy_plus_pxy.h" +#include "kernel_phxy_times_phxy_equals_phxy.h" +#include "kernel_phxy_times_pxy.h" +#include "kernel_pxy_plus_v.h" +#include "kernel_pxy_reciprocal.h" +#include "kernel_pxy_set_to_v.h" +#include "kernel_pxy_time_pxy.h" +#include "kernel_pxy_times_spike_selected_sxy.h" +#include "kernel_pxy_times_v.h" + +HDynamicCNNGPU::HDynamicCNNGPU() +{ + +}; + +HDynamicCNNGPU::~HDynamicCNNGPU() +{ + +}; + +void HDynamicCNNGPU::entrypoint( + int64_t h_pointer_addr, + int64_t h_dim_0, + int64_t h_dim_1, + int64_t h_dim_2, + int64_t h_dim_3, + int64_t epsilon_xy_pointer_addr, + int64_t epsilon_xy_dim_0, + int64_t epsilon_xy_dim_1, + int64_t epsilon_xy_dim_2, + int64_t epsilon_t_pointer_addr, + int64_t epsilon_t_dim_0, + int64_t weights_pointer_addr, + int64_t weights_dim_0, + int64_t weights_dim_1, + int64_t input_pointer_addr, + int64_t input_dim_0, + int64_t input_dim_1, + int64_t input_dim_2, + int64_t input_dim_3, + int64_t init_vector_pointer_addr, + int64_t init_vector_dim_0, + int64_t number_of_processes, + float forgetting_offset, + int64_t gpu_tuning_factor +) +{ + std::cout << "Hello\n"; + + size_t number_of_pattern = input_dim_0; + + size_t h_dim = init_vector_dim_0; + float* h_init_ptr = (float*)init_vector_pointer_addr; + assert((h_init_ptr != nullptr)); + assert((h_dim > 0)); + + float* h_pointer = (float*)h_pointer_addr; + assert((h_pointer != nullptr)); + assert((h_dim_0 > 0)); + assert((h_dim_1 > 0)); + assert((h_dim_2 > 0)); + assert((h_dim_3 > 0)); + + size_t h_dim_c0 = h_dim_1 * h_dim_2 * h_dim_3; + size_t h_dim_c1 = h_dim_2 * h_dim_3; + size_t h_dim_c2 = h_dim_3; + + float* epsilon_xy_pointer = nullptr; + size_t epsilon_xy_dim_c0 = 0; + size_t epsilon_xy_dim_c1 = 0; + if (epsilon_xy_pointer_addr != 0) + { + epsilon_xy_pointer = (float*)epsilon_xy_pointer_addr; + assert((epsilon_xy_pointer != nullptr)); + assert((epsilon_xy_dim_0 > 0)); + assert((epsilon_xy_dim_1 > 0)); + assert((epsilon_xy_dim_2 > 0)); + + epsilon_xy_dim_c0 = epsilon_xy_dim_2 * epsilon_xy_dim_1; + epsilon_xy_dim_c1 = epsilon_xy_dim_2; + } + + float* epsilon_t_pointer = (float*)epsilon_t_pointer_addr; + assert((epsilon_t_pointer != nullptr)); + assert((epsilon_t_dim_0 > 0)); + + float* weights_pointer = (float*)weights_pointer_addr; + assert((weights_pointer != nullptr)); + assert((weights_dim_0 > 0)); + assert((weights_dim_1 > 0)); + + size_t weights_dim_c0 = weights_dim_1; + + int64_t* input_pointer = (int64_t*)input_pointer_addr; + assert((input_pointer != nullptr)); + assert((input_dim_0 > 0)); + assert((input_dim_1 > 0)); + assert((input_dim_2 > 0)); + assert((input_dim_3 > 0)); + + size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3; + size_t input_dim_c1 = input_dim_2 * input_dim_3; + size_t input_dim_c2 = input_dim_3; + + assert((h_dim == weights_dim_1)); + size_t number_of_spikes = input_dim_1; + size_t dim_x = input_dim_2; + size_t dim_y = input_dim_3; + + float forgetting_offset_local = forgetting_offset / static_cast(h_dim); + + // -------------------- + assert((number_of_processes <= 0)); + gpu_update(h_init_ptr, h_pointer, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + epsilon_xy_pointer, epsilon_xy_dim_c0, epsilon_xy_dim_c1, + epsilon_t_pointer, weights_pointer, weights_dim_c0, + input_pointer, input_dim_c0, input_dim_c1, input_dim_c2, + number_of_spikes, dim_x, dim_y, forgetting_offset, + forgetting_offset_local, number_of_pattern, gpu_tuning_factor); + return; +}; + + +void HDynamicCNNGPU::gpu_occupancy_measure( + size_t dim_x, + size_t dim_y, + size_t number_of_pattern, + size_t h_dim) +{ + grid_and_thread_calculated = false; + assert((dim_x < 65535)); + assert((dim_y < 65535)); + + grid_and_thread_settings.resize(14); + + occupancy_kernel_phxy_plus_phxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY], display_debug); + + occupancy_kernel_pxy_plus_v(dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V], + display_debug); + + occupancy_kernel_pxy_times_v(dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V], + display_debug); + + occupancy_kernel_phxy_fill_with_h( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H], display_debug); + + occupancy_kernel_phxy_plus_pxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY], display_debug); + + occupancy_kernel_pxy_reciprocal( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL], display_debug); + + occupancy_kernel_phxy_fill_with_spike_selected_w( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W], + display_debug); + + occupancy_kernel_phxy_times_phxy_equals_phxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY], + display_debug); + + occupancy_kernel_pxy_set_to_v( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V], display_debug); + + occupancy_kernel_phxy_one_over_sum_into_pxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY], + display_debug); + + occupancy_kernel_phxy_times_pxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY], display_debug); + + occupancy_kernel_pxy_time_pxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY], display_debug); + + // occupancy_kernel_approximation_pure_multiplication( + // dim_x, dim_y, number_of_pattern, h_dim, + // grid_and_thread_settings[ID_KERNEL_APPROXIMATION_MULTIPLICATION], + // display_debug); + + occupancy_kernel_pxy_times_spike_selected_sxy( + dim_x, dim_y, number_of_pattern, h_dim, + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY], + display_debug); + + grid_and_thread_calculated = true; + return; +}; + +void HDynamicCNNGPU::gpu_occupancy_export( + size_t dim_x, + size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1) +{ + int64_t* setting_memory = (int64_t*)setting_memory_addr; + + assert((setting_memory != nullptr)); + assert((setting_dim_1 == H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS)); + + gpu_occupancy_measure(dim_x, dim_y, number_of_pattern, h_dim); + assert((grid_and_thread_calculated == true)); + + assert((setting_dim_0 == grid_and_thread_settings.size())); + + for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) + { + for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) + { + setting_memory[counter_0 * setting_dim_1 + counter_1] = + grid_and_thread_settings[counter_0][counter_1]; + } + } +}; + +void HDynamicCNNGPU::gpu_occupancy_import( + int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1) +{ + grid_and_thread_calculated = false; + + int64_t* setting_memory = (int64_t*)setting_memory_addr; + + assert((setting_memory != nullptr)); + assert((setting_dim_1 == H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS)); + assert((setting_dim_0 == H_DYNAMIC_NUMBER_OF_KERNELS)); + + grid_and_thread_settings.resize(H_DYNAMIC_NUMBER_OF_KERNELS); + + for (size_t counter_0 = 0; counter_0 < setting_dim_0; counter_0++) + { + grid_and_thread_settings[counter_0].resize( + H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS); + + for (size_t counter_1 = 0; counter_1 < setting_dim_1; counter_1++) + { + grid_and_thread_settings[counter_0][counter_1] = + setting_memory[counter_0 * setting_dim_1 + counter_1]; + } + } + + grid_and_thread_calculated = true; +}; + +void HDynamicCNNGPU::gpu_update( + float* h_init_ptr, + float* h_pointer, + size_t h_dim_c0, + size_t h_dim_c1, + size_t h_dim_c2, + size_t h_dim, + float* epsilon_xy_pointer, + size_t epsilon_xy_dim_c0, + size_t epsilon_xy_dim_c1, + float* epsilon_t_pointer, + float* weights_pointer, + size_t weights_dim_c0, + int64_t* input_pointer, + size_t input_dim_c0, + size_t input_dim_c1, + size_t input_dim_c2, + size_t number_of_spikes, + size_t dim_x, + size_t dim_y, + float forgetting_offset, + float forgetting_offset_local, + size_t number_of_pattern, + size_t gpu_tuning_factor) +{ + std::cout << "0\n"; + if (grid_and_thread_calculated == false) + { + gpu_occupancy_measure(dim_x, dim_y, number_of_pattern, h_dim); + } + assert((grid_and_thread_calculated == true)); + + cudaError_t status; + + size_t h_sum_dim_c0 = dim_x * dim_y; + size_t h_sum_dim_c1 = dim_y; + + size_t phxy_block_dim_c0 = h_dim * dim_x * dim_y; + size_t phxy_block_dim_c1 = dim_x * dim_y; + size_t phxy_block_dim_c2 = dim_y; + + size_t pxy_block_dim_c0 = dim_x * dim_y; + size_t pxy_block_dim_c1 = dim_y; + + std::cout << "1\n"; + float* w_memory = nullptr; + status = cudaMalloc((void**)&w_memory, number_of_pattern * h_dim * dim_x * + dim_y * sizeof(float)); + assert((status == cudaSuccess)); + std::cout << "2\n"; + float* h_temp_memory = nullptr; + status = + cudaMalloc((void**)&h_temp_memory, + number_of_pattern * h_dim * dim_x * dim_y * sizeof(float)); + assert((status == cudaSuccess)); + + std::cout << "3\n"; + float* h_sum_memory = nullptr; + status = cudaMalloc((void**)&h_sum_memory, + number_of_pattern * dim_x * dim_y * sizeof(float)); + assert((status == cudaSuccess)); + + std::cout << "4\n"; + float* epsilon_subsegment_memory = nullptr; + status = cudaMalloc((void**)&epsilon_subsegment_memory, + number_of_pattern * dim_x * dim_y * sizeof(float)); + assert((status == cudaSuccess)); + + std::cout << "5\n"; + float* epsilon_scale_memory = nullptr; + status = cudaMalloc((void**)&epsilon_scale_memory, + number_of_pattern * dim_x * dim_y * sizeof(float)); + assert((status == cudaSuccess)); + + + std::cout << "6\n"; + float* forget_memory = nullptr; + if (forgetting_offset > 0.0) + { + status = cudaMalloc((void**)&forget_memory, + number_of_pattern * dim_x * dim_y * sizeof(float)); + assert((status == cudaSuccess)); + } + // --- + std::cout << "A\n"; + // Initialize h + kernel_phxy_fill_with_h<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][0], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][1], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][3], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][4], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][5])>>>( + h_init_ptr, h_pointer, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + phxy_block_dim_c0, phxy_block_dim_c1, phxy_block_dim_c2, + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_H][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "B\n"; + // Set epsilon memory scale to 1.0 + kernel_pxy_set_to_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][5])>>>( + epsilon_scale_memory, 1.0, + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "C\n"; + for (size_t counter_spike = 0; counter_spike < number_of_spikes; + counter_spike++) + { + // Get epsilon_t from gpu memory + float epsilon_t; + status = cudaMemcpy(&epsilon_t, &epsilon_t_pointer[counter_spike], + sizeof(float), cudaMemcpyDeviceToHost); + assert((status == cudaSuccess)); + // Set epsilon memory subsegment to epsilon(t) + kernel_pxy_set_to_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][5])>>>( + epsilon_subsegment_memory, epsilon_t, + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "D\n"; + if (forget_memory != nullptr) + { + // Set forget memory subsegment to forgetting_offset_local + kernel_pxy_set_to_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][5])>>>( + forget_memory, forgetting_offset_local, + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + } + + std::cout << "E\n"; + // if (*spike >= 0) { + // epsilon_subsegment = *epsilon_xy_pointer[*spike * + // epsilon_xy_dim_c0] + if (epsilon_xy_dim_c0 != 0) + { + kernel_pxy_times_spike_selected_sxy<<< + dim3( + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY] + [2]), + dim3( + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY] + [5])>>>( + epsilon_subsegment_memory, epsilon_xy_pointer, input_pointer, + counter_spike, input_dim_c0, input_dim_c1, input_dim_c2, + epsilon_xy_dim_c0, epsilon_xy_dim_c1, epsilon_xy_dim_c0, + epsilon_xy_dim_c1, pxy_block_dim_c0, pxy_block_dim_c1, + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + } + std::cout << "F\n"; + // Get the weight vectors according the spikes + kernel_phxy_fill_with_spike_selected_w<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [0], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [1], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [3], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [4], + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W] + [5])>>>( + w_memory, weights_pointer, input_pointer, counter_spike, weights_dim_c0, + input_dim_c0, input_dim_c1, input_dim_c2, h_dim_c0, h_dim_c1, h_dim_c2, + h_dim, phxy_block_dim_c0, phxy_block_dim_c1, phxy_block_dim_c2, + grid_and_thread_settings[ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "G\n"; + // h_temp = h * w + kernel_phxy_times_phxy_equals_phxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [0], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [1], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [3], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [4], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY] + [5])>>>( + h_pointer, w_memory, h_temp_memory, + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "H\n"; + // 1 / sum h_temp + kernel_phxy_one_over_sum_into_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][0], + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][1], + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][3], + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][4], + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY] + [5])>>>( + h_temp_memory, h_sum_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + h_sum_dim_c0, h_sum_dim_c1, pxy_block_dim_c0, pxy_block_dim_c1, + grid_and_thread_settings[ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "I\n"; + // epsilon_scale / sum h_temp + kernel_pxy_time_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][5])>>>( + h_sum_memory, epsilon_scale_memory, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "J\n"; + // epsilon_subsegment * epsilon_scale / sum h_temp + kernel_pxy_time_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][5])>>>( + h_sum_memory, epsilon_subsegment_memory, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "K\n"; + // epsilon_scale * forget_memory which contains forgetting_offset_local + if (forget_memory != nullptr) + { + kernel_pxy_time_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][5])>>>( + forget_memory, epsilon_scale_memory, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + } + + std::cout << "L\n"; + // delta_forget = epsilon_subsegment * epsilon_scale * forget_memory + if (forget_memory != nullptr) + { + kernel_pxy_time_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][5])>>>( + forget_memory, epsilon_subsegment_memory, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + } + std::cout << "M\n"; + // delta_h = h_temp_memory * epsilon_subsegment * epsilon_scale / sum h + kernel_phxy_times_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][0], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][1], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][3], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][4], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][5])>>>( + h_temp_memory, h_sum_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + h_sum_dim_c0, h_sum_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1, + phxy_block_dim_c2, + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "N\n"; + // h + delta_h + kernel_phxy_plus_phxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][0], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][1], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][3], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][4], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][5])>>>( + h_pointer, h_temp_memory, + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PHXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "O\n"; + // h + delta_h + delta_forget + kernel_phxy_plus_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][0], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][1], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][3], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][4], + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][5])>>>( + h_pointer, forget_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + h_sum_dim_c0, h_sum_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1, + phxy_block_dim_c2, + grid_and_thread_settings[ID_KERNEL_PHXY_PLUS_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "P\n"; + kernel_pxy_times_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][5])>>>( + epsilon_subsegment_memory, (1.0 + forgetting_offset), + grid_and_thread_settings[ID_KERNEL_PXY_TIMES_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "Q\n"; + kernel_pxy_plus_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][5])>>>( + epsilon_subsegment_memory, 1.0, + grid_and_thread_settings[ID_KERNEL_PXY_PLUS_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "R\n"; + // epsilon_scale * epsilon_subsegment + kernel_pxy_time_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][0], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][1], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][3], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][4], + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][5])>>>( + epsilon_scale_memory, epsilon_subsegment_memory, + grid_and_thread_settings[ID_KERNEL_PXY_TIME_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + if (((counter_spike > 0) && (counter_spike % 5000 == 0)) || + (counter_spike + 1 == number_of_spikes)) + { + std::cout << "S\n"; + kernel_pxy_reciprocal<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][0], + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][1], + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][3], + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][4], + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][5])>>>( + epsilon_scale_memory, + grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "T\n"; + kernel_phxy_times_pxy<<< + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][0], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][1], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][3], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][4], + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][5])>>>( + h_pointer, epsilon_scale_memory, h_dim_c0, h_dim_c1, h_dim_c2, h_dim, + h_sum_dim_c0, h_sum_dim_c1, phxy_block_dim_c0, phxy_block_dim_c1, + phxy_block_dim_c2, + grid_and_thread_settings[ID_KERNEL_PHXY_TIMES_PXY][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + + std::cout << "U\n"; + // Set epsilon memory scale to 1.0 + kernel_pxy_set_to_v<<< + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][0], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][1], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][2]), + dim3(grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][3], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][4], + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][5])>>>( + epsilon_scale_memory, 1.0, + grid_and_thread_settings[ID_KERNEL_PXY_SET_TO_V][6]); + status = cudaDeviceSynchronize(); + assert((status == cudaSuccess)); + } + } + std::cout << "V\n"; + // ------------ + + status = cudaFree(w_memory); + assert((status == cudaSuccess)); + + status = cudaFree(h_temp_memory); + assert((status == cudaSuccess)); + + status = cudaFree(h_sum_memory); + assert((status == cudaSuccess)); + + status = cudaFree(epsilon_subsegment_memory); + assert((status == cudaSuccess)); + + status = cudaFree(epsilon_scale_memory); + assert((status == cudaSuccess)); + + if (forget_memory != nullptr) + { + status = cudaFree(forget_memory); + assert((status == cudaSuccess)); + } + + return; +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/HDynamicCNNGPU.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/HDynamicCNNGPU.h new file mode 100644 index 0000000..2c758ea --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/HDynamicCNNGPU.h @@ -0,0 +1,113 @@ +#ifndef HDYNAMICCNNGPU +#define HDYNAMICCNNGPU + +#include +#include + +#include +#include +#include + +#define ID_KERNEL_PHXY_PLUS_PHXY 0 +#define ID_KERNEL_PXY_PLUS_V 1 +#define ID_KERNEL_PXY_TIMES_V 2 +#define ID_KERNEL_PHXY_FILL_WITH_H 3 +#define ID_KERNEL_PHXY_PLUS_PXY 4 +#define ID_KERNEL_PXY_RECIPROCAL 5 +#define ID_KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W 6 +#define ID_KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY 7 +#define ID_KERNEL_PXY_SET_TO_V 8 +#define ID_KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY 9 +#define ID_KERNEL_PHXY_TIMES_PXY 10 +#define ID_KERNEL_PXY_TIME_PXY 11 +#define ID_KERNEL_APPROXIMATION_MULTIPLICATION 12 +#define ID_KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY 13 + +#define H_DYNAMIC_NUMBER_OF_KERNELS 14 +#define H_DYNAMIC_NUMBER_OF_KERNELS_PARAMETERS 7 + +class HDynamicCNNGPU +{ + public: + HDynamicCNNGPU(); + ~HDynamicCNNGPU(); + + void entrypoint( + int64_t h_pointer_addr, + int64_t h_dim_0, + int64_t h_dim_1, + int64_t h_dim_2, + int64_t h_dim_3, + int64_t epsilon_xy_pointer_addr, + int64_t epsilon_xy_dim_0, + int64_t epsilon_xy_dim_1, + int64_t epsilon_xy_dim_2, + int64_t epsilon_t_pointer_addr, + int64_t epsilon_t_dim_0, + int64_t weights_pointer_addr, + int64_t weights_dim_0, + int64_t weights_dim_1, + int64_t input_pointer_addr, + int64_t input_dim_0, + int64_t input_dim_1, + int64_t input_dim_2, + int64_t input_dim_3, + int64_t init_vector_pointer_addr, + int64_t init_vector_dim_0, + int64_t number_of_processes, + float forgetting_offset, + int64_t gpu_tuning_factor + ); + + void gpu_occupancy_export( + size_t dim_x, + size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1); + + void gpu_occupancy_import( + int64_t setting_memory_addr, + size_t setting_dim_0, + size_t setting_dim_1); + + private: + + void gpu_update( + float* h_init_ptr, + float* h_pointer, + size_t h_dim_c0, + size_t h_dim_c1, + size_t h_dim_c2, + size_t h_dim, + float* epsilon_xy_pointer, + size_t epsilon_xy_dim_c0, + size_t epsilon_xy_dim_c1, + float* epsilon_t_pointer, + float* weights_pointer, + size_t weights_dim_c0, + int64_t* input_pointer, + size_t input_dim_c0, + size_t input_dim_c1, + size_t input_dim_c2, + size_t number_of_spikes, + size_t dim_x, size_t dim_y, + float forgetting_offset, + float forgetting_offset_local, + size_t number_of_pattern, + size_t gpu_tuning_factor); + + void gpu_occupancy_measure( + size_t dim_x, + size_t dim_y, + size_t number_of_pattern, + size_t h_dim); + + bool grid_and_thread_calculated = false; + std::vector> grid_and_thread_settings; + bool display_debug = false; +}; + +#endif /* HDYNAMICCNNGPU */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/Makefile b/network/h_dynamic_cnn_gpu_cpp_v2_pre/Makefile new file mode 100644 index 0000000..11e1771 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/Makefile @@ -0,0 +1,153 @@ +include ../.env +export + +name = HDynamicCNN +type = GPU + +PYPOSTFIX := $(shell $(PYBIN)python3-config --extension-suffix) +PYBIND11INCLUDE := $(shell $(PYBIN)python3 -m pybind11 --includes) +PARAMETERS_O = $(PARAMETERS_O_GPU) $(PYBIND11INCLUDE) +PARAMETERS_Linker = $(PARAMETERS_Linker_GPU) + +so_file = Py$(name)$(type)$(PYPOSTFIX) +pyi_file = Py$(name)$(type).pyi +all: ../$(so_file) + + +$(O_DIRS)$(name)$(type).o: \ + $(name)$(type).h \ + $(name)$(type).cu \ + kernel_helper_functions.h \ + kernel_phxy_plus_pxy.h \ + kernel_pxy_set_to_v.h \ + kernel_phxy_fill_with_h.h \ + kernel_phxy_times_phxy_equals_phxy.h \ + kernel_pxy_time_pxy.h \ + kernel_phxy_fill_with_spike_selected_w.h \ + kernel_phxy_times_pxy.h \ + kernel_pxy_times_spike_selected_sxy.h \ + kernel_phxy_one_over_sum_into_pxy.h \ + kernel_pxy_plus_v.h \ + kernel_pxy_times_v.h \ + kernel_phxy_plus_phxy.h \ + kernel_pxy_reciprocal.h + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c $(name)$(type).cu -o $(O_DIRS)$(name)$(type).o + +$(O_DIRS)Py$(name)$(type).o: $(name)$(type).h Py$(name)$(type).cpp + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c Py$(name)$(type).cpp -o $(O_DIRS)Py$(name)$(type).o + +../$(so_file): \ + $(O_DIRS)$(name)$(type).o \ + $(O_DIRS)Py$(name)$(type).o \ + $(O_DIRS)kernel_helper_functions.o \ + $(O_DIRS)kernel_phxy_plus_pxy.o \ + $(O_DIRS)kernel_pxy_set_to_v.o \ + $(O_DIRS)kernel_phxy_fill_with_h.o \ + $(O_DIRS)kernel_phxy_times_phxy_equals_phxy.o \ + $(O_DIRS)kernel_pxy_time_pxy.o \ + $(O_DIRS)kernel_phxy_fill_with_spike_selected_w.o \ + $(O_DIRS)kernel_phxy_times_pxy.o \ + $(O_DIRS)kernel_pxy_times_spike_selected_sxy.o \ + $(O_DIRS)kernel_phxy_one_over_sum_into_pxy.o \ + $(O_DIRS)kernel_pxy_plus_v.o \ + $(O_DIRS)kernel_pxy_times_v.o \ + $(O_DIRS)kernel_phxy_plus_phxy.o \ + $(O_DIRS)kernel_pxy_reciprocal.o + $(NVCC) $(PARAMETERS_Linker) -o ../$(so_file) \ + $(O_DIRS)$(name)$(type).o \ + $(O_DIRS)Py$(name)$(type).o \ + $(O_DIRS)kernel_helper_functions.o \ + $(O_DIRS)kernel_phxy_plus_pxy.o \ + $(O_DIRS)kernel_pxy_set_to_v.o \ + $(O_DIRS)kernel_phxy_fill_with_h.o \ + $(O_DIRS)kernel_phxy_times_phxy_equals_phxy.o \ + $(O_DIRS)kernel_pxy_time_pxy.o \ + $(O_DIRS)kernel_phxy_fill_with_spike_selected_w.o \ + $(O_DIRS)kernel_phxy_times_pxy.o \ + $(O_DIRS)kernel_pxy_times_spike_selected_sxy.o \ + $(O_DIRS)kernel_phxy_one_over_sum_into_pxy.o \ + $(O_DIRS)kernel_pxy_plus_v.o \ + $(O_DIRS)kernel_pxy_times_v.o \ + $(O_DIRS)kernel_phxy_plus_phxy.o \ + $(O_DIRS)kernel_pxy_reciprocal.o + + +$(O_DIRS)kernel_helper_functions.o: kernel_helper_functions.h kernel_helper_functions.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_helper_functions.cu -o $(O_DIRS)kernel_helper_functions.o + +$(O_DIRS)kernel_phxy_plus_pxy.o: kernel_helper_functions.h \ + kernel_phxy_plus_pxy.h kernel_phxy_plus_pxy.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_plus_pxy.cu -o $(O_DIRS)kernel_phxy_plus_pxy.o + +$(O_DIRS)kernel_pxy_set_to_v.o: kernel_helper_functions.h \ + kernel_pxy_set_to_v.h kernel_pxy_set_to_v.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_set_to_v.cu -o $(O_DIRS)kernel_pxy_set_to_v.o + +$(O_DIRS)kernel_phxy_fill_with_h.o: kernel_helper_functions.h \ + kernel_phxy_fill_with_h.h kernel_phxy_fill_with_h.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_fill_with_h.cu -o $(O_DIRS)kernel_phxy_fill_with_h.o + +$(O_DIRS)kernel_phxy_times_phxy_equals_phxy.o: kernel_helper_functions.h \ + kernel_phxy_times_phxy_equals_phxy.h kernel_phxy_times_phxy_equals_phxy.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_times_phxy_equals_phxy.cu -o $(O_DIRS)kernel_phxy_times_phxy_equals_phxy.o + +$(O_DIRS)kernel_pxy_time_pxy.o: kernel_helper_functions.h \ + kernel_pxy_time_pxy.h kernel_pxy_time_pxy.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_time_pxy.cu -o $(O_DIRS)kernel_pxy_time_pxy.o + +$(O_DIRS)kernel_phxy_fill_with_spike_selected_w.o: kernel_helper_functions.h \ + kernel_phxy_fill_with_spike_selected_w.h kernel_phxy_fill_with_spike_selected_w.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_fill_with_spike_selected_w.cu -o $(O_DIRS)kernel_phxy_fill_with_spike_selected_w.o + +$(O_DIRS)kernel_phxy_times_pxy.o: kernel_helper_functions.h \ + kernel_phxy_times_pxy.h kernel_phxy_times_pxy.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_times_pxy.cu -o $(O_DIRS)kernel_phxy_times_pxy.o + +$(O_DIRS)kernel_pxy_times_spike_selected_sxy.o: kernel_helper_functions.h \ + kernel_pxy_times_spike_selected_sxy.h kernel_pxy_times_spike_selected_sxy.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_times_spike_selected_sxy.cu -o $(O_DIRS)kernel_pxy_times_spike_selected_sxy.o + +$(O_DIRS)kernel_phxy_one_over_sum_into_pxy.o: kernel_helper_functions.h \ + kernel_phxy_one_over_sum_into_pxy.h kernel_phxy_one_over_sum_into_pxy.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_one_over_sum_into_pxy.cu -o $(O_DIRS)kernel_phxy_one_over_sum_into_pxy.o + +$(O_DIRS)kernel_pxy_plus_v.o: kernel_helper_functions.h \ + kernel_pxy_plus_v.h kernel_pxy_plus_v.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_plus_v.cu -o $(O_DIRS)kernel_pxy_plus_v.o + +$(O_DIRS)kernel_pxy_times_v.o: kernel_helper_functions.h \ + kernel_pxy_times_v.h kernel_pxy_times_v.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_times_v.cu -o $(O_DIRS)kernel_pxy_times_v.o + +$(O_DIRS)kernel_phxy_plus_phxy.o: kernel_helper_functions.h \ + kernel_phxy_plus_phxy.h kernel_phxy_plus_phxy.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_phxy_plus_phxy.cu -o $(O_DIRS)kernel_phxy_plus_phxy.o + +$(O_DIRS)kernel_pxy_reciprocal.o: kernel_helper_functions.h \ + kernel_pxy_reciprocal.h kernel_pxy_reciprocal.cu + mkdir -p $(O_DIRS) + $(NVCC) $(PARAMETERS_O) -c kernel_pxy_reciprocal.cu -o $(O_DIRS)kernel_pxy_reciprocal.o + + + +####################### +clean: + rm -rf $(O_DIRS) + rm -f ../$(so_file) + rm -f ../$(pyi_file) + diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/PyHDynamicCNNGPU.cpp b/network/h_dynamic_cnn_gpu_cpp_v2_pre/PyHDynamicCNNGPU.cpp new file mode 100644 index 0000000..d6676a2 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/PyHDynamicCNNGPU.cpp @@ -0,0 +1,18 @@ +#include + +#include "HDynamicCNNGPU.h" + +namespace py = pybind11; + +PYBIND11_MODULE(PyHDynamicCNNGPU, m) +{ + m.doc() = "HDynamicCNNManyIP Module"; + py::class_(m, "HDynamicCNNGPU") + .def(py::init<>()) + .def("gpu_occupancy_export", + &HDynamicCNNGPU::gpu_occupancy_export) + .def("gpu_occupancy_import", + &HDynamicCNNGPU::gpu_occupancy_import) + .def("update", + &HDynamicCNNGPU::entrypoint); +} \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_helper_functions.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_helper_functions.cu new file mode 100644 index 0000000..bd023e9 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_helper_functions.cu @@ -0,0 +1,17 @@ +#include + +#include "kernel_helper_functions.h" + +void kernel_debug_plot(std::vector output, bool display_debug) { + if (display_debug == true) { + std::cout << "grid x: " << output[0] << std::endl; + std::cout << "grid y: " << output[1] << std::endl; + std::cout << "grid z: " << output[2] << std::endl; + std::cout << "thread block x: " << output[3] << std::endl; + std::cout << "thread block y: " << output[4] << std::endl; + std::cout << "thread block z: " << output[5] << std::endl; + std::cout << "max_idx: " << output[6] << std::endl << std::endl; + } + + return; +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_helper_functions.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_helper_functions.h new file mode 100644 index 0000000..819910e --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_helper_functions.h @@ -0,0 +1,7 @@ +#ifndef KERNEL_HELPER_FUNCTIONS +#define KERNEL_HELPER_FUNCTIONS +#include + +void kernel_debug_plot(std::vector output, bool display_debug); + +#endif /* KERNEL_HELPER_FUNCTIONS */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_h.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_h.cu new file mode 100644 index 0000000..d2adf37 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_h.cu @@ -0,0 +1,64 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_fill_with_h.h" + +__global__ void kernel_phxy_fill_with_h(float* __restrict__ h_memory, + float* __restrict__ phxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t block_dim_c0, + size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t idx_h = idx / block_dim_c1; + idx -= idx_h * block_dim_c1; + size_t position_x = idx / block_dim_c2; + idx -= position_x * block_dim_c2; + size_t position_y = idx; + + phxy_memory[pattern_id * phxy_dim_c0 + idx_h * phxy_dim_c1 + + position_x * phxy_dim_c2 + position_y] = h_memory[idx_h]; + } +}; + +void occupancy_kernel_phxy_fill_with_h(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_phxy_fill_with_h, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_fill_with_h:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_h.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_h.h new file mode 100644 index 0000000..cbe8f5f --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_h.h @@ -0,0 +1,18 @@ +#ifndef KERNEL_PHXY_FILL_WITH_H +#define KERNEL_PHXY_FILL_WITH_H +#include + +__global__ void kernel_phxy_fill_with_h(float* __restrict__ h_memory, + float* __restrict__ phxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t block_dim_c0, + size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx); + +void occupancy_kernel_phxy_fill_with_h(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PHXY_FILL_WITH_H */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_spike_selected_w.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_spike_selected_w.cu new file mode 100644 index 0000000..0f721e1 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_spike_selected_w.cu @@ -0,0 +1,73 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_fill_with_spike_selected_w.h" + +__global__ void kernel_phxy_fill_with_spike_selected_w( + float* __restrict__ phxy_memory, float* __restrict__ weights_memory, + int64_t* __restrict__ spike_memory, size_t spike_time, + size_t weights_dim_c0, size_t spike_dim_c0, size_t spike_dim_c1, + size_t spike_dim_c2, size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t idx_h = idx / block_dim_c1; + idx -= idx_h * block_dim_c1; + size_t position_x = idx / block_dim_c2; + idx -= position_x * block_dim_c2; + size_t position_y = idx; + + int64_t* spike = spike_memory + pattern_id * spike_dim_c0 + + spike_time * spike_dim_c1 + position_x * spike_dim_c2 + + position_y; + + if (*spike >= 0) { + phxy_memory[pattern_id * phxy_dim_c0 + idx_h * phxy_dim_c1 + + position_x * phxy_dim_c2 + position_y] = + weights_memory[*spike * weights_dim_c0 + idx_h]; + } else { + phxy_memory[pattern_id * phxy_dim_c0 + idx_h * phxy_dim_c1 + + position_x * phxy_dim_c2 + position_y] = 0.0; + } + } +}; + +void occupancy_kernel_phxy_fill_with_spike_selected_w( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + std::vector& output, bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, + (void*)kernel_phxy_fill_with_spike_selected_w, 0, max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_fill_with_spike_selected_w:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_spike_selected_w.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_spike_selected_w.h new file mode 100644 index 0000000..3055238 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_fill_with_spike_selected_w.h @@ -0,0 +1,17 @@ +#ifndef KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W +#define KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W +#include + +__global__ void kernel_phxy_fill_with_spike_selected_w( + float* __restrict__ phxy_memory, float* __restrict__ weights_memory, + int64_t* __restrict__ spike_memory, size_t spike_time, + size_t weights_dim_c0, size_t spike_dim_c0, size_t spike_dim_c1, + size_t spike_dim_c2, size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx); + +void occupancy_kernel_phxy_fill_with_spike_selected_w( + size_t dim_x, size_t dim_y, size_t number_of_pattern, size_t h_dim, + std::vector& output, bool display_debug); + +#endif /* KERNEL_PHXY_FILL_WITH_SPIKE_SELECTED_W */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_one_over_sum_into_pxy.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_one_over_sum_into_pxy.cu new file mode 100644 index 0000000..475793f --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_one_over_sum_into_pxy.cu @@ -0,0 +1,74 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_one_over_sum_into_pxy.h" + +__global__ void kernel_phxy_one_over_sum_into_pxy( + float* __restrict__ phxy_memory, float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, size_t block_dim_c0, + size_t block_dim_c1, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t position_x = idx / block_dim_c1; + idx -= position_x * block_dim_c1; + size_t position_y = idx; + + size_t offset_phxy_temp = + pattern_id * phxy_dim_c0 + position_x * phxy_dim_c2 + position_y; + + size_t offset_pxy_sum = + pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + position_y; + + float temp = 0.0; + for (size_t idx_h = 0; idx_h < h_dim; idx_h++) { + temp += phxy_memory[offset_phxy_temp + idx_h * phxy_dim_c1]; + } + if (temp > 1E-10) { + pxy_memory[offset_pxy_sum] = 1.0 / temp; + } else { + pxy_memory[offset_pxy_sum] = 0.0; + } + } +}; + +void occupancy_kernel_phxy_one_over_sum_into_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, + (void*)kernel_phxy_one_over_sum_into_pxy, 0, max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_one_over_sum_into_pxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_one_over_sum_into_pxy.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_one_over_sum_into_pxy.h new file mode 100644 index 0000000..6f91445 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_one_over_sum_into_pxy.h @@ -0,0 +1,17 @@ +#ifndef KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY +#define KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY +#include + +__global__ void kernel_phxy_one_over_sum_into_pxy( + float* __restrict__ phxy_memory, float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, size_t block_dim_c0, + size_t block_dim_c1, size_t max_idx); + +void occupancy_kernel_phxy_one_over_sum_into_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PHXY_ONE_OVER_SUM_INTO_PXY */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_phxy.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_phxy.cu new file mode 100644 index 0000000..f9ee9c2 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_phxy.cu @@ -0,0 +1,51 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_plus_phxy.h" + +__global__ void kernel_phxy_plus_phxy(float* __restrict__ phxy_memory_a, + float* __restrict__ phxy_memory_b, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + phxy_memory_a[idx] += phxy_memory_b[idx]; + } +}; + +void occupancy_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_phxy_plus_phxy, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_plus_phxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_phxy.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_phxy.h new file mode 100644 index 0000000..a247c90 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_phxy.h @@ -0,0 +1,12 @@ +#ifndef KERNEL_PHXY_PLUS_PHXY +#define KERNEL_PHXY_PLUS_PHXY +#include +__global__ void kernel_phxy_plus_phxy(float* __restrict__ phxy_memory_a, + float* __restrict__ phxy_memory_b, + size_t max_idx); + +void occupancy_kernel_phxy_plus_phxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); +#endif /* KERNEL_PHXY_PLUS_PHXY */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_pxy.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_pxy.cu new file mode 100644 index 0000000..56bdac7 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_pxy.cu @@ -0,0 +1,70 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_plus_pxy.h" + +__global__ void kernel_phxy_plus_pxy(float* __restrict__ phxy_memory, + float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t idx_h = idx / block_dim_c1; + idx -= idx_h * block_dim_c1; + size_t position_x = idx / block_dim_c2; + idx -= position_x * block_dim_c2; + size_t position_y = idx; + + size_t offset_h_temp = + pattern_id * phxy_dim_c0 + position_x * phxy_dim_c2 + position_y; + + size_t offset_h_sum = + pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + position_y; + + phxy_memory[offset_h_temp + idx_h * phxy_dim_c1] += + pxy_memory[offset_h_sum]; + } +}; + +void occupancy_kernel_phxy_plus_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_phxy_plus_pxy, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_plus_pxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_pxy.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_pxy.h new file mode 100644 index 0000000..3bacc94 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_plus_pxy.h @@ -0,0 +1,16 @@ +#ifndef KERNEL_PHXY_PLUS_PXY +#define KERNEL_PHXY_PLUS_PXY +#include +__global__ void kernel_phxy_plus_pxy(float* __restrict__ phxy_memory, + float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx); + +void occupancy_kernel_phxy_plus_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); +#endif /* KERNEL_PHXY_PLUS_PXY */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_phxy_equals_phxy.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_phxy_equals_phxy.cu new file mode 100644 index 0000000..4a91337 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_phxy_equals_phxy.cu @@ -0,0 +1,53 @@ +#include +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_times_phxy_equals_phxy.h" + +__global__ void kernel_phxy_times_phxy_equals_phxy( + float* __restrict__ phxy_memory_a, float* __restrict__ phxy_memory_b, + float* __restrict__ phxy_memory_out, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + phxy_memory_out[idx] = phxy_memory_a[idx] * phxy_memory_b[idx]; + } +}; + +void occupancy_kernel_phxy_times_phxy_equals_phxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, + (void*)kernel_phxy_times_phxy_equals_phxy, 0, max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_times_phxy_equals_phxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_phxy_equals_phxy.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_phxy_equals_phxy.h new file mode 100644 index 0000000..b603c2c --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_phxy_equals_phxy.h @@ -0,0 +1,15 @@ +#ifndef KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY +#define KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY +#include + +__global__ void kernel_phxy_times_phxy_equals_phxy( + float* __restrict__ phxy_memory_a, float* __restrict__ phxy_memory_b, + float* __restrict__ phxy_memory_out, size_t max_idx); + +void occupancy_kernel_phxy_times_phxy_equals_phxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PHXY_TIMES_PHXY_EQUALS_PHXY */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_pxy.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_pxy.cu new file mode 100644 index 0000000..ed7c64e --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_pxy.cu @@ -0,0 +1,70 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_phxy_times_pxy.h" + +__global__ void kernel_phxy_times_pxy(float* __restrict__ phxy_memory, + float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t idx_h = idx / block_dim_c1; + idx -= idx_h * block_dim_c1; + size_t position_x = idx / block_dim_c2; + idx -= position_x * block_dim_c2; + size_t position_y = idx; + + size_t offset_h_temp = + pattern_id * phxy_dim_c0 + position_x * phxy_dim_c2 + position_y; + + size_t offset_h_sum = + pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + position_y; + + phxy_memory[offset_h_temp + idx_h * phxy_dim_c1] *= + pxy_memory[offset_h_sum]; + } +}; + +void occupancy_kernel_phxy_times_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * h_dim * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_phxy_times_pxy, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_phxy_times_pxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_pxy.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_pxy.h new file mode 100644 index 0000000..bad68a9 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_phxy_times_pxy.h @@ -0,0 +1,17 @@ +#ifndef KERNEL_PHXY_TIMES_PXY +#define KERNEL_PHXY_TIMES_PXY +#include + +__global__ void kernel_phxy_times_pxy(float* __restrict__ phxy_memory, + float* __restrict__ pxy_memory, + size_t phxy_dim_c0, size_t phxy_dim_c1, + size_t phxy_dim_c2, size_t h_dim, + size_t pxy_dim_c0, size_t pxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, + size_t block_dim_c2, size_t max_idx); + +void occupancy_kernel_phxy_times_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); +#endif /* KERNEL_PHXY_TIMES_PXY */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_plus_v.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_plus_v.cu new file mode 100644 index 0000000..92d5678 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_plus_v.cu @@ -0,0 +1,50 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_plus_v.h" + +__global__ void kernel_pxy_plus_v(float* __restrict__ pxy_memory, float value, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + pxy_memory[idx] += value; + } +}; + +void occupancy_kernel_pxy_plus_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_pxy_plus_v, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_plus_v:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_plus_v.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_plus_v.h new file mode 100644 index 0000000..853a268 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_plus_v.h @@ -0,0 +1,12 @@ +#ifndef KERNEL_PXY_PLUS_V +#define KERNEL_PXY_PLUS_V +#include + +__global__ void kernel_pxy_plus_v(float* __restrict__ pxy_memory, float value, + size_t max_idx); + +void occupancy_kernel_pxy_plus_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); +#endif /* KERNEL_PXY_PLUS_V */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_reciprocal.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_reciprocal.cu new file mode 100644 index 0000000..beac382 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_reciprocal.cu @@ -0,0 +1,50 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_reciprocal.h" + +__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + pxy_memory[idx] = 1.0 / pxy_memory[idx]; + } +}; + +void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_pxy_reciprocal, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_reciprocal:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_reciprocal.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_reciprocal.h new file mode 100644 index 0000000..f943914 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_reciprocal.h @@ -0,0 +1,12 @@ +#ifndef KERNEL_PXY_RECIPROCAL +#define KERNEL_PXY_RECIPROCAL +#include +__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory, + size_t max_idx); + +void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PXY_RECIPROCAL */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_set_to_v.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_set_to_v.cu new file mode 100644 index 0000000..4f7b640 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_set_to_v.cu @@ -0,0 +1,50 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_set_to_v.h" + +__global__ void kernel_pxy_set_to_v(float* __restrict__ pxy_memory, float value, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + pxy_memory[idx] = value; + } +}; + +void occupancy_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_pxy_set_to_v, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_set_to_v:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_set_to_v.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_set_to_v.h new file mode 100644 index 0000000..b2c31d2 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_set_to_v.h @@ -0,0 +1,13 @@ +#ifndef KERNEL_PXY_SET_TO_V +#define KERNEL_PXY_SET_TO_V +#include + +__global__ void kernel_pxy_set_to_v(float* __restrict__ pxy_memory, float value, + size_t max_idx); + +void occupancy_kernel_pxy_set_to_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PXY_SET_TO_V */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_time_pxy.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_time_pxy.cu new file mode 100644 index 0000000..1349a17 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_time_pxy.cu @@ -0,0 +1,58 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_time_pxy.h" + +// a *= b +__global__ void kernel_pxy_time_pxy(float* __restrict__ pxy_memory_a, + float* __restrict__ pxy_memory_b, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + pxy_memory_a[idx] *= pxy_memory_b[idx]; + } +}; + +void occupancy_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_pxy_time_pxy, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + size_t gpu_tuning_factor = 5; + + if ((gpu_tuning_factor > 0) && (gpu_tuning_factor < thread_block_size)) { + thread_block_size = int(gpu_tuning_factor); + } + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_time_pxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_time_pxy.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_time_pxy.h new file mode 100644 index 0000000..f2c67cb --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_time_pxy.h @@ -0,0 +1,15 @@ +#ifndef KERNEL_PXY_TIME_PXY +#define KERNEL_PXY_TIME_PXY + +#include + +__global__ void kernel_pxy_time_pxy(float* __restrict__ pxy_memory_a, + float* __restrict__ pxy_memory_b, + size_t max_idx); + +void occupancy_kernel_pxy_time_pxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PXY_TIME_PXY */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_spike_selected_sxy.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_spike_selected_sxy.cu new file mode 100644 index 0000000..c5bae95 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_spike_selected_sxy.cu @@ -0,0 +1,73 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_times_spike_selected_sxy.h" + +__global__ void kernel_pxy_times_spike_selected_sxy( + float* __restrict__ pxy_memory, float* __restrict__ sxy_memory, + int64_t* __restrict__ spike_memory, size_t spike_time, size_t spike_dim_c0, + size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0, + size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + size_t pattern_id = idx / block_dim_c0; + idx -= pattern_id * block_dim_c0; + size_t position_x = idx / block_dim_c1; + idx -= position_x * block_dim_c1; + size_t position_y = idx; + + int64_t* spike = spike_memory + pattern_id * spike_dim_c0 + + spike_time * spike_dim_c1 + position_x * spike_dim_c2 + + position_y; + + if (*spike >= 0) { + pxy_memory[pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + + position_y] *= + sxy_memory[*spike * sxy_dim_c0 + position_x * sxy_dim_c1 + + position_y]; + } else { + pxy_memory[pattern_id * pxy_dim_c0 + position_x * pxy_dim_c1 + + position_y] = 0; + } + } +}; + +void occupancy_kernel_pxy_times_spike_selected_sxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, + (void*)kernel_pxy_times_spike_selected_sxy, 0, max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_times_spike_selected_sxy:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; \ No newline at end of file diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_spike_selected_sxy.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_spike_selected_sxy.h new file mode 100644 index 0000000..4bc60d5 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_spike_selected_sxy.h @@ -0,0 +1,18 @@ +#ifndef KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY +#define KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY +#include + +__global__ void kernel_pxy_times_spike_selected_sxy( + float* __restrict__ pxy_memory, float* __restrict__ sxy_memory, + int64_t* __restrict__ spike_memory, size_t spike_time, size_t spike_dim_c0, + size_t spike_dim_c1, size_t spike_dim_c2, size_t pxy_dim_c0, + size_t pxy_dim_c1, size_t sxy_dim_c0, size_t sxy_dim_c1, + size_t block_dim_c0, size_t block_dim_c1, size_t max_idx); + +void occupancy_kernel_pxy_times_spike_selected_sxy(size_t dim_x, size_t dim_y, + size_t number_of_pattern, + size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PXY_TIMES_SPIKE_SELECTED_SXY */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_v.cu b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_v.cu new file mode 100644 index 0000000..1823f6f --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_v.cu @@ -0,0 +1,50 @@ +#include +#include + +#include "kernel_helper_functions.h" +#include "kernel_pxy_times_v.h" + +__global__ void kernel_pxy_times_v(float* __restrict__ pxy_memory, float value, + size_t max_idx) { + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < max_idx) { + pxy_memory[idx] *= value; + } +}; + +void occupancy_kernel_pxy_times_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug) { + size_t max_threadable_tasks; + cudaError_t status; + + int min_grid_size; + int thread_block_size; + int grid_size; + + max_threadable_tasks = number_of_pattern * dim_x * dim_y; + + status = cudaOccupancyMaxPotentialBlockSize( + &min_grid_size, &thread_block_size, (void*)kernel_pxy_times_v, 0, + max_threadable_tasks); + assert((status == cudaSuccess)); + + grid_size = + (max_threadable_tasks + thread_block_size - 1) / thread_block_size; + + output.resize(7); + output[0] = grid_size; + output[1] = 1; + output[2] = 1; + output[3] = thread_block_size; + output[4] = 1; + output[5] = 1; + output[6] = max_threadable_tasks; + + if (display_debug == true) { + std::cout << "kernel_pxy_times_v:" << std::endl; + kernel_debug_plot(output, display_debug); + } +}; diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_v.h b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_v.h new file mode 100644 index 0000000..611e0ec --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/kernel_pxy_times_v.h @@ -0,0 +1,13 @@ +#ifndef KERNEL_PXY_TIMES_V +#define KERNEL_PXY_TIMES_V +#include + +__global__ void kernel_pxy_times_v(float* __restrict__ pxy_memory, float value, + size_t max_idx); + +void occupancy_kernel_pxy_times_v(size_t dim_x, size_t dim_y, + size_t number_of_pattern, size_t h_dim, + std::vector& output, + bool display_debug); + +#endif /* KERNEL_PXY_TIMES_V */ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/HDynamicCNNGPU.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/HDynamicCNNGPU.o new file mode 100644 index 0000000..4ea2bad Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/HDynamicCNNGPU.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/PyHDynamicCNNGPU.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/PyHDynamicCNNGPU.o new file mode 100644 index 0000000..0381a0c Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/PyHDynamicCNNGPU.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_helper_functions.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_helper_functions.o new file mode 100644 index 0000000..1ebde9c Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_helper_functions.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_fill_with_h.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_fill_with_h.o new file mode 100644 index 0000000..9c19be0 Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_fill_with_h.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_fill_with_spike_selected_w.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_fill_with_spike_selected_w.o new file mode 100644 index 0000000..92dd038 Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_fill_with_spike_selected_w.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_one_over_sum_into_pxy.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_one_over_sum_into_pxy.o new file mode 100644 index 0000000..790103f Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_one_over_sum_into_pxy.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_plus_phxy.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_plus_phxy.o new file mode 100644 index 0000000..9b82ae4 Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_plus_phxy.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_plus_pxy.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_plus_pxy.o new file mode 100644 index 0000000..0566b8f Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_plus_pxy.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_times_phxy_equals_phxy.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_times_phxy_equals_phxy.o new file mode 100644 index 0000000..cdfb851 Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_times_phxy_equals_phxy.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_times_pxy.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_times_pxy.o new file mode 100644 index 0000000..a50fd9c Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_phxy_times_pxy.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_plus_v.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_plus_v.o new file mode 100644 index 0000000..7a14bcd Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_plus_v.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_reciprocal.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_reciprocal.o new file mode 100644 index 0000000..752091a Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_reciprocal.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_set_to_v.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_set_to_v.o new file mode 100644 index 0000000..f0aa1de Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_set_to_v.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_time_pxy.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_time_pxy.o new file mode 100644 index 0000000..62b841c Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_time_pxy.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_times_spike_selected_sxy.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_times_spike_selected_sxy.o new file mode 100644 index 0000000..e68b6bb Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_times_spike_selected_sxy.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_times_v.o b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_times_v.o new file mode 100644 index 0000000..c967384 Binary files /dev/null and b/network/h_dynamic_cnn_gpu_cpp_v2_pre/o/kernel_pxy_times_v.o differ diff --git a/network/h_dynamic_cnn_gpu_cpp_v2_pre/test.sh b/network/h_dynamic_cnn_gpu_cpp_v2_pre/test.sh new file mode 100644 index 0000000..0a5f120 --- /dev/null +++ b/network/h_dynamic_cnn_gpu_cpp_v2_pre/test.sh @@ -0,0 +1,14 @@ +make o/kernel_helper_functions.o +make o/kernel_phxy_fill_with_h.o +make o/kernel_phxy_fill_with_spike_selected_w.o +make o/kernel_phxy_one_over_sum_into_pxy.o +make o/kernel_phxy_plus_phxy.o +make o/kernel_phxy_plus_pxy.o +make o/kernel_phxy_times_phxy_equals_phxy.o +make o/kernel_phxy_times_pxy.o +make o/kernel_pxy_plus_v.o +make o/kernel_pxy_reciprocal.o +make o/kernel_pxy_set_to_v.o +make o/kernel_pxy_time_pxy.o +make o/kernel_pxy_times_spike_selected_sxy.o +make o/kernel_pxy_times_v.o