diff --git a/PyBind11/direct/README.md b/PyBind11/direct/README.md
index 2b1370c..d1f9862 100644
--- a/PyBind11/direct/README.md
+++ b/PyBind11/direct/README.md
@@ -129,4 +129,218 @@
assert((input_dim_2 > 0));
assert((input_dim_3 > 0));
```

## GPU

### .env File

```Makefile
PYBIN=~/P3.11/bin/
CC=/usr/lib64/ccache/clang++
NVCC=/usr/local/cuda-12/bin/nvcc -allow-unsupported-compiler

PARAMETERS_O_CPU = -O3 -std=c++14 -fPIC -Wall -fopenmp=libomp
PARAMETERS_Linker_CPU = -shared -lm -lomp -lstdc++ -Wall

PARAMETERS_O_GPU = -O3 -std=c++14 -ccbin=$(CC) \
    -Xcompiler "-fPIC -Wall -fopenmp=libomp"
PARAMETERS_Linker_GPU = -Xcompiler "-shared -lm -lomp -lstdc++ -Wall"

O_DIRS = o/
```

### Makefile

```Makefile
include .env
export

name = HDynamicCNN
type = GPU

PYPOSTFIX := $(shell $(PYBIN)python3-config --extension-suffix)
PYBIND11INCLUDE := $(shell $(PYBIN)python3 -m pybind11 --includes)
PARAMETERS_O = $(PARAMETERS_O_GPU) $(PYBIND11INCLUDE)
PARAMETERS_Linker = $(PARAMETERS_Linker_GPU)

so_file = Py$(name)$(type)$(PYPOSTFIX)
pyi_file = Py$(name)$(type).pyi

all: $(so_file)

$(O_DIRS)$(name)$(type).o: $(name)$(type).h $(name)$(type).cu
	mkdir -p $(O_DIRS)
	$(NVCC) $(PARAMETERS_O) -c $(name)$(type).cu -o $(O_DIRS)$(name)$(type).o

$(O_DIRS)Py$(name)$(type).o: $(name)$(type).h Py$(name)$(type).cpp
	mkdir -p $(O_DIRS)
	$(NVCC) $(PARAMETERS_O) -c Py$(name)$(type).cpp -o $(O_DIRS)Py$(name)$(type).o

$(so_file): $(O_DIRS)$(name)$(type).o $(O_DIRS)Py$(name)$(type).o
	$(NVCC) $(PARAMETERS_Linker) -o $(so_file) $(O_DIRS)$(name)$(type).o $(O_DIRS)Py$(name)$(type).o


#######################
clean:
	rm -rf $(O_DIRS)
	rm -f $(so_file)
	rm -f $(pyi_file)
```

### .cu File

Some notes:

### cuBLAS

Use [cuBLAS](https://docs.nvidia.com/cuda/cublas/index.html) whenever possible instead of writing your own kernels; its routines are already tuned for your GPU. If you do, add `-lcublas` to the linker parameters. A short example follows the memory section below.

### Create and free memory

```cpp
  cudaError_t status;
  float* w_memory = nullptr;
  status = cudaMalloc((void**)&w_memory, number_of_elements * sizeof(float));
  assert((status == cudaSuccess));
```

```cpp
  status = cudaFree(w_memory);
  assert((status == cudaSuccess));
```
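Tying the cuBLAS note to this allocation pattern, here is a minimal sketch (not part of this repository) that scales such a device buffer in place with `cublasSscal`; `d_x` and `n` are placeholders for a device pointer and its element count:

```cpp
#include <cassert>
#include <cublas_v2.h>

// Scale the device vector d_x (length n) in place: d_x *= alpha.
void scale_on_gpu(float* d_x, int n, float alpha) {
  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate(&handle);
  assert((status == CUBLAS_STATUS_SUCCESS));

  // alpha is read from host memory (default CUBLAS_POINTER_MODE_HOST).
  status = cublasSscal(handle, n, &alpha, d_x, 1);
  assert((status == CUBLAS_STATUS_SUCCESS));

  status = cublasDestroy(handle);
  assert((status == CUBLAS_STATUS_SUCCESS));
}
```

Creating a cuBLAS handle is comparatively expensive; in real code you would create it once and reuse it across calls.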
### Own kernels

**Writing a working CUDA kernel is easy. However, writing a fast one is really, really hard. Besides good CUDA code, you need good settings for the parallelization parameters, and those depend on the problem AND the GPU you are using.**

If you really want to write your own GPU kernel, don't forget that kernel launches are asynchronous. Wait for the kernel to finish before you use its results on the host:

```cpp
  status = cudaDeviceSynchronize();
  assert((status == cudaSuccess));
```

### Example Kernel

The launch parameters come from a seven-entry settings vector: grid x/y/z in entries 0-2, thread block x/y/z in entries 3-5, and the number of tasks (`max_idx`) in entry 6.

```cpp
  kernel_pxy_reciprocal<<<
      dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][0],
           grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][1],
           grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][2]),
      dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][3],
           grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][4],
           grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][5])>>>(
      epsilon_scale_memory,
      grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][6]);
```

The settings vector is filled beforehand by the occupancy helper:

```cpp
  occupancy_kernel_pxy_reciprocal(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL], display_debug);
```

#### kernel_pxy_reciprocal.h

```cpp
#ifndef KERNEL_PXY_RECIPROCAL
#define KERNEL_PXY_RECIPROCAL

#include <vector>

__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory,
                                      size_t max_idx);

void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
                                     size_t number_of_pattern, size_t h_dim,
                                     std::vector<size_t>& output,
                                     bool display_debug);

#endif /* KERNEL_PXY_RECIPROCAL */
```

#### kernel_pxy_reciprocal.cu

```cpp
#include <cassert>
#include <iostream>

#include "kernel_helper_functions.h"
#include "kernel_pxy_reciprocal.h"

__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory,
                                      size_t max_idx) {
  // One task per thread, mapped via the x dimension only.
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;

  if (idx < max_idx) {
    pxy_memory[idx] = 1.0f / pxy_memory[idx];
  }
}

void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
                                     size_t number_of_pattern, size_t h_dim,
                                     std::vector<size_t>& output,
                                     bool display_debug) {
  size_t max_threadable_tasks;
  cudaError_t status;

  int min_grid_size;
  int thread_block_size;
  int grid_size;

  max_threadable_tasks = number_of_pattern * dim_x * dim_y;

  // Let the CUDA runtime suggest a thread block size with good occupancy.
  status = cudaOccupancyMaxPotentialBlockSize(
      &min_grid_size, &thread_block_size, (void*)kernel_pxy_reciprocal, 0,
      max_threadable_tasks);
  assert((status == cudaSuccess));

  // Round up so that grid_size * thread_block_size covers all tasks.
  grid_size =
      (max_threadable_tasks + thread_block_size - 1) / thread_block_size;

  output.resize(7);
  output[0] = grid_size;
  output[1] = 1;
  output[2] = 1;
  output[3] = thread_block_size;
  output[4] = 1;
  output[5] = 1;
  output[6] = max_threadable_tasks;

  if (display_debug == true) {
    std::cout << "kernel_pxy_reciprocal:" << std::endl;
    kernel_debug_plot(output, display_debug);
  }
}
```

#### kernel_helper_functions.h

```cpp
#ifndef KERNEL_HELPER_FUNCTIONS
#define KERNEL_HELPER_FUNCTIONS

#include <vector>

void kernel_debug_plot(std::vector<size_t> output, bool display_debug);

#endif /* KERNEL_HELPER_FUNCTIONS */
```

#### kernel_helper_functions.cu

```cpp
#include <iostream>

#include "kernel_helper_functions.h"

void kernel_debug_plot(std::vector<size_t> output, bool display_debug) {
  if (display_debug == true) {
    std::cout << "grid x: " << output[0] << std::endl;
    std::cout << "grid y: " << output[1] << std::endl;
    std::cout << "grid z: " << output[2] << std::endl;
    std::cout << "thread block x: " << output[3] << std::endl;
    std::cout << "thread block y: " << output[4] << std::endl;
    std::cout << "thread block z: " << output[5] << std::endl;
    std::cout << "max_idx: " << output[6] << std::endl << std::endl;
  }

  return;
}
```
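Putting the pieces together, here is a minimal host-side sketch of the full flow: allocate device memory, let the occupancy helper pick the launch configuration, launch, synchronize, and free. The function name `reciprocal_on_gpu` and the elided copy steps are illustrative, not part of the repository.

```cpp
#include <cassert>
#include <vector>

#include "kernel_pxy_reciprocal.h"

// Hypothetical driver: take the reciprocal of every element of a
// buffer of number_of_pattern * dim_x * dim_y floats on the GPU.
void reciprocal_on_gpu(size_t dim_x, size_t dim_y, size_t number_of_pattern,
                       size_t h_dim, bool display_debug) {
  size_t number_of_elements = number_of_pattern * dim_x * dim_y;

  float* w_memory = nullptr;
  cudaError_t status =
      cudaMalloc((void**)&w_memory, number_of_elements * sizeof(float));
  assert((status == cudaSuccess));

  // ... fill w_memory, e.g. with cudaMemcpy from a host buffer ...

  // Let the occupancy helper pick grid and thread block sizes.
  std::vector<size_t> settings;
  occupancy_kernel_pxy_reciprocal(dim_x, dim_y, number_of_pattern, h_dim,
                                  settings, display_debug);

  kernel_pxy_reciprocal<<<dim3(settings[0], settings[1], settings[2]),
                          dim3(settings[3], settings[4], settings[5])>>>(
      w_memory, settings[6]);

  // The launch is asynchronous; wait before touching the result.
  status = cudaDeviceSynchronize();
  assert((status == cudaSuccess));

  // ... copy the result back to the host ...

  status = cudaFree(w_memory);
  assert((status == cudaSuccess));
}
```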