Update README.md

Signed-off-by: David Rotermund <54365609+davrot@users.noreply.github.com>
2025-06-06 22:00:01 +02:00 · 2024-01-03 17:09:08 +01:00 · 2024-01-03 17:09:08 +01:00 · 8b5f36e9cb
commit 8b5f36e9cb
parent 1fe8847737
1 changed files with 214 additions and 0 deletions
--- a/PyBind11/direct/README.md
+++ b/PyBind11/direct/README.md
@ -129,4 +129,218 @@ assert((input_dim_2 > 0));
 assert((input_dim_3 > 0));
 ```
 ## GPU
 ### .env File
 ```Makefile
 PYBIN=~/P3.11/bin/
 CC=/usr/lib64/ccache/clang++
 NVCC=/usr/local/cuda-12/bin/nvcc -allow-unsupported-compiler 
 PARAMETERS_O_CPU = -O3 -std=c++14 -fPIC -Wall -fopenmp=libomp
 PARAMETERS_Linker_CPU = -shared -lm -lomp -lstdc++ -Wall
 PARAMETERS_O_GPU= -O3 -std=c++14 -ccbin=$(CC) \
    -Xcompiler "-fPIC -Wall -fopenmp=libomp"   
 PARAMETERS_Linker_GPU=-Xcompiler "-shared -lm -lomp -lstdc++ -Wall"
 O_DIRS = o/
 ```
 ### Makefile
 ```Makefile
 include .env
 export
 name = HDynamicCNN
 type = GPU
 PYPOSTFIX := $(shell $(PYBIN)python3-config --extension-suffix)
 PYBIND11INCLUDE := $(shell $(PYBIN)python3 -m pybind11 --includes)
 PARAMETERS_O = $(PARAMETERS_O_GPU) $(PYBIND11INCLUDE) 
 PARAMETERS_Linker = $(PARAMETERS_Linker_GPU)
 so_file = Py$(name)$(type)$(PYPOSTFIX)
 pyi_file = Py$(name)$(type).pyi
 all: $(so_file)
 $(O_DIRS)$(name)$(type).o: $(name)$(type).h $(name)$(type).cu
 	mkdir -p $(O_DIRS) 
 	$(NVCC) $(PARAMETERS_O) -c $(name)$(type).cu -o $(O_DIRS)$(name)$(type).o
 $(O_DIRS)Py$(name)$(type).o: $(name)$(type).h Py$(name)$(type).cpp 
 	mkdir -p $(O_DIRS)
 	$(NVCC) $(PARAMETERS_O) -c Py$(name)$(type).cpp -o $(O_DIRS)Py$(name)$(type).o
 $(so_file): $(O_DIRS)$(name)$(type).o $(O_DIRS)Py$(name)$(type).o
 	$(NVCC) $(PARAMETERS_Linker) -o $(so_file) $(O_DIRS)$(name)$(type).o $(O_DIRS)Py$(name)$(type).o
 #######################
 clean:
 	rm -rf $(O_DIRS)
 	rm -f $(so_file)
 	rm -f $(pyi_file)
 ```
 ### .cu File
 Some note: 
 #### CuBlas
 You want to use [cublas](https://docs.nvidia.com/cuda/cublas/index.html) if possible. 
 ### Create and free memory
 ```cpp
    cudaError_t status;
    float* w_memory = nullptr;
    status = cudaMalloc((void**)&w_memory, number_of_elements * sizeof(float));
    assert((status == cudaSuccess));
 ```
 ```cpp
    status = cudaFree(w_memory);
    assert((status == cudaSuccess));
 ```
 ### Own kernels
 **Writing a working Cuda Kernel is easy. However, writing a fast one is really really hard. Beside a good Cuda code, you need a good setting for the parallelization parameters. And those depend on the problem AND the GPU you are using.**
 If you really want to make your own GPU kernel, don't forget: 
 ```cpp
    status = cudaDeviceSynchronize();
    assert((status == cudaSuccess));
 ```
 ### Example Kernel: 
 ```cpp
            kernel_pxy_reciprocal<<<
                dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][0],
                    grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][1],
                    grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][2]),
                dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][3],
                    grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][4],
                    grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][5])>>>(
                        epsilon_scale_memory,
                        grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][6]);
 ```
 ```cpp
    occupancy_kernel_pxy_reciprocal(
        dim_x, dim_y, number_of_pattern, h_dim,
        grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL], display_debug);
 ```
 #### kernel_pxy_reciprocal.h
 ```cpp
 #ifndef KERNEL_PXY_RECIPROCAL
 #define KERNEL_PXY_RECIPROCAL
 #include <vector>
 __global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory,
                                      size_t max_idx);
 void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
                                     size_t number_of_pattern, size_t h_dim,
                                     std::vector<size_t>& output,
                                     bool display_debug);
 #endif /* KERNEL_PXY_RECIPROCAL */
 ```
 #### kernel_pxy_reciprocal.cu
 ```cpp
 #include <cassert>
 #include <iostream>
 #include "kernel_helper_functions.h"
 #include "kernel_pxy_reciprocal.h"
 __global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory,
                                      size_t max_idx) {
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < max_idx) {
    pxy_memory[idx] = 1.0 / pxy_memory[idx];
  }
 };
 void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
                                     size_t number_of_pattern, size_t h_dim,
                                     std::vector<size_t>& output,
                                     bool display_debug) {
  size_t max_threadable_tasks;
  cudaError_t status;
  int min_grid_size;
  int thread_block_size;
  int grid_size;
  max_threadable_tasks = number_of_pattern * dim_x * dim_y;
  status = cudaOccupancyMaxPotentialBlockSize(
      &min_grid_size, &thread_block_size, (void*)kernel_pxy_reciprocal, 0,
      max_threadable_tasks);
  assert((status == cudaSuccess));
  grid_size =
      (max_threadable_tasks + thread_block_size - 1) / thread_block_size;
  output.resize(7);
  output[0] = grid_size;
  output[1] = 1;
  output[2] = 1;
  output[3] = thread_block_size;
  output[4] = 1;
  output[5] = 1;
  output[6] = max_threadable_tasks;
  if (display_debug == true) {
    std::cout << "kernel_pxy_reciprocal:" << std::endl;
    kernel_debug_plot(output, display_debug);
  }
 };
 ```
 #### kernel_helper_functions.h
 ```cpp
 #ifndef KERNEL_HELPER_FUNCTIONS
 #define KERNEL_HELPER_FUNCTIONS
 #include <vector>
 void kernel_debug_plot(std::vector<size_t> output, bool display_debug);
 #endif /* KERNEL_HELPER_FUNCTIONS */
 ```
 #### kernel_helper_functions.cu 
 ```cpp
 #include <iostream>
 #include "kernel_helper_functions.h"
 void kernel_debug_plot(std::vector<size_t> output, bool display_debug) {
  if (display_debug == true) {
    std::cout << "grid x: " << output[0] << std::endl;
    std::cout << "grid y: " << output[1] << std::endl;
    std::cout << "grid z: " << output[2] << std::endl;
    std::cout << "thread block x: " << output[3] << std::endl;
    std::cout << "thread block y: " << output[4] << std::endl;
    std::cout << "thread block z: " << output[5] << std::endl;
    std::cout << "max_idx: " << output[6] << std::endl << std::endl;
  }
  return;
 };
 ```