Update README.md
Signed-off-by: David Rotermund <54365609+davrot@users.noreply.github.com>
This commit is contained in:
parent
1fe8847737
commit
8b5f36e9cb
1 changed files with 214 additions and 0 deletions
|
@ -129,4 +129,218 @@ assert((input_dim_2 > 0));
|
||||||
assert((input_dim_3 > 0));
|
assert((input_dim_3 > 0));
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## GPU
|
||||||
|
|
||||||
|
### .env File
|
||||||
|
```Makefile
|
||||||
|
PYBIN=~/P3.11/bin/
|
||||||
|
CC=/usr/lib64/ccache/clang++
|
||||||
|
NVCC=/usr/local/cuda-12/bin/nvcc -allow-unsupported-compiler
|
||||||
|
|
||||||
|
PARAMETERS_O_CPU = -O3 -std=c++14 -fPIC -Wall -fopenmp=libomp
|
||||||
|
PARAMETERS_Linker_CPU = -shared -lm -lomp -lstdc++ -Wall
|
||||||
|
|
||||||
|
PARAMETERS_O_GPU= -O3 -std=c++14 -ccbin=$(CC) \
|
||||||
|
-Xcompiler "-fPIC -Wall -fopenmp=libomp"
|
||||||
|
PARAMETERS_Linker_GPU=-Xcompiler "-shared -lm -lomp -lstdc++ -Wall"
|
||||||
|
|
||||||
|
O_DIRS = o/
|
||||||
|
```
|
||||||
|
|
||||||
|
### Makefile
|
||||||
|
|
||||||
|
```Makefile
|
||||||
|
include .env
|
||||||
|
export
|
||||||
|
|
||||||
|
name = HDynamicCNN
|
||||||
|
type = GPU
|
||||||
|
|
||||||
|
PYPOSTFIX := $(shell $(PYBIN)python3-config --extension-suffix)
|
||||||
|
PYBIND11INCLUDE := $(shell $(PYBIN)python3 -m pybind11 --includes)
|
||||||
|
PARAMETERS_O = $(PARAMETERS_O_GPU) $(PYBIND11INCLUDE)
|
||||||
|
PARAMETERS_Linker = $(PARAMETERS_Linker_GPU)
|
||||||
|
|
||||||
|
so_file = Py$(name)$(type)$(PYPOSTFIX)
|
||||||
|
pyi_file = Py$(name)$(type).pyi
|
||||||
|
all: $(so_file)
|
||||||
|
|
||||||
|
$(O_DIRS)$(name)$(type).o: $(name)$(type).h $(name)$(type).cu
|
||||||
|
mkdir -p $(O_DIRS)
|
||||||
|
$(NVCC) $(PARAMETERS_O) -c $(name)$(type).cu -o $(O_DIRS)$(name)$(type).o
|
||||||
|
|
||||||
|
$(O_DIRS)Py$(name)$(type).o: $(name)$(type).h Py$(name)$(type).cpp
|
||||||
|
mkdir -p $(O_DIRS)
|
||||||
|
$(NVCC) $(PARAMETERS_O) -c Py$(name)$(type).cpp -o $(O_DIRS)Py$(name)$(type).o
|
||||||
|
|
||||||
|
$(so_file): $(O_DIRS)$(name)$(type).o $(O_DIRS)Py$(name)$(type).o
|
||||||
|
$(NVCC) $(PARAMETERS_Linker) -o $(so_file) $(O_DIRS)$(name)$(type).o $(O_DIRS)Py$(name)$(type).o
|
||||||
|
|
||||||
|
|
||||||
|
#######################
|
||||||
|
clean:
|
||||||
|
rm -rf $(O_DIRS)
|
||||||
|
rm -f $(so_file)
|
||||||
|
rm -f $(pyi_file)
|
||||||
|
```
|
||||||
|
|
||||||
|
### .cu File
|
||||||
|
|
||||||
|
Some note:
|
||||||
|
|
||||||
|
|
||||||
|
#### CuBlas
|
||||||
|
|
||||||
|
You want to use [cublas](https://docs.nvidia.com/cuda/cublas/index.html) if possible.
|
||||||
|
|
||||||
|
### Create and free memory
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
cudaError_t status;
|
||||||
|
float* w_memory = nullptr;
|
||||||
|
status = cudaMalloc((void**)&w_memory, number_of_elements * sizeof(float));
|
||||||
|
assert((status == cudaSuccess));
|
||||||
|
```
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
status = cudaFree(w_memory);
|
||||||
|
assert((status == cudaSuccess));
|
||||||
|
```
|
||||||
|
### Own kernels
|
||||||
|
|
||||||
|
|
||||||
|
**Writing a working Cuda Kernel is easy. However, writing a fast one is really really hard. Beside a good Cuda code, you need a good setting for the parallelization parameters. And those depend on the problem AND the GPU you are using.**
|
||||||
|
|
||||||
|
If you really want to make your own GPU kernel, don't forget:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
status = cudaDeviceSynchronize();
|
||||||
|
assert((status == cudaSuccess));
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example Kernel:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
kernel_pxy_reciprocal<<<
|
||||||
|
dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][0],
|
||||||
|
grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][1],
|
||||||
|
grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][2]),
|
||||||
|
dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][3],
|
||||||
|
grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][4],
|
||||||
|
grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][5])>>>(
|
||||||
|
epsilon_scale_memory,
|
||||||
|
grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][6]);
|
||||||
|
```
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
occupancy_kernel_pxy_reciprocal(
|
||||||
|
dim_x, dim_y, number_of_pattern, h_dim,
|
||||||
|
grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL], display_debug);
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
#### kernel_pxy_reciprocal.h
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
#ifndef KERNEL_PXY_RECIPROCAL
|
||||||
|
#define KERNEL_PXY_RECIPROCAL
|
||||||
|
#include <vector>
|
||||||
|
__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory,
|
||||||
|
size_t max_idx);
|
||||||
|
|
||||||
|
void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
|
||||||
|
size_t number_of_pattern, size_t h_dim,
|
||||||
|
std::vector<size_t>& output,
|
||||||
|
bool display_debug);
|
||||||
|
|
||||||
|
#endif /* KERNEL_PXY_RECIPROCAL */
|
||||||
|
```
|
||||||
|
|
||||||
|
#### kernel_pxy_reciprocal.cu
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
#include <cassert>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include "kernel_helper_functions.h"
|
||||||
|
#include "kernel_pxy_reciprocal.h"
|
||||||
|
|
||||||
|
__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory,
|
||||||
|
size_t max_idx) {
|
||||||
|
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
|
|
||||||
|
if (idx < max_idx) {
|
||||||
|
pxy_memory[idx] = 1.0 / pxy_memory[idx];
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
|
||||||
|
size_t number_of_pattern, size_t h_dim,
|
||||||
|
std::vector<size_t>& output,
|
||||||
|
bool display_debug) {
|
||||||
|
size_t max_threadable_tasks;
|
||||||
|
cudaError_t status;
|
||||||
|
|
||||||
|
int min_grid_size;
|
||||||
|
int thread_block_size;
|
||||||
|
int grid_size;
|
||||||
|
|
||||||
|
max_threadable_tasks = number_of_pattern * dim_x * dim_y;
|
||||||
|
|
||||||
|
status = cudaOccupancyMaxPotentialBlockSize(
|
||||||
|
&min_grid_size, &thread_block_size, (void*)kernel_pxy_reciprocal, 0,
|
||||||
|
max_threadable_tasks);
|
||||||
|
assert((status == cudaSuccess));
|
||||||
|
|
||||||
|
grid_size =
|
||||||
|
(max_threadable_tasks + thread_block_size - 1) / thread_block_size;
|
||||||
|
|
||||||
|
output.resize(7);
|
||||||
|
output[0] = grid_size;
|
||||||
|
output[1] = 1;
|
||||||
|
output[2] = 1;
|
||||||
|
output[3] = thread_block_size;
|
||||||
|
output[4] = 1;
|
||||||
|
output[5] = 1;
|
||||||
|
output[6] = max_threadable_tasks;
|
||||||
|
|
||||||
|
if (display_debug == true) {
|
||||||
|
std::cout << "kernel_pxy_reciprocal:" << std::endl;
|
||||||
|
kernel_debug_plot(output, display_debug);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
#### kernel_helper_functions.h
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
#ifndef KERNEL_HELPER_FUNCTIONS
|
||||||
|
#define KERNEL_HELPER_FUNCTIONS
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
void kernel_debug_plot(std::vector<size_t> output, bool display_debug);
|
||||||
|
|
||||||
|
#endif /* KERNEL_HELPER_FUNCTIONS */
|
||||||
|
```
|
||||||
|
|
||||||
|
#### kernel_helper_functions.cu
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include "kernel_helper_functions.h"
|
||||||
|
|
||||||
|
void kernel_debug_plot(std::vector<size_t> output, bool display_debug) {
|
||||||
|
if (display_debug == true) {
|
||||||
|
std::cout << "grid x: " << output[0] << std::endl;
|
||||||
|
std::cout << "grid y: " << output[1] << std::endl;
|
||||||
|
std::cout << "grid z: " << output[2] << std::endl;
|
||||||
|
std::cout << "thread block x: " << output[3] << std::endl;
|
||||||
|
std::cout << "thread block y: " << output[4] << std::endl;
|
||||||
|
std::cout << "thread block z: " << output[5] << std::endl;
|
||||||
|
std::cout << "max_idx: " << output[6] << std::endl << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
Loading…
Reference in a new issue