diff --git a/PyBind11/direct/README.md b/PyBind11/direct/README.md
index 2b1370c..d1f9862 100644
--- a/PyBind11/direct/README.md
+++ b/PyBind11/direct/README.md
@@ -129,4 +129,218 @@
assert((input_dim_2 > 0));
assert((input_dim_3 > 0));
```

## GPU

### .env File

```Makefile
PYBIN=~/P3.11/bin/
CC=/usr/lib64/ccache/clang++
NVCC=/usr/local/cuda-12/bin/nvcc -allow-unsupported-compiler

PARAMETERS_O_CPU = -O3 -std=c++14 -fPIC -Wall -fopenmp=libomp
PARAMETERS_Linker_CPU = -shared -lm -lomp -lstdc++ -Wall

PARAMETERS_O_GPU = -O3 -std=c++14 -ccbin=$(CC) \
    -Xcompiler "-fPIC -Wall -fopenmp=libomp"
PARAMETERS_Linker_GPU = -Xcompiler "-shared -lm -lomp -lstdc++ -Wall"

O_DIRS = o/
```

### Makefile

```Makefile
include .env
export

name = HDynamicCNN
type = GPU

PYPOSTFIX := $(shell $(PYBIN)python3-config --extension-suffix)
PYBIND11INCLUDE := $(shell $(PYBIN)python3 -m pybind11 --includes)
PARAMETERS_O = $(PARAMETERS_O_GPU) $(PYBIND11INCLUDE)
PARAMETERS_Linker = $(PARAMETERS_Linker_GPU)

so_file = Py$(name)$(type)$(PYPOSTFIX)
pyi_file = Py$(name)$(type).pyi

all: $(so_file)

$(O_DIRS)$(name)$(type).o: $(name)$(type).h $(name)$(type).cu
	mkdir -p $(O_DIRS)
	$(NVCC) $(PARAMETERS_O) -c $(name)$(type).cu -o $(O_DIRS)$(name)$(type).o

$(O_DIRS)Py$(name)$(type).o: $(name)$(type).h Py$(name)$(type).cpp
	mkdir -p $(O_DIRS)
	$(NVCC) $(PARAMETERS_O) -c Py$(name)$(type).cpp -o $(O_DIRS)Py$(name)$(type).o

$(so_file): $(O_DIRS)$(name)$(type).o $(O_DIRS)Py$(name)$(type).o
	$(NVCC) $(PARAMETERS_Linker) -o $(so_file) $(O_DIRS)$(name)$(type).o $(O_DIRS)Py$(name)$(type).o


#######################
clean:
	rm -rf $(O_DIRS)
	rm -f $(so_file)
	rm -f $(pyi_file)
```

### .cu File

Some notes:

### cuBLAS

Use [cuBLAS](https://docs.nvidia.com/cuda/cublas/index.html) whenever possible instead of writing your own kernels; its routines are already tuned for your GPU. If you do, add `-lcublas` to the linker parameters. A short example follows the memory section below.

### Create and free memory

```cpp
  cudaError_t status;
  float* w_memory = nullptr;
  status = cudaMalloc((void**)&w_memory, number_of_elements * sizeof(float));
  assert((status == cudaSuccess));
```

```cpp
  status = cudaFree(w_memory);
  assert((status == cudaSuccess));
```
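Tying the cuBLAS note to this allocation pattern, here is a minimal sketch (not part of this repository) that scales such a device buffer in place with `cublasSscal`; `d_x` and `n` are placeholders for a device pointer and its element count:

```cpp
#include <cassert>
#include <cublas_v2.h>

// Scale the device vector d_x (length n) in place: d_x *= alpha.
void scale_on_gpu(float* d_x, int n, float alpha) {
  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate(&handle);
  assert((status == CUBLAS_STATUS_SUCCESS));

  // alpha is read from host memory (default CUBLAS_POINTER_MODE_HOST).
  status = cublasSscal(handle, n, &alpha, d_x, 1);
  assert((status == CUBLAS_STATUS_SUCCESS));

  status = cublasDestroy(handle);
  assert((status == CUBLAS_STATUS_SUCCESS));
}
```

Creating a cuBLAS handle is comparatively expensive; in real code you would create it once and reuse it across calls.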
### Own kernels

**Writing a working CUDA kernel is easy. However, writing a fast one is really, really hard. Besides good CUDA code, you need good settings for the parallelization parameters, and those depend on the problem AND the GPU you are using.**

If you really want to write your own GPU kernel, don't forget that kernel launches are asynchronous. Wait for the kernel to finish before you use its results on the host:

```cpp
  status = cudaDeviceSynchronize();
  assert((status == cudaSuccess));
```

### Example Kernel

The launch parameters come from a seven-entry settings vector: grid x/y/z in entries 0-2, thread block x/y/z in entries 3-5, and the number of tasks (`max_idx`) in entry 6.

```cpp
  kernel_pxy_reciprocal<<<
      dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][0],
           grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][1],
           grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][2]),
      dim3(grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][3],
           grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][4],
           grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][5])>>>(
      epsilon_scale_memory,
      grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL][6]);
```

The settings vector is filled beforehand by the occupancy helper:

```cpp
  occupancy_kernel_pxy_reciprocal(
      dim_x, dim_y, number_of_pattern, h_dim,
      grid_and_thread_settings[ID_KERNEL_PXY_RECIPROCAL], display_debug);
```

#### kernel_pxy_reciprocal.h

```cpp
#ifndef KERNEL_PXY_RECIPROCAL
#define KERNEL_PXY_RECIPROCAL

#include <vector>

__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory,
                                      size_t max_idx);

void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
                                     size_t number_of_pattern, size_t h_dim,
                                     std::vector<size_t>& output,
                                     bool display_debug);

#endif /* KERNEL_PXY_RECIPROCAL */
```

#### kernel_pxy_reciprocal.cu

```cpp
#include <cassert>
#include <iostream>

#include "kernel_helper_functions.h"
#include "kernel_pxy_reciprocal.h"

__global__ void kernel_pxy_reciprocal(float* __restrict__ pxy_memory,
                                      size_t max_idx) {
  // One task per thread, mapped via the x dimension only.
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;

  if (idx < max_idx) {
    pxy_memory[idx] = 1.0f / pxy_memory[idx];
  }
}

void occupancy_kernel_pxy_reciprocal(size_t dim_x, size_t dim_y,
                                     size_t number_of_pattern, size_t h_dim,
                                     std::vector<size_t>& output,
                                     bool display_debug) {
  size_t max_threadable_tasks;
  cudaError_t status;

  int min_grid_size;
  int thread_block_size;
  int grid_size;

  max_threadable_tasks = number_of_pattern * dim_x * dim_y;

  // Let the CUDA runtime suggest a thread block size with good occupancy.
  status = cudaOccupancyMaxPotentialBlockSize(
      &min_grid_size, &thread_block_size, (void*)kernel_pxy_reciprocal, 0,
      max_threadable_tasks);
  assert((status == cudaSuccess));

  // Round up so that grid_size * thread_block_size covers all tasks.
  grid_size =
      (max_threadable_tasks + thread_block_size - 1) / thread_block_size;

  output.resize(7);
  output[0] = grid_size;
  output[1] = 1;
  output[2] = 1;
  output[3] = thread_block_size;
  output[4] = 1;
  output[5] = 1;
  output[6] = max_threadable_tasks;

  if (display_debug == true) {
    std::cout << "kernel_pxy_reciprocal:" << std::endl;
    kernel_debug_plot(output, display_debug);
  }
}
```

#### kernel_helper_functions.h

```cpp
#ifndef KERNEL_HELPER_FUNCTIONS
#define KERNEL_HELPER_FUNCTIONS

#include <vector>

void kernel_debug_plot(std::vector<size_t> output, bool display_debug);

#endif /* KERNEL_HELPER_FUNCTIONS */
```

#### kernel_helper_functions.cu

```cpp
#include <iostream>

#include "kernel_helper_functions.h"

void kernel_debug_plot(std::vector<size_t> output, bool display_debug) {
  if (display_debug == true) {
    std::cout << "grid x: " << output[0] << std::endl;
    std::cout << "grid y: " << output[1] << std::endl;
    std::cout << "grid z: " << output[2] << std::endl;
    std::cout << "thread block x: " << output[3] << std::endl;
    std::cout << "thread block y: " << output[4] << std::endl;
    std::cout << "thread block z: " << output[5] << std::endl;
    std::cout << "max_idx: " << output[6] << std::endl << std::endl;
  }

  return;
}
```
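Putting the pieces together, here is a minimal host-side sketch of the full flow: allocate device memory, let the occupancy helper pick the launch configuration, launch, synchronize, and free. The function name `reciprocal_on_gpu` and the elided copy steps are illustrative, not part of the repository.

```cpp
#include <cassert>
#include <vector>

#include "kernel_pxy_reciprocal.h"

// Hypothetical driver: take the reciprocal of every element of a
// buffer of number_of_pattern * dim_x * dim_y floats on the GPU.
void reciprocal_on_gpu(size_t dim_x, size_t dim_y, size_t number_of_pattern,
                       size_t h_dim, bool display_debug) {
  size_t number_of_elements = number_of_pattern * dim_x * dim_y;

  float* w_memory = nullptr;
  cudaError_t status =
      cudaMalloc((void**)&w_memory, number_of_elements * sizeof(float));
  assert((status == cudaSuccess));

  // ... fill w_memory, e.g. with cudaMemcpy from a host buffer ...

  // Let the occupancy helper pick grid and thread block sizes.
  std::vector<size_t> settings;
  occupancy_kernel_pxy_reciprocal(dim_x, dim_y, number_of_pattern, h_dim,
                                  settings, display_debug);

  kernel_pxy_reciprocal<<<dim3(settings[0], settings[1], settings[2]),
                          dim3(settings[3], settings[4], settings[5])>>>(
      w_memory, settings[6]);

  // The launch is asynchronous; wait before touching the result.
  status = cudaDeviceSynchronize();
  assert((status == cudaSuccess));

  // ... copy the result back to the host ...

  status = cudaFree(w_memory);
  assert((status == cudaSuccess));
}
```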