diff --git a/network/HDynamicLayer.py b/network/HDynamicLayer.py
index 9d00f80..33712c1 100644
--- a/network/HDynamicLayer.py
+++ b/network/HDynamicLayer.py
@@ -21,6 +21,8 @@ class HDynamicLayer(torch.nn.Module):
     device: torch.device
     default_dtype: torch.dtype
 
+    _force_forward_h_dynamic_on_cpu: bool
+
     def __init__(
         self,
         output_size: list[int],
@@ -32,6 +34,7 @@ class HDynamicLayer(torch.nn.Module):
         device: torch.device | None = None,
         default_dtype: torch.dtype | None = None,
         gpu_tuning_factor: int = 5,
+        force_forward_h_dynamic_on_cpu: bool = False,
     ) -> None:
         super().__init__()
 
@@ -46,11 +49,14 @@ class HDynamicLayer(torch.nn.Module):
         self._output_size = output_size
         self._output_layer = bool(output_layer)
         self._local_learning = bool(local_learning)
+        self._force_forward_h_dynamic_on_cpu = force_forward_h_dynamic_on_cpu
 
         global_sbs_gpu_setting.append(torch.tensor([0]))
         global_sbs_size.append(torch.tensor([0, 0, 0, 0]))
 
-        if device == torch.device("cpu"):
+        if (device == torch.device("cpu")) or (
+            self._force_forward_h_dynamic_on_cpu is True
+        ):
             global_sbs_hdynamic_cpp.append(HDynamicCNNCPU())
         else:
             global_sbs_hdynamic_cpp.append(HDynamicCNNGPU())
@@ -146,11 +152,6 @@ class FunctionalSbS(torch.autograd.Function):
 
         number_of_spikes: int = int(spikes.shape[1])
 
-        if input.device == torch.device("cpu"):
-            hdyn_number_of_cpu_processes: int = int(parameter_list[0])
-        else:
-            hdyn_number_of_cpu_processes = -1
-
         output_size_0: int = int(parameter_list[1])
         output_size_1: int = int(parameter_list[2])
         gpu_tuning_factor: int = int(parameter_list[3])
@@ -158,6 +159,30 @@ class FunctionalSbS(torch.autograd.Function):
         sbs_gpu_setting_position = int(parameter_list[4])
         sbs_hdynamic_cpp_position = int(parameter_list[5])
 
+        if (
+            isinstance(
+                global_sbs_hdynamic_cpp[sbs_hdynamic_cpp_position], HDynamicCNNCPU
+            )
+            is True
+        ):
+            are_we_on_a_cpu: bool = True
+            work_device: torch.device = torch.device("cpu")
+        else:
+            are_we_on_a_cpu = False
+            work_device = input.device
+
+        target_device: torch.device = input.device
+
+        if target_device == work_device:
+            data_is_on_the_same_device: bool = True
+        else:
+            data_is_on_the_same_device = False
+
+        if are_we_on_a_cpu is True:
+            hdyn_number_of_cpu_processes: int = int(parameter_list[0])
+        else:
+            hdyn_number_of_cpu_processes = -1
+
         # ###########################################################
         # H dynamic
         # ###########################################################
@@ -169,7 +194,7 @@ class FunctionalSbS(torch.autograd.Function):
         # Make space for the results
         # ############################################
 
-        output = torch.empty(
+        output_work: torch.Tensor = torch.empty(
             (
                 int(input.shape[0]),
                 int(weights.shape[1]),
@@ -177,17 +202,43 @@ class FunctionalSbS(torch.autograd.Function):
                 output_size_1,
             ),
             dtype=input.dtype,
-            device=input.device,
+            device=work_device,
         )
-        assert output.is_contiguous() is True
+        assert output_work.is_contiguous() is True
 
         if epsilon_xy is not None:
             assert epsilon_xy.is_contiguous() is True
             assert epsilon_xy.ndim == 3
+            if data_is_on_the_same_device is False:
+                epsilon_xy_work = epsilon_xy.to(work_device)
+            else:
+                epsilon_xy_work = epsilon_xy
+        else:
+            epsilon_xy_work = None
+
         assert epsilon_t_0.is_contiguous() is True
+        if data_is_on_the_same_device is False:
+            epsilon_t_0_work = epsilon_t_0.to(work_device)
+        else:
+            epsilon_t_0_work = epsilon_t_0
+
         assert weights.is_contiguous() is True
+        if data_is_on_the_same_device is False:
+            weights_work = weights.to(work_device)
+        else:
+            weights_work = weights
+
         assert spikes.is_contiguous() is True
+        if data_is_on_the_same_device is False:
+            spikes_work = spikes.to(work_device)
+        else:
+            spikes_work = spikes
+
         assert h_initial.is_contiguous() is True
+        if data_is_on_the_same_device is False:
+            h_initial_work = h_initial.to(work_device)
+        else:
+            h_initial_work = h_initial
 
         assert weights.ndim == 2
         assert h_initial.ndim == 1
@@ -196,32 +247,32 @@ class FunctionalSbS(torch.autograd.Function):
         sbs_size = global_sbs_size[sbs_gpu_setting_position].clone()
 
-        if input.device != torch.device("cpu"):
+        if are_we_on_a_cpu is False:
             if (
                 (sbs_profile.numel() == 1)
-                or (sbs_size[0] != int(output.shape[0]))
-                or (sbs_size[1] != int(output.shape[1]))
-                or (sbs_size[2] != int(output.shape[2]))
-                or (sbs_size[3] != int(output.shape[3]))
+                or (sbs_size[0] != int(output_work.shape[0]))
+                or (sbs_size[1] != int(output_work.shape[1]))
+                or (sbs_size[2] != int(output_work.shape[2]))
+                or (sbs_size[3] != int(output_work.shape[3]))
             ):
                 sbs_profile = torch.zeros(
                     (14, 7), dtype=torch.int64, device=torch.device("cpu")
                 )
 
                 global_sbs_hdynamic_cpp[sbs_hdynamic_cpp_position].gpu_occupancy_export(
-                    int(output.shape[2]),
-                    int(output.shape[3]),
-                    int(output.shape[0]),
-                    int(output.shape[1]),
+                    int(output_work.shape[2]),
+                    int(output_work.shape[3]),
+                    int(output_work.shape[0]),
+                    int(output_work.shape[1]),
                     sbs_profile.data_ptr(),
                     int(sbs_profile.shape[0]),
                     int(sbs_profile.shape[1]),
                 )
                 global_sbs_gpu_setting[sbs_gpu_setting_position] = sbs_profile.clone()
-                sbs_size[0] = int(output.shape[0])
-                sbs_size[1] = int(output.shape[1])
-                sbs_size[2] = int(output.shape[2])
-                sbs_size[3] = int(output.shape[3])
+                sbs_size[0] = int(output_work.shape[0])
+                sbs_size[1] = int(output_work.shape[1])
+                sbs_size[2] = int(output_work.shape[2])
+                sbs_size[3] = int(output_work.shape[3])
                 global_sbs_size[sbs_gpu_setting_position] = sbs_size.clone()
 
             else:
@@ -232,32 +283,41 @@ class FunctionalSbS(torch.autograd.Function):
                )
 
         global_sbs_hdynamic_cpp[sbs_hdynamic_cpp_position].update(
-            output.data_ptr(),
-            int(output.shape[0]),
-            int(output.shape[1]),
-            int(output.shape[2]),
-            int(output.shape[3]),
-            epsilon_xy.data_ptr() if epsilon_xy is not None else int(0),
-            int(epsilon_xy.shape[0]) if epsilon_xy is not None else int(0),
-            int(epsilon_xy.shape[1]) if epsilon_xy is not None else int(0),
-            int(epsilon_xy.shape[2]) if epsilon_xy is not None else int(0),
-            epsilon_t_0.data_ptr(),
-            int(epsilon_t_0.shape[0]),
-            weights.data_ptr(),
-            int(weights.shape[0]),
-            int(weights.shape[1]),
-            spikes.data_ptr(),
-            int(spikes.shape[0]),
-            int(spikes.shape[1]),
-            int(spikes.shape[2]),
-            int(spikes.shape[3]),
-            h_initial.data_ptr(),
-            int(h_initial.shape[0]),
+            output_work.data_ptr(),
+            int(output_work.shape[0]),
+            int(output_work.shape[1]),
+            int(output_work.shape[2]),
+            int(output_work.shape[3]),
+            epsilon_xy_work.data_ptr() if epsilon_xy_work is not None else int(0),
+            int(epsilon_xy_work.shape[0]) if epsilon_xy_work is not None else int(0),
+            int(epsilon_xy_work.shape[1]) if epsilon_xy_work is not None else int(0),
+            int(epsilon_xy_work.shape[2]) if epsilon_xy_work is not None else int(0),
+            epsilon_t_0_work.data_ptr(),
+            int(epsilon_t_0_work.shape[0]),
+            weights_work.data_ptr(),
+            int(weights_work.shape[0]),
+            int(weights_work.shape[1]),
+            spikes_work.data_ptr(),
+            int(spikes_work.shape[0]),
+            int(spikes_work.shape[1]),
+            int(spikes_work.shape[2]),
+            int(spikes_work.shape[3]),
+            h_initial_work.data_ptr(),
+            int(h_initial_work.shape[0]),
             hdyn_number_of_cpu_processes,
             float(forgetting_offset.cpu().item()),
             int(gpu_tuning_factor),
         )
+        if data_is_on_the_same_device is False:
+            output = output_work.to(target_device)
+        else:
+            output = output_work
+
+        # print(output)
+        # print(output.sum(dim=1))
+        # print(output.sum(dim=1).shape)
+        # exit()
         # ###########################################################
         # Save the necessary data for the backward pass
         # ###########################################################
diff --git a/network/Parameter.py b/network/Parameter.py
index b9e3924..8847b4e 100644
--- a/network/Parameter.py
+++ b/network/Parameter.py
@@ -142,6 +142,9 @@ class Config:
     epsilon_0: float = field(default=1.0)
     forgetting_offset: float = field(default=-1.0)
 
+    force_forward_h_dynamic_on_cpu: bool = field(default=True)
+    spike_full_layer_input_distribution: list[bool] = field(default_factory=list)
+
     def __post_init__(self) -> None:
         """Post init determines the number of cores.
         Creates the required directory and gives us an optimized
diff --git a/network/SbSLayer.py b/network/SbSLayer.py
index c4db2a8..ae593a6 100644
--- a/network/SbSLayer.py
+++ b/network/SbSLayer.py
@@ -52,7 +52,9 @@ class SbSLayer(torch.nn.Module):
     _reduction_cooldown: float = 1.0
     _layer_id: int = -1
 
-    spike_full_layer_input_distribution: bool = False
+    _spike_full_layer_input_distribution: bool
+
+    _force_forward_h_dynamic_on_cpu: bool
 
     def __init__(
         self,
@@ -81,6 +83,10 @@ class SbSLayer(torch.nn.Module):
         layer_id: int = -1,
         cooldown_after_number_of_spikes: int = -1,
         reduction_cooldown: float = 1.0,
+        force_forward_h_dynamic_on_cpu: bool = True,
+        spike_full_layer_input_distribution: bool = False,
+        force_forward_spike_on_cpu: bool = False,
+        force_forward_spike_output_on_cpu: bool = False,
     ) -> None:
         super().__init__()
 
@@ -109,6 +115,8 @@ class SbSLayer(torch.nn.Module):
         self.reduction_cooldown = float(reduction_cooldown)
         self._layer_id = layer_id
         self._epsilon_xy_use = epsilon_xy_use
+        self._force_forward_h_dynamic_on_cpu = force_forward_h_dynamic_on_cpu
+        self._spike_full_layer_input_distribution = spike_full_layer_input_distribution
 
         assert len(input_size) == 2
         self._input_size = input_size
@@ -140,6 +148,8 @@ class SbSLayer(torch.nn.Module):
             number_of_spikes=self._number_of_spikes,
             number_of_cpu_processes=self._number_of_cpu_processes,
             device=self.device,
+            force_forward_spike_on_cpu=force_forward_spike_on_cpu,
+            force_forward_spike_output_on_cpu=force_forward_spike_output_on_cpu,
         )
 
         self.h_dynamic = HDynamicLayer(
@@ -152,6 +162,7 @@ class SbSLayer(torch.nn.Module):
             device=device,
             default_dtype=self.default_dtype,
             gpu_tuning_factor=gpu_tuning_factor,
+            force_forward_h_dynamic_on_cpu=self._force_forward_h_dynamic_on_cpu,
         )
 
         assert len(input_size) >= 2
@@ -169,10 +180,6 @@ class SbSLayer(torch.nn.Module):
                 number_of_cpu_processes=number_of_cpu_processes,
             )
 
-        # TODO: TEST
-        if layer_id == 0:
-            self.spike_full_layer_input_distribution = True
-
         # ###############################################################
         # Initialize the weights
         # ###############################################################
@@ -438,7 +445,7 @@ class SbSLayer(torch.nn.Module):
         else:
             assert self._epsilon_xy is None
 
-        if self.spike_full_layer_input_distribution is False:
+        if self._spike_full_layer_input_distribution is False:
             spike = self.spike_generator(input_convolved, int(self._number_of_spikes))
         else:
             input_shape = input.shape
@@ -457,7 +464,9 @@ class SbSLayer(torch.nn.Module):
                     (input_shape[0], input_shape[1], input_shape[2], input_shape[3])
                 )
             )
-            spike = self.spikes_sorter(spike_unsorted).to(device=input_convolved.device)
+            spike = self.spikes_sorter(spike_unsorted)
+            if self._force_forward_h_dynamic_on_cpu is False:
+                spike = spike.to(device=input_convolved.device)
 
         output = self.h_dynamic(
             input=input_convolved,
diff --git a/network/SpikeLayer.py b/network/SpikeLayer.py
index 25a86d6..82b7d42 100644
--- a/network/SpikeLayer.py
+++ b/network/SpikeLayer.py
@@ -15,12 +15,16 @@ class SpikeLayer(torch.nn.Module):
     _number_of_cpu_processes: int
     _number_of_spikes: int
     device: torch.device
+    _force_forward_spike_on_cpu: bool
+    _force_forward_spike_output_on_cpu: bool
 
     def __init__(
         self,
         number_of_spikes: int = -1,
         number_of_cpu_processes: int = 1,
         device: torch.device | None = None,
+        force_forward_spike_on_cpu: bool = False,
+        force_forward_spike_output_on_cpu: bool = False,
     ) -> None:
         super().__init__()
 
@@ -29,11 +33,15 @@ class SpikeLayer(torch.nn.Module):
 
         self._number_of_cpu_processes = number_of_cpu_processes
         self._number_of_spikes = number_of_spikes
+        self._force_forward_spike_on_cpu = force_forward_spike_on_cpu
+        self._force_forward_spike_output_on_cpu = force_forward_spike_output_on_cpu
 
         global_spike_generation_gpu_setting.append(torch.tensor([0]))
         global_spike_size.append(torch.tensor([0, 0, 0, 0]))
 
-        if device == torch.device("cpu"):
+        if (device == torch.device("cpu")) or (
+            self._force_forward_spike_on_cpu is True
+        ):
             global_spike_generation_cpp.append(SpikeGenerationCPU())
         else:
             global_spike_generation_cpp.append(SpikeGenerationGPU())
@@ -66,6 +74,7 @@ class SpikeLayer(torch.nn.Module):
                 int(self._spike_generation_cpp_position),  # 1
                 int(self._spike_generation_gpu_setting_position),  # 2
                 int(number_of_spikes),  # 3
+                int(self._force_forward_spike_output_on_cpu),  # 4
             ],
             dtype=torch.int64,
         )
@@ -83,14 +92,35 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
 
         assert input.dim() == 4
 
-        if input.device == torch.device("cpu"):
-            spike_number_of_cpu_processes: int = int(parameter_list[0])
-        else:
-            spike_number_of_cpu_processes = -1
-
         spike_generation_cpp_position = int(parameter_list[1])
         spike_generation_gpu_setting_position = int(parameter_list[2])
         number_of_spikes: int = int(parameter_list[3])
+        force_forward_spike_output_on_cpu: bool = bool(parameter_list[4])
+
+        if (
+            isinstance(
+                global_spike_generation_cpp[spike_generation_cpp_position],
+                SpikeGenerationCPU,
+            )
+            is True
+        ):
+            are_we_on_a_cpu: bool = True
+            work_device: torch.device = torch.device("cpu")
+        else:
+            are_we_on_a_cpu = False
+            work_device = input.device
+
+        target_device: torch.device = input.device
+
+        if target_device == work_device:
+            data_is_on_the_same_device: bool = True
+        else:
+            data_is_on_the_same_device = False
+
+        if are_we_on_a_cpu is True:
+            spike_number_of_cpu_processes: int = int(parameter_list[0])
+        else:
+            spike_number_of_cpu_processes = -1
 
         # ###########################################################
         # Spike generation
         # ###########################################################
@@ -100,7 +130,12 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
         # Normalized cumsum
         # (beware of the pytorch bug! Thus .clone()!)
         # ############################################
-        input_cumsum: torch.Tensor = torch.cumsum(input, dim=1, dtype=input.dtype)
+        if data_is_on_the_same_device is False:
+            input_work = input.to(work_device)
+        else:
+            input_work = input
+        # input_work = input
+        input_cumsum: torch.Tensor = torch.cumsum(input_work, dim=1, dtype=input.dtype)
         input_cumsum_last: torch.Tensor = input_cumsum[:, -1, :, :].unsqueeze(1).clone()
         input_cumsum /= input_cumsum_last
@@ -115,17 +150,19 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
                 input_cumsum.shape[3],
             ],
             dtype=input.dtype,
-            device=input.device,
+            device=work_device,
         )
 
         # ############################################
         # Make space for the results
         # ############################################
-        spikes = torch.empty_like(random_values, dtype=torch.int64, device=input.device)
+        spikes_work = torch.empty_like(
+            random_values, dtype=torch.int64, device=work_device
+        )
 
         assert input_cumsum.is_contiguous() is True
         assert random_values.is_contiguous() is True
-        assert spikes.is_contiguous() is True
+        assert spikes_work.is_contiguous() is True
 
         # time_start: float = time.perf_counter()
         spike_generation_profile = global_spike_generation_gpu_setting[
             spike_generation_gpu_setting_position
         ]
 
         spike_generation_size = global_spike_size[
             spike_generation_gpu_setting_position
         ].clone()
 
-        if (
-            isinstance(
-                global_spike_generation_cpp[spike_generation_cpp_position],
-                SpikeGenerationGPU,
-            )
-            is True
-        ):
+        if are_we_on_a_cpu is False:
             if (
                 (spike_generation_profile.numel() == 1)
-                or (spike_generation_size[0] != int(spikes.shape[0]))
-                or (spike_generation_size[1] != int(spikes.shape[1]))
-                or (spike_generation_size[2] != int(spikes.shape[2]))
-                or (spike_generation_size[3] != int(spikes.shape[3]))
+                or (spike_generation_size[0] != int(spikes_work.shape[0]))
+                or (spike_generation_size[1] != int(spikes_work.shape[1]))
+                or (spike_generation_size[2] != int(spikes_work.shape[2]))
+                or (spike_generation_size[3] != int(spikes_work.shape[3]))
             ):
                 spike_generation_profile = torch.zeros(
@@ -157,10 +188,10 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
                    (2, 7), dtype=torch.int64, device=torch.device("cpu")
                )
                global_spike_generation_cpp[
                    spike_generation_cpp_position
                ].gpu_occupancy_export(
-                    int(spikes.shape[2]),
-                    int(spikes.shape[3]),
-                    int(spikes.shape[0]),
-                    int(spikes.shape[1]),
+                    int(spikes_work.shape[2]),
+                    int(spikes_work.shape[3]),
+                    int(spikes_work.shape[0]),
+                    int(spikes_work.shape[1]),
                     spike_generation_profile.data_ptr(),
                     int(spike_generation_profile.shape[0]),
                     int(spike_generation_profile.shape[1]),
@@ -169,10 +200,10 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
                     spike_generation_gpu_setting_position
                 ] = spike_generation_profile.clone()
-                spike_generation_size[0] = int(spikes.shape[0])
-                spike_generation_size[1] = int(spikes.shape[1])
-                spike_generation_size[2] = int(spikes.shape[2])
-                spike_generation_size[3] = int(spikes.shape[3])
+                spike_generation_size[0] = int(spikes_work.shape[0])
+                spike_generation_size[1] = int(spikes_work.shape[1])
+                spike_generation_size[2] = int(spikes_work.shape[2])
+                spike_generation_size[3] = int(spikes_work.shape[3])
                 global_spike_size[
                     spike_generation_gpu_setting_position
                 ] = spike_generation_size.clone()
@@ -197,15 +228,20 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
             int(random_values.shape[1]),
             int(random_values.shape[2]),
             int(random_values.shape[3]),
-            spikes.data_ptr(),
-            int(spikes.shape[0]),
-            int(spikes.shape[1]),
-            int(spikes.shape[2]),
-            int(spikes.shape[3]),
+            spikes_work.data_ptr(),
+            int(spikes_work.shape[0]),
+            int(spikes_work.shape[1]),
+            int(spikes_work.shape[2]),
+            int(spikes_work.shape[3]),
             int(spike_number_of_cpu_processes),
         )
-        del random_values
-        del input_cumsum
+
+        if (force_forward_spike_output_on_cpu is True) and (are_we_on_a_cpu is True):
+            spikes = spikes_work
+        elif data_is_on_the_same_device is False:
+            spikes = spikes_work.to(target_device)
+        else:
+            spikes = spikes_work
 
         return spikes
diff --git a/network/build_network.py b/network/build_network.py
index 532116e..ed95395 100644
--- a/network/build_network.py
+++ b/network/build_network.py
@@ -120,6 +120,12 @@ def build_network(
                 cfg.learning_parameters.sbs_skip_gradient_calculation[0]
             )
 
+        spike_full_layer_input_distribution: bool = False
+        if len(cfg.spike_full_layer_input_distribution) > layer_id:
+            spike_full_layer_input_distribution = (
+                cfg.spike_full_layer_input_distribution[layer_id]
+            )
+
         # #############################################################
         # SbS layer:
         # #############################################################
@@ -138,7 +144,10 @@ def build_network(
             assert number_of_spikes > 0
 
             logging.info(
-                f"Layer: {layer_id} -> SbS Layer with {number_of_spikes} spikes"
+                (
+                    f"Layer: {layer_id} -> SbS Layer with {number_of_spikes} spikes "
+                    f"-- draw spike from full layer: {spike_full_layer_input_distribution}"
+                )
             )
             is_pooling_layer: bool = False
             if cfg.network_structure.layer_type[layer_id].upper().find("POOLING") != -1:
@@ -169,6 +178,8 @@ def build_network(
                     layer_id=layer_id,
                    cooldown_after_number_of_spikes=cfg.cooldown_after_number_of_spikes,
                     reduction_cooldown=cfg.reduction_cooldown,
+                    force_forward_h_dynamic_on_cpu=cfg.force_forward_h_dynamic_on_cpu,
+                    spike_full_layer_input_distribution=spike_full_layer_input_distribution,
                 )
             )
             # Adding the x,y output dimensions
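
The device handling added in FunctionalSbS.forward and FunctionalSpikeGeneration.forward above follows one pattern: derive the work device from the type of the registered C++ extension, move the operands there only when they are not already on it, run the kernel, and copy the result back to the device of the original input. The snippet below is a minimal sketch of that round trip, not part of the patch; the helper name is hypothetical and only plain PyTorch is assumed.

import torch


def _illustrative_work_device_roundtrip(
    input_tensor: torch.Tensor,
    kernel_is_cpu: bool,
) -> torch.Tensor:
    # Illustrative only: mirrors the work/target device handling of
    # FunctionalSbS.forward and FunctionalSpikeGeneration.forward.
    work_device = torch.device("cpu") if kernel_is_cpu else input_tensor.device
    target_device = input_tensor.device
    data_is_on_the_same_device = target_device == work_device

    # Move the input to the work device only when necessary.
    if data_is_on_the_same_device is False:
        input_work = input_tensor.to(work_device)
    else:
        input_work = input_tensor

    # Placeholder for the C++ kernel call (HDynamicCNNCPU / HDynamicCNNGPU,
    # SpikeGenerationCPU / SpikeGenerationGPU) that writes into a tensor
    # allocated on work_device.
    output_work = input_work.clone()

    # Copy the result back to the device of the original input if needed.
    if data_is_on_the_same_device is False:
        return output_work.to(target_device)
    return output_work

With force_forward_h_dynamic_on_cpu or force_forward_spike_on_cpu set, the CPU extension is registered even when the layer runs on a GPU, so kernel_is_cpu in the sketch would be True and the tensors take this CPU round trip; force_forward_spike_output_on_cpu additionally keeps the generated spikes on the CPU instead of copying them back.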