diff --git a/network/HDynamicLayer.py b/network/HDynamicLayer.py
index 9d00f80..33712c1 100644
--- a/network/HDynamicLayer.py
+++ b/network/HDynamicLayer.py
@@ -21,6 +21,8 @@ class HDynamicLayer(torch.nn.Module):
     device: torch.device
     default_dtype: torch.dtype
 
+    _force_forward_h_dynamic_on_cpu: bool
+
     def __init__(
         self,
         output_size: list[int],
@@ -32,6 +34,7 @@ class HDynamicLayer(torch.nn.Module):
         device: torch.device | None = None,
         default_dtype: torch.dtype | None = None,
         gpu_tuning_factor: int = 5,
+        force_forward_h_dynamic_on_cpu: bool = False,
     ) -> None:
         super().__init__()
 
@@ -46,11 +49,14 @@ class HDynamicLayer(torch.nn.Module):
         self._output_size = output_size
         self._output_layer = bool(output_layer)
         self._local_learning = bool(local_learning)
+        self._force_forward_h_dynamic_on_cpu = force_forward_h_dynamic_on_cpu
 
         global_sbs_gpu_setting.append(torch.tensor([0]))
         global_sbs_size.append(torch.tensor([0, 0, 0, 0]))
 
-        if device == torch.device("cpu"):
+        if (device == torch.device("cpu")) or (
+            self._force_forward_h_dynamic_on_cpu is True
+        ):
             global_sbs_hdynamic_cpp.append(HDynamicCNNCPU())
         else:
             global_sbs_hdynamic_cpp.append(HDynamicCNNGPU())
@@ -146,11 +152,6 @@ class FunctionalSbS(torch.autograd.Function):
 
         number_of_spikes: int = int(spikes.shape[1])
 
-        if input.device == torch.device("cpu"):
-            hdyn_number_of_cpu_processes: int = int(parameter_list[0])
-        else:
-            hdyn_number_of_cpu_processes = -1
-
         output_size_0: int = int(parameter_list[1])
         output_size_1: int = int(parameter_list[2])
         gpu_tuning_factor: int = int(parameter_list[3])
@@ -158,6 +159,30 @@ class FunctionalSbS(torch.autograd.Function):
         sbs_gpu_setting_position = int(parameter_list[4])
         sbs_hdynamic_cpp_position = int(parameter_list[5])
 
+        if (
+            isinstance(
+                global_sbs_hdynamic_cpp[sbs_hdynamic_cpp_position], HDynamicCNNCPU
+            )
+            is True
+        ):
+            are_we_on_a_cpu: bool = True
+            work_device: torch.device = torch.device("cpu")
+        else:
+            are_we_on_a_cpu = False
+            work_device = input.device
+
+        target_device: torch.device = input.device
+
+        if target_device == work_device:
+            data_is_on_the_same_device: bool = True
+        else:
+            data_is_on_the_same_device = False
+
+        if are_we_on_a_cpu is True:
+            hdyn_number_of_cpu_processes: int = int(parameter_list[0])
+        else:
+            hdyn_number_of_cpu_processes = -1
+
         # ###########################################################
         # H dynamic
         # ###########################################################
@@ -169,7 +194,7 @@ class FunctionalSbS(torch.autograd.Function):
         # Make space for the results
         # ############################################
 
-        output = torch.empty(
+        output_work: torch.Tensor = torch.empty(
             (
                 int(input.shape[0]),
                 int(weights.shape[1]),
@@ -177,17 +202,43 @@ class FunctionalSbS(torch.autograd.Function):
                 output_size_1,
             ),
             dtype=input.dtype,
-            device=input.device,
+            device=work_device,
         )
-        assert output.is_contiguous() is True
+        assert output_work.is_contiguous() is True
 
         if epsilon_xy is not None:
             assert epsilon_xy.is_contiguous() is True
             assert epsilon_xy.ndim == 3
+            if data_is_on_the_same_device is False:
+                epsilon_xy_work = epsilon_xy.to(work_device)
+            else:
+                epsilon_xy_work = epsilon_xy
+        else:
+            epsilon_xy_work = None
+
         assert epsilon_t_0.is_contiguous() is True
+        if data_is_on_the_same_device is False:
+            epsilon_t_0_work = epsilon_t_0.to(work_device)
+        else:
+            epsilon_t_0_work = epsilon_t_0
+
         assert weights.is_contiguous() is True
+        if data_is_on_the_same_device is False:
+            weights_work = weights.to(work_device)
+        else:
+            weights_work = weights
+
         assert spikes.is_contiguous() is True
+        if data_is_on_the_same_device is False:
+            spikes_work = spikes.to(work_device)
+        else:
+            spikes_work = spikes
+
         assert h_initial.is_contiguous() is True
+        if data_is_on_the_same_device is False:
+            h_initial_work = h_initial.to(work_device)
+        else:
+            h_initial_work = h_initial
 
         assert weights.ndim == 2
         assert h_initial.ndim == 1
@@ -196,32 +247,32 @@ class FunctionalSbS(torch.autograd.Function):
         sbs_size = global_sbs_size[sbs_gpu_setting_position].clone()
 
-        if input.device != torch.device("cpu"):
+        if are_we_on_a_cpu is False:
             if (
                 (sbs_profile.numel() == 1)
-                or (sbs_size[0] != int(output.shape[0]))
-                or (sbs_size[1] != int(output.shape[1]))
-                or (sbs_size[2] != int(output.shape[2]))
-                or (sbs_size[3] != int(output.shape[3]))
+                or (sbs_size[0] != int(output_work.shape[0]))
+                or (sbs_size[1] != int(output_work.shape[1]))
+                or (sbs_size[2] != int(output_work.shape[2]))
+                or (sbs_size[3] != int(output_work.shape[3]))
             ):
                 sbs_profile = torch.zeros(
                     (14, 7), dtype=torch.int64, device=torch.device("cpu")
                 )
 
                 global_sbs_hdynamic_cpp[sbs_hdynamic_cpp_position].gpu_occupancy_export(
-                    int(output.shape[2]),
-                    int(output.shape[3]),
-                    int(output.shape[0]),
-                    int(output.shape[1]),
+                    int(output_work.shape[2]),
+                    int(output_work.shape[3]),
+                    int(output_work.shape[0]),
+                    int(output_work.shape[1]),
                     sbs_profile.data_ptr(),
                     int(sbs_profile.shape[0]),
                     int(sbs_profile.shape[1]),
                 )
                 global_sbs_gpu_setting[sbs_gpu_setting_position] = sbs_profile.clone()
-                sbs_size[0] = int(output.shape[0])
-                sbs_size[1] = int(output.shape[1])
-                sbs_size[2] = int(output.shape[2])
-                sbs_size[3] = int(output.shape[3])
+                sbs_size[0] = int(output_work.shape[0])
+                sbs_size[1] = int(output_work.shape[1])
+                sbs_size[2] = int(output_work.shape[2])
+                sbs_size[3] = int(output_work.shape[3])
                 global_sbs_size[sbs_gpu_setting_position] = sbs_size.clone()
 
             else:
@@ -232,32 +283,41 @@ class FunctionalSbS(torch.autograd.Function):
                )
 
         global_sbs_hdynamic_cpp[sbs_hdynamic_cpp_position].update(
-            output.data_ptr(),
-            int(output.shape[0]),
-            int(output.shape[1]),
-            int(output.shape[2]),
-            int(output.shape[3]),
-            epsilon_xy.data_ptr() if epsilon_xy is not None else int(0),
-            int(epsilon_xy.shape[0]) if epsilon_xy is not None else int(0),
-            int(epsilon_xy.shape[1]) if epsilon_xy is not None else int(0),
-            int(epsilon_xy.shape[2]) if epsilon_xy is not None else int(0),
-            epsilon_t_0.data_ptr(),
-            int(epsilon_t_0.shape[0]),
-            weights.data_ptr(),
-            int(weights.shape[0]),
-            int(weights.shape[1]),
-            spikes.data_ptr(),
-            int(spikes.shape[0]),
-            int(spikes.shape[1]),
-            int(spikes.shape[2]),
-            int(spikes.shape[3]),
-            h_initial.data_ptr(),
-            int(h_initial.shape[0]),
+            output_work.data_ptr(),
+            int(output_work.shape[0]),
+            int(output_work.shape[1]),
+            int(output_work.shape[2]),
+            int(output_work.shape[3]),
+            epsilon_xy_work.data_ptr() if epsilon_xy_work is not None else int(0),
+            int(epsilon_xy_work.shape[0]) if epsilon_xy_work is not None else int(0),
+            int(epsilon_xy_work.shape[1]) if epsilon_xy_work is not None else int(0),
+            int(epsilon_xy_work.shape[2]) if epsilon_xy_work is not None else int(0),
+            epsilon_t_0_work.data_ptr(),
+            int(epsilon_t_0_work.shape[0]),
+            weights_work.data_ptr(),
+            int(weights_work.shape[0]),
+            int(weights_work.shape[1]),
+            spikes_work.data_ptr(),
+            int(spikes_work.shape[0]),
+            int(spikes_work.shape[1]),
+            int(spikes_work.shape[2]),
+            int(spikes_work.shape[3]),
+            h_initial_work.data_ptr(),
+            int(h_initial_work.shape[0]),
             hdyn_number_of_cpu_processes,
             float(forgetting_offset.cpu().item()),
             int(gpu_tuning_factor),
         )
+        if data_is_on_the_same_device is False:
+            output = output_work.to(target_device)
+        else:
+            output = output_work
+
+        # print(output)
+        # print(output.sum(dim=1))
+        # print(output.sum(dim=1).shape)
+        # exit()
         # ###########################################################
         # Save the necessary data for the backward pass
         # ###########################################################
diff --git a/network/Parameter.py b/network/Parameter.py
index b9e3924..8847b4e 100644
--- a/network/Parameter.py
+++ b/network/Parameter.py
@@ -142,6 +142,9 @@ class Config:
     epsilon_0: float = field(default=1.0)
     forgetting_offset: float = field(default=-1.0)
 
+    force_forward_h_dynamic_on_cpu: bool = field(default=True)
+    spike_full_layer_input_distribution: list[bool] = field(default_factory=list)
+
     def __post_init__(self) -> None:
         """Post init determines the number of cores.
         Creates the required directory and gives us an optimized
diff --git a/network/SbSLayer.py b/network/SbSLayer.py
index c4db2a8..ae593a6 100644
--- a/network/SbSLayer.py
+++ b/network/SbSLayer.py
@@ -52,7 +52,9 @@ class SbSLayer(torch.nn.Module):
     _reduction_cooldown: float = 1.0
     _layer_id: int = -1
 
-    spike_full_layer_input_distribution: bool = False
+    _spike_full_layer_input_distribution: bool
+
+    _force_forward_h_dynamic_on_cpu: bool
 
     def __init__(
         self,
@@ -81,6 +83,10 @@ class SbSLayer(torch.nn.Module):
         layer_id: int = -1,
         cooldown_after_number_of_spikes: int = -1,
         reduction_cooldown: float = 1.0,
+        force_forward_h_dynamic_on_cpu: bool = True,
+        spike_full_layer_input_distribution: bool = False,
+        force_forward_spike_on_cpu: bool = False,
+        force_forward_spike_output_on_cpu: bool = False,
     ) -> None:
         super().__init__()
 
@@ -109,6 +115,8 @@ class SbSLayer(torch.nn.Module):
         self.reduction_cooldown = float(reduction_cooldown)
         self._layer_id = layer_id
         self._epsilon_xy_use = epsilon_xy_use
+        self._force_forward_h_dynamic_on_cpu = force_forward_h_dynamic_on_cpu
+        self._spike_full_layer_input_distribution = spike_full_layer_input_distribution
 
         assert len(input_size) == 2
         self._input_size = input_size
@@ -140,6 +148,8 @@ class SbSLayer(torch.nn.Module):
             number_of_spikes=self._number_of_spikes,
             number_of_cpu_processes=self._number_of_cpu_processes,
             device=self.device,
+            force_forward_spike_on_cpu=force_forward_spike_on_cpu,
+            force_forward_spike_output_on_cpu=force_forward_spike_output_on_cpu,
         )
 
         self.h_dynamic = HDynamicLayer(
@@ -152,6 +162,7 @@ class SbSLayer(torch.nn.Module):
             device=device,
             default_dtype=self.default_dtype,
             gpu_tuning_factor=gpu_tuning_factor,
+            force_forward_h_dynamic_on_cpu=self._force_forward_h_dynamic_on_cpu,
         )
 
         assert len(input_size) >= 2
@@ -169,10 +180,6 @@ class SbSLayer(torch.nn.Module):
                 number_of_cpu_processes=number_of_cpu_processes,
             )
 
-        # TODO: TEST
-        if layer_id == 0:
-            self.spike_full_layer_input_distribution = True
-
         # ###############################################################
         # Initialize the weights
         # ###############################################################
@@ -438,7 +445,7 @@ class SbSLayer(torch.nn.Module):
         else:
             assert self._epsilon_xy is None
 
-        if self.spike_full_layer_input_distribution is False:
+        if self._spike_full_layer_input_distribution is False:
             spike = self.spike_generator(input_convolved, int(self._number_of_spikes))
         else:
             input_shape = input.shape
@@ -457,7 +464,9 @@ class SbSLayer(torch.nn.Module):
                     (input_shape[0], input_shape[1], input_shape[2], input_shape[3])
                 )
             )
-            spike = self.spikes_sorter(spike_unsorted).to(device=input_convolved.device)
+            spike = self.spikes_sorter(spike_unsorted)
+            if self._force_forward_h_dynamic_on_cpu is False:
+                spike = spike.to(device=input_convolved.device)
 
         output = self.h_dynamic(
             input=input_convolved,
diff --git a/network/SpikeLayer.py b/network/SpikeLayer.py
index 25a86d6..82b7d42 100644
--- a/network/SpikeLayer.py
+++ b/network/SpikeLayer.py
@@ -15,12 +15,16 @@ class SpikeLayer(torch.nn.Module):
     _number_of_cpu_processes: int
     _number_of_spikes: int
     device: torch.device
+    _force_forward_spike_on_cpu: bool
+    _force_forward_spike_output_on_cpu: bool
 
     def __init__(
         self,
         number_of_spikes: int = -1,
         number_of_cpu_processes: int = 1,
         device: torch.device | None = None,
+        force_forward_spike_on_cpu: bool = False,
+        force_forward_spike_output_on_cpu: bool = False,
     ) -> None:
         super().__init__()
 
@@ -29,11 +33,15 @@ class SpikeLayer(torch.nn.Module):
 
         self._number_of_cpu_processes = number_of_cpu_processes
         self._number_of_spikes = number_of_spikes
+        self._force_forward_spike_on_cpu = force_forward_spike_on_cpu
+        self._force_forward_spike_output_on_cpu = force_forward_spike_output_on_cpu
 
         global_spike_generation_gpu_setting.append(torch.tensor([0]))
         global_spike_size.append(torch.tensor([0, 0, 0, 0]))
 
-        if device == torch.device("cpu"):
+        if (device == torch.device("cpu")) or (
+            self._force_forward_spike_on_cpu is True
+        ):
             global_spike_generation_cpp.append(SpikeGenerationCPU())
         else:
             global_spike_generation_cpp.append(SpikeGenerationGPU())
@@ -66,6 +74,7 @@ class SpikeLayer(torch.nn.Module):
                 int(self._spike_generation_cpp_position),  # 1
                 int(self._spike_generation_gpu_setting_position),  # 2
                 int(number_of_spikes),  # 3
+                int(self._force_forward_spike_output_on_cpu),  # 4
             ],
             dtype=torch.int64,
         )
@@ -83,14 +92,35 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
 
         assert input.dim() == 4
 
-        if input.device == torch.device("cpu"):
-            spike_number_of_cpu_processes: int = int(parameter_list[0])
-        else:
-            spike_number_of_cpu_processes = -1
-
         spike_generation_cpp_position = int(parameter_list[1])
         spike_generation_gpu_setting_position = int(parameter_list[2])
         number_of_spikes: int = int(parameter_list[3])
+        force_forward_spike_output_on_cpu: bool = bool(parameter_list[4])
+
+        if (
+            isinstance(
+                global_spike_generation_cpp[spike_generation_cpp_position],
+                SpikeGenerationCPU,
+            )
+            is True
+        ):
+            are_we_on_a_cpu: bool = True
+            work_device: torch.device = torch.device("cpu")
+        else:
+            are_we_on_a_cpu = False
+            work_device = input.device
+
+        target_device: torch.device = input.device
+
+        if target_device == work_device:
+            data_is_on_the_same_device: bool = True
+        else:
+            data_is_on_the_same_device = False
+
+        if are_we_on_a_cpu is True:
+            spike_number_of_cpu_processes: int = int(parameter_list[0])
+        else:
+            spike_number_of_cpu_processes = -1
 
         # ###########################################################
         # Spike generation
         # ###########################################################
@@ -100,7 +130,12 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
         # Normalized cumsum
         # (beware of the pytorch bug! Thus .clone()!)
         # ############################################
-        input_cumsum: torch.Tensor = torch.cumsum(input, dim=1, dtype=input.dtype)
+        if data_is_on_the_same_device is False:
+            input_work = input.to(work_device)
+        else:
+            input_work = input
+        # input_work = input
+        input_cumsum: torch.Tensor = torch.cumsum(input_work, dim=1, dtype=input.dtype)
         input_cumsum_last: torch.Tensor = input_cumsum[:, -1, :, :].unsqueeze(1).clone()
         input_cumsum /= input_cumsum_last
@@ -115,17 +150,19 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
                 input_cumsum.shape[3],
             ],
             dtype=input.dtype,
-            device=input.device,
+            device=work_device,
         )
 
         # ############################################
         # Make space for the results
         # ############################################
-        spikes = torch.empty_like(random_values, dtype=torch.int64, device=input.device)
+        spikes_work = torch.empty_like(
+            random_values, dtype=torch.int64, device=work_device
+        )
 
         assert input_cumsum.is_contiguous() is True
         assert random_values.is_contiguous() is True
-        assert spikes.is_contiguous() is True
+        assert spikes_work.is_contiguous() is True
 
         # time_start: float = time.perf_counter()
         spike_generation_profile = global_spike_generation_gpu_setting[
             spike_generation_gpu_setting_position
         ]
 
         spike_generation_size = global_spike_size[
             spike_generation_gpu_setting_position
         ].clone()
 
-        if (
-            isinstance(
-                global_spike_generation_cpp[spike_generation_cpp_position],
-                SpikeGenerationGPU,
-            )
-            is True
-        ):
+        if are_we_on_a_cpu is False:
             if (
                 (spike_generation_profile.numel() == 1)
-                or (spike_generation_size[0] != int(spikes.shape[0]))
-                or (spike_generation_size[1] != int(spikes.shape[1]))
-                or (spike_generation_size[2] != int(spikes.shape[2]))
-                or (spike_generation_size[3] != int(spikes.shape[3]))
+                or (spike_generation_size[0] != int(spikes_work.shape[0]))
+                or (spike_generation_size[1] != int(spikes_work.shape[1]))
+                or (spike_generation_size[2] != int(spikes_work.shape[2]))
+                or (spike_generation_size[3] != int(spikes_work.shape[3]))
             ):
                 spike_generation_profile = torch.zeros(
@@ -157,10 +188,10 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
                    (2, 7), dtype=torch.int64, device=torch.device("cpu")
                )
                global_spike_generation_cpp[
                    spike_generation_cpp_position
                ].gpu_occupancy_export(
-                    int(spikes.shape[2]),
-                    int(spikes.shape[3]),
-                    int(spikes.shape[0]),
-                    int(spikes.shape[1]),
+                    int(spikes_work.shape[2]),
+                    int(spikes_work.shape[3]),
+                    int(spikes_work.shape[0]),
+                    int(spikes_work.shape[1]),
                     spike_generation_profile.data_ptr(),
                     int(spike_generation_profile.shape[0]),
                     int(spike_generation_profile.shape[1]),
@@ -169,10 +200,10 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
                     spike_generation_gpu_setting_position
                 ] = spike_generation_profile.clone()
-                spike_generation_size[0] = int(spikes.shape[0])
-                spike_generation_size[1] = int(spikes.shape[1])
-                spike_generation_size[2] = int(spikes.shape[2])
-                spike_generation_size[3] = int(spikes.shape[3])
+                spike_generation_size[0] = int(spikes_work.shape[0])
+                spike_generation_size[1] = int(spikes_work.shape[1])
+                spike_generation_size[2] = int(spikes_work.shape[2])
+                spike_generation_size[3] = int(spikes_work.shape[3])
                 global_spike_size[
                     spike_generation_gpu_setting_position
                 ] = spike_generation_size.clone()
@@ -197,15 +228,20 @@ class FunctionalSpikeGeneration(torch.autograd.Function):
             int(random_values.shape[1]),
             int(random_values.shape[2]),
             int(random_values.shape[3]),
-            spikes.data_ptr(),
-            int(spikes.shape[0]),
-            int(spikes.shape[1]),
-            int(spikes.shape[2]),
-            int(spikes.shape[3]),
+            spikes_work.data_ptr(),
+            int(spikes_work.shape[0]),
+            int(spikes_work.shape[1]),
+            int(spikes_work.shape[2]),
+            int(spikes_work.shape[3]),
             int(spike_number_of_cpu_processes),
         )
-        del random_values
-        del input_cumsum
+
+        if (force_forward_spike_output_on_cpu is True) and (are_we_on_a_cpu is True):
+            spikes = spikes_work
+        elif data_is_on_the_same_device is False:
+            spikes = spikes_work.to(target_device)
+        else:
+            spikes = spikes_work
 
         return spikes
diff --git a/network/build_network.py b/network/build_network.py
index 532116e..ed95395 100644
--- a/network/build_network.py
+++ b/network/build_network.py
@@ -120,6 +120,12 @@ def build_network(
                 cfg.learning_parameters.sbs_skip_gradient_calculation[0]
             )
 
+        spike_full_layer_input_distribution: bool = False
+        if len(cfg.spike_full_layer_input_distribution) > layer_id:
+            spike_full_layer_input_distribution = (
+                cfg.spike_full_layer_input_distribution[layer_id]
+            )
+
         # #############################################################
         # SbS layer:
         # #############################################################
@@ -138,7 +144,10 @@ def build_network(
             assert number_of_spikes > 0
 
             logging.info(
-                f"Layer: {layer_id} -> SbS Layer with {number_of_spikes} spikes"
+                (
+                    f"Layer: {layer_id} -> SbS Layer with {number_of_spikes} spikes "
+                    f"-- draw spike from full layer: {spike_full_layer_input_distribution}"
+                )
             )
             is_pooling_layer: bool = False
             if cfg.network_structure.layer_type[layer_id].upper().find("POOLING") != -1:
@@ -169,6 +178,8 @@ def build_network(
                     layer_id=layer_id,
                    cooldown_after_number_of_spikes=cfg.cooldown_after_number_of_spikes,
                     reduction_cooldown=cfg.reduction_cooldown,
+                    force_forward_h_dynamic_on_cpu=cfg.force_forward_h_dynamic_on_cpu,
+                    spike_full_layer_input_distribution=spike_full_layer_input_distribution,
                 )
             )
             # Adding the x,y output dimensions
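
The device handling added in FunctionalSbS.forward and FunctionalSpikeGeneration.forward above follows one pattern: derive the work device from the type of the registered C++ extension, move the operands there only when they are not already on it, run the kernel, and copy the result back to the device of the original input. The snippet below is a minimal sketch of that round trip, not part of the patch; the helper name is hypothetical and only plain PyTorch is assumed.

import torch


def _illustrative_work_device_roundtrip(
    input_tensor: torch.Tensor,
    kernel_is_cpu: bool,
) -> torch.Tensor:
    # Illustrative only: mirrors the work/target device handling of
    # FunctionalSbS.forward and FunctionalSpikeGeneration.forward.
    work_device = torch.device("cpu") if kernel_is_cpu else input_tensor.device
    target_device = input_tensor.device
    data_is_on_the_same_device = target_device == work_device

    # Move the input to the work device only when necessary.
    if data_is_on_the_same_device is False:
        input_work = input_tensor.to(work_device)
    else:
        input_work = input_tensor

    # Placeholder for the C++ kernel call (HDynamicCNNCPU / HDynamicCNNGPU,
    # SpikeGenerationCPU / SpikeGenerationGPU) that writes into a tensor
    # allocated on work_device.
    output_work = input_work.clone()

    # Copy the result back to the device of the original input if needed.
    if data_is_on_the_same_device is False:
        return output_work.to(target_device)
    return output_work

With force_forward_h_dynamic_on_cpu or force_forward_spike_on_cpu set, the CPU extension is registered even when the layer runs on a GPU, so kernel_is_cpu in the sketch would be True and the tensors take this CPU round trip; force_forward_spike_output_on_cpu additionally keeps the generated spikes on the CPU instead of copying them back.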