Add files via upload

This commit is contained in:
David Rotermund 2023-01-05 13:23:58 +01:00 committed by GitHub
parent 4426f05e61
commit 81ec320f5c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
77 changed files with 8266 additions and 0 deletions

View file

@ -0,0 +1,161 @@
# MIT License
# Copyright 2022 University of Bremen
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#
# David Rotermund ( davrot@uni-bremen.de )
#
#
# Release history:
# ================
# 1.0.0 -- 01.05.2022: first release
#
#
import numpy as np
# [offset] [type] [value] [description]
# 0000 32 bit integer 0x00000801(2049) magic number (MSB first)
# 0004 32 bit integer 60000 number of items
# 0008 unsigned byte ?? label
# 0009 unsigned byte ?? label
# ........
# xxxx unsigned byte ?? label
# The labels values are 0 to 9.
class ReadLabel:
"""Class for reading the labels from an MNIST label file"""
def __init__(self, filename):
self.filename: str = filename
self.data = self.read_from_file(filename)
def read_from_file(self, filename):
int32_data = np.dtype(np.uint32)
int32_data = int32_data.newbyteorder(">")
file = open(filename, "rb")
magic_flag = np.frombuffer(file.read(4), int32_data)[0]
if magic_flag != 2049:
data = np.zeros(0)
number_of_elements = 0
else:
number_of_elements = np.frombuffer(file.read(4), int32_data)[0]
if number_of_elements < 1:
data = np.zeros(0)
else:
data = np.frombuffer(file.read(number_of_elements), dtype=np.uint8)
file.close()
return data
# [offset] [type] [value] [description]
# 0000 32 bit integer 0x00000803(2051) magic number
# 0004 32 bit integer 60000 number of images
# 0008 32 bit integer 28 number of rows
# 0012 32 bit integer 28 number of columns
# 0016 unsigned byte ?? pixel
# 0017 unsigned byte ?? pixel
# ........
# xxxx unsigned byte ?? pixel
# Pixels are organized row-wise.
# Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).
class ReadPicture:
"""Class for reading the images from an MNIST image file"""
def __init__(self, filename):
self.filename: str = filename
self.data = self.read_from_file(filename)
def read_from_file(self, filename):
int32_data = np.dtype(np.uint32)
int32_data = int32_data.newbyteorder(">")
file = open(filename, "rb")
magic_flag = np.frombuffer(file.read(4), int32_data)[0]
if magic_flag != 2051:
data = np.zeros(0)
number_of_elements = 0
else:
number_of_elements = np.frombuffer(file.read(4), int32_data)[0]
if number_of_elements < 1:
data = np.zeros(0)
number_of_rows = 0
else:
number_of_rows = np.frombuffer(file.read(4), int32_data)[0]
if number_of_rows != 28:
data = np.zeros(0)
number_of_columns = 0
else:
number_of_columns = np.frombuffer(file.read(4), int32_data)[0]
if number_of_columns != 28:
data = np.zeros(0)
else:
data = np.frombuffer(
file.read(number_of_elements * number_of_rows * number_of_columns),
dtype=np.uint8,
)
data = data.reshape(number_of_elements, number_of_columns, number_of_rows)
file.close()
return data
def proprocess_data_set(test_mode):
if test_mode is True:
filename_out_pattern: str = "TestPatternStorage.npy"
filename_out_label: str = "TestLabelStorage.npy"
filename_in_image: str = "t10k-images-idx3-ubyte"
filename_in_label = "t10k-labels-idx1-ubyte"
else:
filename_out_pattern = "TrainPatternStorage.npy"
filename_out_label = "TrainLabelStorage.npy"
filename_in_image = "train-images-idx3-ubyte"
filename_in_label = "train-labels-idx1-ubyte"
pictures = ReadPicture(filename_in_image)
labels = ReadLabel(filename_in_label)
# Down to 0 ... 1.0
max_value = np.max(pictures.data.astype(np.float32))
d = np.float32(pictures.data.astype(np.float32) / max_value)
label_storage = np.uint64(labels.data)
pattern_storage = d.astype(np.float32)
np.save(filename_out_pattern, pattern_storage)
np.save(filename_out_label, label_storage)
proprocess_data_set(True)
proprocess_data_set(False)

View file

@ -0,0 +1,8 @@
https://github.com/zalandoresearch/fashion-mnist
We need:
t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz
Then
gzip -d *.gz
python convert.py

10
clean.sh Normal file
View file

@ -0,0 +1,10 @@
read -p "Are you sure? " -n 1 -r
echo # (optional) move to a new line
if [[ ! $REPLY =~ ^[Yy]$ ]]
then
exit 1
fi
rm -rf Log
rm -rf Parameters
rm *.txt

4
dataset.json Normal file
View file

@ -0,0 +1,4 @@
{
"data_path": "./DATA_FASHION_MNIST/",
"data_mode": "MNIST_FASHION"
}

24
def.json Normal file
View file

@ -0,0 +1,24 @@
{
"epoch_id_max": 200,
"number_of_spikes": [
1600
],
"batch_size": 24,
"forgetting_offset": 0.0,
"stage_id": 0,
"simulation_id": 0,
"learning_parameters": {
"learning_rate_threshold_w": 0.001,
"eps_xy_intitial": 1.0,
"number_of_batches_for_one_update": 20,
"learning_rate_gamma_w": 0.001,
"lr_scheduler_patience_w": 50,
"adapt_learning_rate_after_minibatch": true,
"w_trainable": [
true
]
},
"augmentation": {},
"image_statistics": {},
"approximation_setting": {}
}

24
def_sbs_L0.json Normal file
View file

@ -0,0 +1,24 @@
{
"epoch_id_max": 200,
"number_of_spikes": [
1600
],
"batch_size": 24,
"forgetting_offset": 0.0,
"stage_id": 0,
"simulation_id": 0,
"learning_parameters": {
"learning_rate_threshold_w": 0.001,
"eps_xy_intitial": 1.0,
"number_of_batches_for_one_update": 20,
"learning_rate_gamma_w": 1.0,
"lr_scheduler_patience_w": 50,
"adapt_learning_rate_after_minibatch": true,
"w_trainable": [
true
]
},
"augmentation": {},
"image_statistics": {},
"approximation_setting": {}
}

47
get_perf.py Normal file
View file

@ -0,0 +1,47 @@
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing import event_accumulator
import numpy as np
import json
from jsmin import jsmin
import glob
# -------------------------------
filename:str = "def.json"
with open(filename) as json_file:
minified = jsmin(json_file.read())
data = json.loads(minified)
number_of_spikes = data["number_of_spikes"]
# -------------------------------
path_runs: str = "./Log/*"
temp = glob.glob(path_runs)
assert len(temp) == 1
path = temp[0]
acc = event_accumulator.EventAccumulator(path)
acc.Reload()
available_scalar = acc.Tags()["scalars"]
available_histograms = acc.Tags()["histograms"]
which_scalar = "Test Error"
te = acc.Scalars(which_scalar)
temp = []
for te_item in te:
temp.append((te_item[1], te_item[2]))
temp = np.array(temp)
print(temp)
np.save(f"test_error_{number_of_spikes}.npy", temp)

201
learn_it.py Normal file
View file

@ -0,0 +1,201 @@
# %%
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import sys
import torch
import dataconf
import logging
from datetime import datetime
from network.Parameter import Config
from network.build_network import build_network
from network.build_optimizer import build_optimizer
from network.build_lr_scheduler import build_lr_scheduler
from network.build_datasets import build_datasets
from network.load_previous_weights import load_previous_weights
from network.loop_train_test import loop_test, loop_train, run_lr_scheduler
from torch.utils.tensorboard import SummaryWriter
# ######################################################################
# We want to log what is going on into a file and screen
# ######################################################################
now = datetime.now()
dt_string_filename = now.strftime("%Y_%m_%d_%H_%M_%S")
logging.basicConfig(
filename="log_" + dt_string_filename + ".txt",
filemode="w",
level=logging.INFO,
format="%(asctime)s %(message)s",
)
logging.getLogger().addHandler(logging.StreamHandler())
# ######################################################################
# Load the config data from the json file
# ######################################################################
if len(sys.argv) < 2:
raise Exception("Argument: Config file name is missing")
filename: str = sys.argv[1]
if os.path.exists(filename) is False:
raise Exception(f"Config file not found! {filename}")
if os.path.exists("network.json") is False:
raise Exception("Config file not found! network.json")
if os.path.exists("dataset.json") is False:
raise Exception("Config file not found! dataset.json")
cfg = dataconf.multi.file("network.json").file("dataset.json").file(filename).on(Config)
logging.info(cfg)
logging.info(f"Using configuration file: {filename}")
logging.info(f"Number of spikes: {cfg.number_of_spikes}")
logging.info(f"Cooldown after spikes: {cfg.cooldown_after_number_of_spikes}")
logging.info(f"Reduction cooldown: {cfg.reduction_cooldown}")
logging.info("")
logging.info(f"Epsilon 0: {cfg.epsilon_0}")
logging.info(f"Batch size: {cfg.batch_size}")
logging.info(f"Data mode: {cfg.data_mode}")
logging.info("")
logging.info("*** Config loaded.")
logging.info("")
tb = SummaryWriter(log_dir=cfg.log_path)
# ###########################################
# GPU Yes / NO ?
# ###########################################
default_dtype = torch.float32
torch.set_default_dtype(default_dtype)
torch_device: str = "cuda:0" if torch.cuda.is_available() else "cpu"
use_gpu: bool = True if torch.cuda.is_available() else False
print(f"Using {torch_device} device")
device = torch.device(torch_device)
# ######################################################################
# Prepare the test and training data
# ######################################################################
the_dataset_train, the_dataset_test, my_loader_test, my_loader_train = build_datasets(
cfg
)
logging.info("*** Data loaded.")
# ######################################################################
# Build the network, Optimizer, and LR Scheduler #
# ######################################################################
network = build_network(
cfg=cfg, device=device, default_dtype=default_dtype, logging=logging
)
logging.info("")
optimizer = build_optimizer(network=network, cfg=cfg, logging=logging)
lr_scheduler = build_lr_scheduler(optimizer=optimizer, cfg=cfg, logging=logging)
logging.info("*** Network generated.")
load_previous_weights(
network=network,
overload_path=cfg.learning_parameters.overload_path,
logging=logging,
device=device,
default_dtype=default_dtype,
)
logging.info("")
last_test_performance: float = -1.0
with torch.no_grad():
if cfg.learning_parameters.learning_active is True:
while cfg.epoch_id < cfg.epoch_id_max:
# ##############################################
# Run a training data epoch
# ##############################################
network.train()
(
my_loss_for_batch,
performance_for_batch,
full_loss,
full_correct,
) = loop_train(
cfg=cfg,
network=network,
my_loader_train=my_loader_train,
the_dataset_train=the_dataset_train,
optimizer=optimizer,
device=device,
default_dtype=default_dtype,
logging=logging,
tb=tb,
adapt_learning_rate=cfg.learning_parameters.adapt_learning_rate_after_minibatch,
lr_scheduler=lr_scheduler,
last_test_performance=last_test_performance,
)
# Let the torch learning rate scheduler update the
# learning rates of the optimiers
if cfg.learning_parameters.adapt_learning_rate_after_minibatch is False:
run_lr_scheduler(
cfg=cfg,
lr_scheduler=lr_scheduler,
optimizer=optimizer,
performance_for_batch=performance_for_batch,
my_loss_for_batch=my_loss_for_batch,
tb=tb,
logging=logging,
)
# ##############################################
# Run test data
# ##############################################
network.eval()
last_test_performance = loop_test(
epoch_id=cfg.epoch_id,
cfg=cfg,
network=network,
my_loader_test=my_loader_test,
the_dataset_test=the_dataset_test,
device=device,
default_dtype=default_dtype,
logging=logging,
tb=tb,
)
# Next epoch
cfg.epoch_id += 1
else:
# ##############################################
# Run test data
# ##############################################
network.eval()
last_test_performance = loop_test(
epoch_id=cfg.epoch_id,
cfg=cfg,
network=network,
my_loader_test=my_loader_test,
the_dataset_test=the_dataset_test,
device=device,
default_dtype=default_dtype,
logging=logging,
tb=tb,
)
tb.close()
# %%

5
make_new_previous.sh Normal file
View file

@ -0,0 +1,5 @@
MAX_NUMBER=199
MyPATH="Previous++"
mkdir $MyPATH
cp Parameters/*_S$MAX_NUMBER.npy $MyPATH

91
network.json Normal file
View file

@ -0,0 +1,91 @@
{
"network_structure": {
"number_of_output_neurons": 10,
"layer_type": [
"SbS",
"MAX POOLING",
"SbS",
"MAX POOLING",
"SbS",
"SbS"
],
"strides": [
[
1,
1
], // "SbS"
[
2,
2
], // POOLING
[
1,
1
], // "SbS"
[
2,
2
], // POOLING
[
1,
1
], // "SbS"
[
1,
1
] // "SbS"
],
"forward_neuron_numbers": [
[
1,
32
], // "SbS"
[
32,
32
], // POOLING
[
32,
64
], // "SbS"
[
64,
64
], // POOLING
[
64,
96
], // "SbS"
[
96,
10
] // "SbS"
],
"forward_kernel_size": [
[
5,
5
], // "SbS"
[
2,
2
], // POOLING
[
5,
5
], // "SbS"
[
2,
2
], // POOLING
[
3,
3
], // "SbS"
[
1,
1
] // "SbS"
]
}
}

154
network/Adam.py Normal file
View file

@ -0,0 +1,154 @@
import torch
import math
class Adam(torch.optim.Optimizer):
sbs_setting: list[bool]
lr: float
beta1: float
beta2: float
eps: float
maximize: bool
def __init__(
self,
params,
sbs_setting: list[bool],
lr: float = 1e-3,
beta1: float = 0.9,
beta2: float = 0.999,
eps: float = 1e-8,
maximize: bool = False,
) -> None:
assert lr > 0.0
assert eps > 0.0
assert beta1 > 0.0
assert beta1 < 1.0
assert beta2 > 0.0
assert beta2 < 1.0
assert len(sbs_setting) == len(params)
self.sbs_setting = sbs_setting
self.params = params
self.lr = lr
self.beta1 = beta1
self.beta2 = beta2
self.eps = eps
self.maximize = maximize
defaults = dict(
lr=lr,
beta1=beta1,
beta2=beta2,
eps=eps,
maximize=maximize,
)
super().__init__(params, defaults)
def step(self):
params_with_grad = []
grads = []
exp_avgs = []
exp_avg_sqs = []
state_steps = []
sbs_setting = []
for id, p in enumerate(self.params):
if p.grad is not None:
params_with_grad.append(p)
grads.append(p.grad)
sbs_setting.append(self.sbs_setting[id])
state = self.state[p]
# Lazy state initialization
if len(state) == 0:
state["step"] = torch.tensor(0.0)
# Exponential moving average of gradient values
state["exp_avg"] = torch.zeros_like(
p, memory_format=torch.preserve_format
)
# Exponential moving average of squared gradient values
state["exp_avg_sq"] = torch.zeros_like(
p, memory_format=torch.preserve_format
)
exp_avgs.append(state["exp_avg"])
exp_avg_sqs.append(state["exp_avg_sq"])
state_steps.append(state["step"])
self.adam(
params_with_grad,
grads,
sbs_setting,
exp_avgs,
exp_avg_sqs,
state_steps,
beta1=self.beta1,
beta2=self.beta2,
lr=self.lr,
eps=self.eps,
maximize=self.maximize,
)
def adam(
self,
params: list[torch.Tensor],
grads: list[torch.Tensor],
sbs_setting: list[bool],
exp_avgs: list[torch.Tensor],
exp_avg_sqs: list[torch.Tensor],
state_steps: list[torch.Tensor],
beta1: float,
beta2: float,
lr: float,
eps: float,
maximize: bool,
) -> None:
with torch.no_grad():
for i, param in enumerate(params):
if maximize is False:
grad = grads[i]
else:
grad = -grads[i]
exp_avg = exp_avgs[i]
exp_avg_sq = exp_avg_sqs[i]
step_t = state_steps[i]
# increase step
step_t += 1
# Decay the first and second moment running average coefficient
exp_avg *= beta1
exp_avg += (1.0 - beta1) * grad
exp_avg_sq *= beta2
exp_avg_sq += (1.0 - beta2) * grad**2
step_size: float = lr / (1.0 - beta1 ** float(step_t))
denom = (
exp_avg_sq.sqrt() / math.sqrt(1.0 - beta2 ** float(step_t))
) + eps
if sbs_setting[i] is False:
param -= step_size * (exp_avg / denom)
else:
delta = torch.exp(-step_size * (exp_avg / denom))
print(
f"{float(delta.min()) - 1.0:.4e} {float(delta.max()) - 1.0:.4e}"
)
param *= delta

View file

@ -0,0 +1,714 @@
#include "HDynamicCNNManyIP.h"
#include <omp.h>
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <cassert>
#include <iostream>
HDynamicCNNManyIP::HDynamicCNNManyIP()
{
};
HDynamicCNNManyIP::~HDynamicCNNManyIP()
{
};
bool HDynamicCNNManyIP::update_entrypoint(
int64_t h_pointer_addr,
int64_t h_dim_0,
int64_t h_dim_1,
int64_t h_dim_2,
int64_t h_dim_3,
int64_t epsilon_xy_pointer_addr,
int64_t epsilon_xy_dim_0,
int64_t epsilon_xy_dim_1,
int64_t epsilon_xy_dim_2,
int64_t epsilon_t_pointer_addr,
int64_t epsilon_t_dim_0,
int64_t weights_pointer_addr,
int64_t weights_dim_0,
int64_t weights_dim_1,
int64_t input_pointer_addr,
int64_t input_dim_0,
int64_t input_dim_1,
int64_t input_dim_2,
int64_t input_dim_3,
int64_t init_vector_pointer_addr,
int64_t init_vector_dim_0,
int64_t number_of_processes,
float forgetting_offset,
int64_t gpu_tuning_factor)
{
size_t number_of_pattern = input_dim_0;
size_t h_dim = init_vector_dim_0;
float* h_init_ptr = (float*)init_vector_pointer_addr;
assert((h_init_ptr != nullptr));
assert((h_dim > 0));
float* h_pointer = (float*)h_pointer_addr;
assert((h_pointer != nullptr));
assert((h_dim_0 > 0));
assert((h_dim_1 > 0));
assert((h_dim_2 > 0));
assert((h_dim_3 > 0));
size_t h_dim_c0 = h_dim_1 * h_dim_2 * h_dim_3;
size_t h_dim_c1 = h_dim_2 * h_dim_3;
size_t h_dim_c2 = h_dim_3;
float* epsilon_xy_pointer = (float*)epsilon_xy_pointer_addr;
assert((epsilon_xy_pointer != nullptr));
assert((epsilon_xy_dim_0 > 0));
assert((epsilon_xy_dim_1 > 0));
size_t epsilon_xy_dim_c0 = epsilon_xy_dim_2 * epsilon_xy_dim_1;
size_t epsilon_xy_dim_c1 = epsilon_xy_dim_2;
float* epsilon_t_pointer = (float*)epsilon_t_pointer_addr;
assert((epsilon_t_pointer != nullptr));
assert((epsilon_t_dim_0 > 0));
float* weights_pointer = (float*)weights_pointer_addr;
assert((weights_pointer != nullptr));
assert((weights_dim_0 > 0));
assert((weights_dim_1 > 0));
size_t weights_dim_c0 = weights_dim_1;
int64_t* input_pointer = (int64_t*)input_pointer_addr;
assert((input_pointer != nullptr));
assert((input_dim_0 > 0));
assert((input_dim_1 > 0));
assert((input_dim_2 > 0));
assert((input_dim_3 > 0));
size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
size_t input_dim_c1 = input_dim_2 * input_dim_3;
size_t input_dim_c2 = input_dim_3;
assert((h_dim == weights_dim_1));
size_t number_of_spikes = input_dim_1;
size_t dim_x = input_dim_2;
size_t dim_y = input_dim_3;
float forgetting_offset_local = forgetting_offset / static_cast<float>(h_dim);
// --------------------
if (number_of_processes > 0)
{
omp_set_num_threads(number_of_processes);
size_t pattern_id;
#pragma omp parallel for
for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
{
update(
h_init_ptr,
h_pointer,
h_dim_c0,
h_dim_c1,
h_dim_c2,
h_dim,
epsilon_xy_pointer,
epsilon_xy_dim_c0,
epsilon_xy_dim_c1,
epsilon_t_pointer,
weights_pointer,
weights_dim_c0,
input_pointer,
input_dim_c0,
input_dim_c1,
input_dim_c2,
number_of_spikes,
dim_x,
dim_y,
forgetting_offset,
forgetting_offset_local,
pattern_id);
}
}
else
{
gpu_update(
h_init_ptr,
h_pointer,
h_dim_c0,
h_dim_c1,
h_dim_c2,
h_dim,
epsilon_xy_pointer,
epsilon_xy_dim_c0,
epsilon_xy_dim_c1,
epsilon_t_pointer,
weights_pointer,
weights_dim_c0,
input_pointer,
input_dim_c0,
input_dim_c1,
input_dim_c2,
number_of_spikes,
dim_x,
dim_y,
forgetting_offset,
forgetting_offset_local,
number_of_pattern,
gpu_tuning_factor);
}
return true;
};
bool HDynamicCNNManyIP::update(
float* h_init_ptr,
float* h_pointer,
size_t h_dim_c0,
size_t h_dim_c1,
size_t h_dim_c2,
size_t h_dim,
float* epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
size_t epsilon_xy_dim_c1,
float* epsilon_t_pointer,
float* weights_pointer,
size_t weights_dim_c0,
int64_t* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
size_t number_of_spikes,
size_t dim_x,
size_t dim_y,
float forgetting_offset,
float forgetting_offset_local,
size_t pattern_id)
{
float* h_ptr;
float* epsilon_xy_ptr;
int64_t* input_ptr;
size_t counter_x;
size_t counter_y;
for (counter_x = 0; counter_x < dim_x; counter_x++)
{
for (counter_y = 0; counter_y < dim_y; counter_y++)
{
epsilon_xy_ptr = epsilon_xy_pointer +
counter_x * epsilon_xy_dim_c1 + counter_y;
h_ptr = h_pointer +
pattern_id * h_dim_c0 + counter_x * h_dim_c2 + counter_y;
input_ptr = input_pointer +
pattern_id * input_dim_c0 + counter_x * input_dim_c2 + counter_y;
update_one_ip(
h_init_ptr,
h_ptr,
h_dim_c1,
h_dim,
weights_pointer,
weights_dim_c0,
input_ptr,
input_dim_c1,
epsilon_xy_ptr,
epsilon_xy_dim_c0,
epsilon_t_pointer,
number_of_spikes,
forgetting_offset,
forgetting_offset_local);
}
}
return true;
};
void HDynamicCNNManyIP::update_one_ip(
float* h_init_ptr,
float* h_pointer,
size_t h_dim_c1,
size_t h_dim,
float* weights_pointer,
size_t weights_dim_c0,
int64_t* input_pointer,
size_t input_dim_c1,
float* epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
float* epsilon_t_pointer,
size_t number_of_spikes,
float forgetting_offset,
float forgetting_offset_local)
{
float* h_temp = new float[h_dim];
float* h_subsegment = new float[h_dim];
memcpy(h_subsegment, h_init_ptr, sizeof(float) * h_dim);
size_t counter_spike;
size_t counter;
float h_temp_sum;
float temp_value;
float epsilon_subsegment;
float epsilon_scale = 1.0;
int64_t* spike;
float* w_ptr;
for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++)
{
if (epsilon_scale > 1E10)
{
temp_value = 1.0 / epsilon_scale;
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_subsegment[counter] *= temp_value;
}
epsilon_scale = 1.0;
}
spike = input_pointer + counter_spike * input_dim_c1;
if (*spike >= 0)
{
epsilon_subsegment =
epsilon_xy_pointer[*spike *epsilon_xy_dim_c0] * epsilon_t_pointer[counter_spike];
w_ptr = weights_pointer + *spike * weights_dim_c0;
memcpy(h_temp, h_subsegment, sizeof(float) * h_dim);
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_temp[counter] *= w_ptr[counter];
}
h_temp_sum = 0.0;
#pragma omp simd reduction(+ : h_temp_sum)
for (counter = 0; counter < h_dim; counter++)
{
h_temp_sum += h_temp[counter];
}
if (h_temp_sum > 1E-10)
{
temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum;
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_temp[counter] *= temp_value;
}
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_subsegment[counter] += h_temp[counter];
}
if (forgetting_offset_local > 0.0)
{
temp_value =
epsilon_scale * epsilon_subsegment * forgetting_offset_local;
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_subsegment[counter] += temp_value;
}
epsilon_scale *=
1.0 + epsilon_subsegment * (1.0 + forgetting_offset);
}
else
{
epsilon_scale *= 1.0 + epsilon_subsegment * 1.0;
}
}
}
}
temp_value = 1.0 / epsilon_scale;
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_pointer[counter * h_dim_c1] =
h_subsegment[counter] * temp_value;
}
delete[] h_temp;
delete[] h_subsegment;
return;
};
__device__ void gpu_update_one_ip(
float* __restrict__ h_init_ptr,
float* __restrict__ h_pointer,
size_t h_dim_c1,
size_t h_dim,
float* __restrict__ weights_pointer,
size_t weights_dim_c0,
int64_t* input_pointer,
size_t input_dim_c1,
float* __restrict__ epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
float* __restrict__ epsilon_t_pointer,
size_t number_of_spikes,
float forgetting_offset,
float forgetting_offset_local,
float* __restrict__ h_temp,
float* __restrict__ h_subsegment
)
{
size_t counter_spike;
size_t counter;
float h_temp_sum;
float temp_value;
float epsilon_subsegment;
float epsilon_scale = 1.0;
int64_t* spike;
float* w_ptr;
// float* h_temp = new float[h_dim];
// float* h_subsegment = new float[h_dim];
// Initialize the sub-segement
for (counter = 0; counter < h_dim; counter++)
{
h_subsegment[counter] = h_init_ptr[counter];
}
for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++)
{
if (epsilon_scale > 1E10)
{
temp_value = 1.0 / epsilon_scale;
for (counter = 0; counter < h_dim; counter++)
{
h_subsegment[counter] *= temp_value;
}
epsilon_scale = 1.0;
}
spike = input_pointer + counter_spike * input_dim_c1;
if (*spike >= 0)
{
epsilon_subsegment =
epsilon_xy_pointer[*spike *epsilon_xy_dim_c0] * epsilon_t_pointer[counter_spike];
w_ptr = weights_pointer + *spike * weights_dim_c0;
for (counter = 0; counter < h_dim; counter++)
{
h_temp[counter] = h_subsegment[counter] * w_ptr[counter];
}
h_temp_sum = 0.0;
for (counter = 0; counter < h_dim; counter++)
{
h_temp_sum += h_temp[counter];
}
if (h_temp_sum > 1E-10)
{
temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum;
for (counter = 0; counter < h_dim; counter++)
{
h_temp[counter] *= temp_value;
}
for (counter = 0; counter < h_dim; counter++)
{
h_subsegment[counter] += h_temp[counter];
}
if (forgetting_offset_local > 0.0)
{
temp_value =
epsilon_scale * epsilon_subsegment * forgetting_offset_local;
for (counter = 0; counter < h_dim; counter++)
{
h_subsegment[counter] += temp_value;
}
epsilon_scale *=
1.0 + epsilon_subsegment * (1.0 + forgetting_offset);
}
else
{
epsilon_scale *= 1.0 + epsilon_subsegment * 1.0;
}
}
}
}
temp_value = 1.0 / epsilon_scale;
for (counter = 0; counter < h_dim; counter++)
{
h_pointer[counter * h_dim_c1] =
h_subsegment[counter] * temp_value;
}
// delete[] h_temp;
// delete[] h_subsegment;
return;
};
__global__ void kernel_spike_generation(
float* __restrict__ h_init_ptr,
float* __restrict__ h_pointer,
size_t h_dim_c0,
size_t h_dim_c1,
size_t h_dim_c2,
size_t h_dim,
float* __restrict__ weights_pointer,
size_t weights_dim_c0,
int64_t* __restrict__ input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
float* __restrict__ epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
size_t epsilon_xy_dim_c1,
float* __restrict__ epsilon_t_pointer,
size_t number_of_spikes,
float forgetting_offset,
float forgetting_offset_local,
size_t dim_x,
size_t dim_y,
size_t dim_xy,
size_t max_threadable_tasks,
float* __restrict__ temp_memory_a,
float* __restrict__ temp_memory_b
)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < max_threadable_tasks)
{
float* h_ptr;
float* epsilon_xy_ptr;
int64_t* input_ptr;
float* temp_memory_ptr_a = temp_memory_a + idx * h_dim;
float* temp_memory_ptr_b = temp_memory_b + idx * h_dim;
// int pattern_id = idx;
int pattern_id = idx / dim_xy;
int position_xy = idx - (pattern_id * dim_xy);
// size_t position_x = blockIdx.y;
// size_t position_y = blockIdx.z;
size_t position_x = position_xy / dim_y;
size_t position_y = position_xy - (position_x * dim_y);
epsilon_xy_ptr = epsilon_xy_pointer +
position_x * epsilon_xy_dim_c1 + position_y;
h_ptr = h_pointer +
pattern_id * h_dim_c0 + position_x * h_dim_c2 + position_y;
input_ptr = input_pointer +
pattern_id * input_dim_c0 + position_x * input_dim_c2 + position_y;
gpu_update_one_ip(
h_init_ptr,
h_ptr,
h_dim_c1,
h_dim,
weights_pointer,
weights_dim_c0,
input_ptr,
input_dim_c1,
epsilon_xy_ptr,
epsilon_xy_dim_c0,
epsilon_t_pointer,
number_of_spikes,
forgetting_offset,
forgetting_offset_local,
temp_memory_ptr_a,
temp_memory_ptr_b
);
}
};
// Let's face it... We need a better way to paralelize it...
bool HDynamicCNNManyIP::gpu_update(
float* h_init_ptr,
float* h_pointer,
size_t h_dim_c0,
size_t h_dim_c1,
size_t h_dim_c2,
size_t h_dim,
float* epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
size_t epsilon_xy_dim_c1,
float* epsilon_t_pointer,
float* weights_pointer,
size_t weights_dim_c0,
int64_t* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
size_t number_of_spikes,
size_t dim_x,
size_t dim_y,
float forgetting_offset,
float forgetting_offset_local,
size_t number_of_pattern,
size_t gpu_tuning_factor)
{
cudaError_t status;
assert((dim_x < 65535));
assert((dim_y < 65535));
// // //////////////////////////////////////
// // Get infos about the device
// // //////////////////////////////////////
// int device;
// cudaDeviceProp prop;
// status = cudaGetDevice(&device);
// assert((status == cudaSuccess));
// // std::cout << "Device ID: " << device << std::endl;
// status = cudaGetDeviceProperties(&prop, device);
// assert((status == cudaSuccess));
// // std::cout << "Device name: " << prop.name << std::endl;
// int _cuda_heap_size_in_mb = 16;
// status = cudaDeviceSetLimit(cudaLimitMallocHeapSize, _cuda_heap_size_in_mb * (1 << 20));
// assert((status == cudaSuccess));
// size_t pValue;
// cudaDeviceGetLimit(&pValue, cudaLimitMallocHeapSize);
// std::cout << pValue << " " << (pValue/(2*4*h_dim)) << std::endl;
// exit(1);
// //////////////////////////////////////
// Calculate the distribution on the GPU
// //////////////////////////////////////
int min_grid_size;
int block_size;
int grid_size;
size_t dynamic_s_mem_size = 0;
size_t max_threadable_tasks = number_of_pattern * dim_x * dim_y;
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=blocksize#occupancy-calculator
status = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
(void*)kernel_spike_generation,
dynamic_s_mem_size, max_threadable_tasks);
assert((status == cudaSuccess));
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
// Maximum dimensionality of grid of thread blocks: 3
// Maximum x -dimension of a grid of thread blocks: (2^31)-1
// Maximum y- or z-dimension of a grid of thread blocks: 65535
// Reduce the automatic block size with our guess
if ((gpu_tuning_factor > 0) && (gpu_tuning_factor < block_size))
{
block_size = int(gpu_tuning_factor);
}
// Round up according to array size
// (I will separate x and y into other grid dimentsions soon)
// grid_size = (number_of_pattern + block_size - 1) / block_size;
grid_size = (max_threadable_tasks + block_size - 1) / block_size;
// std::cout << min_grid_size << std::endl;
// std::cout << grid_size << std::endl;
// std::cout << block_size << std::endl;
// std::cout << max_threadable_tasks << std::endl;
//dim3 grid(grid_size, dim_x, dim_y);
float* temp_memory_a = nullptr;
status = cudaMalloc((void**)&temp_memory_a, h_dim * max_threadable_tasks * sizeof(float));
assert((status == cudaSuccess));
float* temp_memory_b = nullptr;
status = cudaMalloc((void**)&temp_memory_b, h_dim * max_threadable_tasks * sizeof(float));
assert((status == cudaSuccess));
//kernel_spike_generation<<<grid, block_size >>>(
kernel_spike_generation<<<grid_size, block_size >>>(
h_init_ptr,
h_pointer,
h_dim_c0,
h_dim_c1,
h_dim_c2,
h_dim,
weights_pointer,
weights_dim_c0,
input_pointer,
input_dim_c0,
input_dim_c1,
input_dim_c2,
epsilon_xy_pointer,
epsilon_xy_dim_c0,
epsilon_xy_dim_c1,
epsilon_t_pointer,
number_of_spikes,
forgetting_offset,
forgetting_offset_local,
dim_x,
dim_y,
(dim_x * dim_y),
//number_of_pattern
max_threadable_tasks,
temp_memory_a,
temp_memory_b
);
status = cudaDeviceSynchronize();
assert((status == cudaSuccess));
status = cudaFree(temp_memory_a);
assert((status == cudaSuccess));
status = cudaFree(temp_memory_b);
assert((status == cudaSuccess));
return true;
};

View file

@ -0,0 +1,111 @@
#ifndef SRC_HDYNAMICCNNMANYIP_H_
#define SRC_HDYNAMICCNNMANYIP_H_
#include <unistd.h>
#include <cctype>
#include <iostream>
class HDynamicCNNManyIP
{
public:
HDynamicCNNManyIP();
~HDynamicCNNManyIP();
bool update_entrypoint(
int64_t h_pointer_addr,
int64_t h_dim_0,
int64_t h_dim_1,
int64_t h_dim_2,
int64_t h_dim_3,
int64_t epsilon_xy_pointer_addr,
int64_t epsilon_xy_dim_0,
int64_t epsilon_xy_dim_1,
int64_t epsilon_xy_dim_2,
int64_t epsilon_t_pointer_addr,
int64_t epsilon_t_dim_0,
int64_t weights_pointer_addr,
int64_t weights_dim_0,
int64_t weights_dim_1,
int64_t input_pointer_addr,
int64_t input_dim_0,
int64_t input_dim_1,
int64_t input_dim_2,
int64_t input_dim_3,
int64_t init_vector_pointer_addr,
int64_t init_vector_dim_0,
int64_t number_of_processes,
float forgetting_offset,
int64_t gpu_tuning_factor);
private:
bool update(
float* h_init_ptr,
float* h_pointer,
size_t h_dim_c0,
size_t h_dim_c1,
size_t h_dim_c2,
size_t h_dim,
float* epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
size_t epsilon_xy_dim_c1,
float* epsilon_t_pointer,
float* weights_pointer,
size_t weights_dim_c0,
int64_t* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
size_t number_of_spikes,
size_t dim_x,
size_t dim_y,
float forgetting_offset,
float forgetting_offset_local,
size_t pattern_id);
void update_one_ip(
float* h_init_ptr,
float* h_pointer,
size_t h_dim_c1,
size_t h_dim,
float* weights_pointer,
size_t weights_dim_c0,
int64_t* input_pointer,
size_t input_dim_c1,
float* epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
float* epsilon_t_pointer,
size_t number_of_spikes,
float forgetting_offset,
float forgetting_offset_local);
bool gpu_update(
float* h_init_ptr,
float* h_pointer,
size_t h_dim_c0,
size_t h_dim_c1,
size_t h_dim_c2,
size_t h_dim,
float* epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
size_t epsilon_xy_dim_c1,
float* epsilon_t_pointer,
float* weights_pointer,
size_t weights_dim_c0,
int64_t* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
size_t number_of_spikes,
size_t dim_x,
size_t dim_y,
float forgetting_offset,
float forgetting_offset_local,
size_t number_of_pattern,
size_t gpu_tuning_factor);
};
#endif /* SRC_HDYNAMICCNNMANYIP_H_ */

67
network/CPP_Cuda/Makefile Normal file
View file

@ -0,0 +1,67 @@
# Change to your python bin directory (tested with Python 3.10.4)
PYBIN=~/P3.10GPU/bin/
NVCC=/usr/local/cuda-12/bin/nvcc -allow-unsupported-compiler
CC=/usr/lib64/ccache/clang++
PYBIND11INCLUDE=`$(PYBIN)python3 -m pybind11 --includes`
PARAMETERS_O= -O3 -std=c++14 $(PYBIND11INCLUDE) -ccbin=$(CC) \
-Xcompiler "-fPIC -Wall -fopenmp=libomp"
PARAMETERS_Linker=-Xcompiler "-shared -lm -lomp -lstdc++ -Wall"
PYPOSTFIX=`$(PYBIN)python3-config --extension-suffix`
all: PyHDynamicCNNManyIP \
PySpikeGeneration2DManyIP \
PyMultiApp
#######################
HDynamicCNNManyIP.o: HDynamicCNNManyIP.h HDynamicCNNManyIP.cu
$(NVCC) $(PARAMETERS_O) -c HDynamicCNNManyIP.cu -o HDynamicCNNManyIP.o
PyHDynamicCNNManyIP.o: HDynamicCNNManyIP.h PyHDynamicCNNManyIP.cpp
$(NVCC) $(PARAMETERS_O) -c PyHDynamicCNNManyIP.cpp -o PyHDynamicCNNManyIP.o
PyHDynamicCNNManyIP: HDynamicCNNManyIP.o PyHDynamicCNNManyIP.o
$(NVCC) $(PARAMETERS_Linker) -o PyHDynamicCNNManyIP HDynamicCNNManyIP.o PyHDynamicCNNManyIP.o
cp PyHDynamicCNNManyIP PyHDynamicCNNManyIP$(PYPOSTFIX)
$(PYBIN)python3 pybind11_auto_pyi.py
#######################
SpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h SpikeGeneration2DManyIP.cu
$(NVCC) $(PARAMETERS_O) -c SpikeGeneration2DManyIP.cu -o SpikeGeneration2DManyIP.o
PySpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h PySpikeGeneration2DManyIP.cpp
$(NVCC) $(PARAMETERS_O) -c PySpikeGeneration2DManyIP.cpp -o PySpikeGeneration2DManyIP.o
PySpikeGeneration2DManyIP: SpikeGeneration2DManyIP.o PySpikeGeneration2DManyIP.o
$(NVCC) $(PARAMETERS_Linker) -o PySpikeGeneration2DManyIP SpikeGeneration2DManyIP.o PySpikeGeneration2DManyIP.o
cp PySpikeGeneration2DManyIP PySpikeGeneration2DManyIP$(PYPOSTFIX)
$(PYBIN)python3 pybind11_auto_pyi.py
#######################
MultiApp.o: MultiApp.h MultiApp.cu approximation_multiplication_function.cpp \
gpu_approximation_multiplication_function.cu error_term.cpp gpu_error_term.cu
$(NVCC) $(PARAMETERS_O) -c MultiApp.cu -o MultiApp.o
PyMultiApp.o: MultiApp.h PyMultiApp.cpp
$(NVCC) $(PARAMETERS_O) -c PyMultiApp.cpp -o PyMultiApp.o
PyMultiApp: MultiApp.o PyMultiApp.o
$(NVCC) $(PARAMETERS_Linker) -o PyMultiApp MultiApp.o PyMultiApp.o
cp PyMultiApp PyMultiApp$(PYPOSTFIX)
$(PYBIN)python3 pybind11_auto_pyi.py
#######################
clean:
rm -f PyHDynamicCNNManyIP
rm -f PySpikeGeneration2DManyIP
rm -f PyMultiApp
rm -f *.o
rm -f *.so

View file

@ -0,0 +1,313 @@
#include "MultiApp.h"
#include <omp.h>
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <vector>
#include "approximation_multiplication_function.cpp"
#include "gpu_approximation_multiplication_function.cu"
MultiApp::MultiApp()
{
};
MultiApp::~MultiApp()
{
};
bool MultiApp::update(float* np_input_pointer,
float* np_weight_pointer,
float* np_output_pointer, int64_t pattern_dim,
int64_t feature_dim, int64_t x_dim, int64_t y_dim,
int64_t input_channel_dim, int64_t id_pattern,
bool approximation_enable, int64_t number_of_trunc_bits,
int64_t number_of_frac_bits)
{
assert((id_pattern >= 0));
assert((id_pattern < pattern_dim));
float* np_input_pointer_pattern;
float* np_output_pointer_pattern;
float* input_ptr;
float* output_ptr;
float* w_ptr;
uint64_t pattern_size = input_channel_dim;
std::vector<float> ap_h_vector;
ap_h_vector.resize(pattern_size);
float* ap_h_ptr = ap_h_vector.data();
std::vector<uint32_t> ap_x_vector;
ap_x_vector.resize(pattern_size);
uint32_t* ap_x_ptr = ap_x_vector.data();
std::vector<uint32_t> ap_y_vector;
ap_y_vector.resize(pattern_size);
uint32_t* ap_y_ptr = ap_y_vector.data();
std::vector<uint32_t> ap_x_exponent_vector;
ap_x_exponent_vector.resize(pattern_size);
uint32_t* ap_x_exponent_ptr = ap_x_exponent_vector.data();
std::vector<uint32_t> ap_y_exponent_vector;
ap_y_exponent_vector.resize(pattern_size);
uint32_t* ap_y_exponent_ptr = ap_y_exponent_vector.data();
std::vector<uint32_t> ap_h_exponent_vector;
ap_h_exponent_vector.resize(pattern_size);
uint32_t* ap_h_exponent_ptr = ap_h_exponent_vector.data();
std::vector<uint64_t> ap_res_vector;
ap_res_vector.resize(pattern_size);
uint64_t* ap_res_ptr = ap_res_vector.data();
uint32_t ap_mask = static_cast<uint64_t>(pow(2, number_of_trunc_bits)) - 1;
std::vector<uint32_t> sign_temp_vector;
sign_temp_vector.resize(pattern_size);
uint32_t* sign_temp_ptr = sign_temp_vector.data();
uint64_t input_pattern_size = input_channel_dim * x_dim * y_dim;
uint64_t output_pattern_size = feature_dim * x_dim * y_dim;
np_input_pointer_pattern = np_input_pointer + id_pattern * input_pattern_size;
np_output_pointer_pattern =
np_output_pointer + id_pattern * output_pattern_size;
uint64_t counter;
uint64_t counter_x;
uint64_t counter_y;
uint64_t counter_feature;
uint64_t pos_xy;
uint64_t pos_xy_if;
float temp_sum;
uint64_t pattern_c_2 = x_dim * y_dim;
for (counter_x = 0; counter_x < x_dim; counter_x++)
{
for (counter_y = 0; counter_y < y_dim; counter_y++)
{
pos_xy = counter_y + counter_x * y_dim;
for (counter_feature = 0; counter_feature < feature_dim;
counter_feature++)
{
pos_xy_if = counter_feature * pattern_c_2 + pos_xy;
input_ptr = np_input_pointer_pattern + pos_xy;
output_ptr = np_output_pointer_pattern + pos_xy_if;
w_ptr = np_weight_pointer + counter_feature * input_channel_dim;
#pragma omp simd
for (counter = 0; counter < pattern_size; counter++)
{
ap_h_ptr[counter] = input_ptr[counter * pattern_c_2];
}
approximation_multiplication_function(
ap_h_ptr, w_ptr, pattern_size, number_of_trunc_bits,
number_of_frac_bits, ap_x_ptr, ap_y_ptr, ap_x_exponent_ptr,
ap_y_exponent_ptr, ap_h_exponent_ptr, ap_mask, ap_res_ptr,
sign_temp_ptr, approximation_enable);
temp_sum = 0.0;
#pragma omp simd reduction(+ \
: temp_sum)
for (counter = 0; counter < pattern_size; counter++)
{
temp_sum += ap_h_ptr[counter];
}
output_ptr[0] = temp_sum;
}
}
}
return true;
};
bool MultiApp::update_with_init_vector_multi_pattern(
int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr,
int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim,
int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
int64_t number_of_processes, bool approximation_enable,
int64_t number_of_trunc_bits, int64_t number_of_frac)
{
int64_t number_of_pattern = pattern_dim;
int64_t pattern_id;
float* np_input_pointer = (float*)np_input_pointer_addr;
float* np_weight_pointer = (float*)np_weight_pointer_addr;
float* np_output_pointer = (float*)np_output_pointer_addr;
assert((np_input_pointer != nullptr));
assert((np_output_pointer != nullptr));
assert((np_weight_pointer != nullptr));
assert((pattern_dim > 0));
assert((feature_dim > 0));
assert((x_dim > 0));
assert((y_dim > 0));
assert((input_channel_dim > 0));
if (number_of_processes > 0)
{
omp_set_num_threads(number_of_processes);
// For debugging: Only one thread
// omp_set_num_threads(1);
#pragma omp parallel for
for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
{
update(np_input_pointer, np_weight_pointer,
np_output_pointer, pattern_dim, feature_dim, x_dim, y_dim,
input_channel_dim, pattern_id, approximation_enable,
number_of_trunc_bits, number_of_frac);
}
}
else
{
update_gpu(np_input_pointer, np_weight_pointer,
np_output_pointer, pattern_dim, feature_dim, x_dim, y_dim,
input_channel_dim, approximation_enable,
number_of_trunc_bits, number_of_frac);
}
return true;
};
__global__ void kernel_approx_multiplication(float* __restrict__ input_pointer, float* __restrict__ weight_pointer,
float* __restrict__ output_pointer, uint64_t pattern_dim,
uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
uint64_t input_channel_dim, size_t max_threadable_tasks,
uint64_t input_index_scale, uint64_t number_of_frac_bits,
bool approximation_enable, uint64_t number_of_trunc_bits,
uint32_t ap_mask)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < max_threadable_tasks)
{
int pattern_id = idx / feature_dim;
int feature_id = idx - (pattern_id * feature_dim);
int x_id = blockIdx.y;
int y_id = blockIdx.z;
float* weight_pointer_sub = weight_pointer + feature_id * input_channel_dim;
float* input_pointer_sub = input_pointer + pattern_id * input_channel_dim * x_dim * y_dim + x_id * y_dim + y_id;
float* output_pointer_sub = output_pointer +
pattern_id * feature_dim * x_dim * y_dim +
feature_id * x_dim * y_dim + x_id * y_dim + y_id;
*output_pointer_sub = 0.0;
size_t counter;
for (counter = 0; counter < input_channel_dim; counter++)
{
*output_pointer_sub += gpu_approximation_multiplication_function(
weight_pointer_sub[counter],
input_pointer_sub[counter * input_index_scale],
number_of_frac_bits, approximation_enable,
number_of_trunc_bits, ap_mask);
}
}
};
bool MultiApp::update_gpu(float* np_input_pointer,
float* np_weight_pointer,
float* np_output_pointer, uint64_t pattern_dim,
uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
uint64_t input_channel_dim,
bool approximation_enable, uint64_t number_of_trunc_bits,
uint64_t number_of_frac_bits)
{
uint32_t ap_mask = static_cast<uint64_t>(pow(2, number_of_trunc_bits)) - 1;
// std::cout << approximation_enable << std::endl;
// std::cout << number_of_trunc_bits << std::endl;
// std::cout << number_of_frac_bits << std::endl;
cudaError_t status;
assert((x_dim < 65535));
assert((y_dim < 65535));
// //////////////////////////////////////
// Get infos about the device
// //////////////////////////////////////
int device;
cudaDeviceProp prop;
status = cudaGetDevice(&device);
assert((status == cudaSuccess));
// std::cout << "Device ID: " << device << std::endl;
status = cudaGetDeviceProperties(&prop, device);
assert((status == cudaSuccess));
// std::cout << "Device name: " << prop.name << std::endl;
// //////////////////////////////////////
// Calculate the distribution on the GPU
// //////////////////////////////////////
int min_grid_size;
int block_size;
int grid_size;
size_t dynamic_s_mem_size = 0;
size_t max_threadable_tasks = pattern_dim * feature_dim * x_dim * y_dim;
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=blocksize#occupancy-calculator
status = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
(void*)kernel_approx_multiplication,
dynamic_s_mem_size, max_threadable_tasks);
assert((status == cudaSuccess));
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
// Maximum dimensionality of grid of thread blocks: 3
// Maximum x -dimension of a grid of thread blocks: (2^31)-1
// Maximum y- or z-dimension of a grid of thread blocks: 65535
// Round up according to array size
grid_size = ((pattern_dim * feature_dim) + block_size - 1) / block_size;
// std::cout << min_grid_size << std::endl;
// std::cout << grid_size << std::endl;
// std::cout << block_size << std::endl;
// std::cout << max_threadable_tasks << std::endl;
dim3 grid(grid_size, x_dim, y_dim);
kernel_approx_multiplication<<<grid, block_size>>>(np_input_pointer,
np_weight_pointer,
np_output_pointer,
pattern_dim,
feature_dim,
x_dim,
y_dim,
input_channel_dim,
(pattern_dim * feature_dim),
(x_dim * y_dim),
number_of_frac_bits,
approximation_enable,
number_of_trunc_bits,
ap_mask);
cudaDeviceSynchronize();
return true;
};

View file

@ -0,0 +1,39 @@
#ifndef SRC_MultiApp_H_
#define SRC_MultiApp_H_
#include <unistd.h>
#include <cctype>
#include <iostream>
class MultiApp
{
public:
MultiApp();
~MultiApp();
bool update(float *np_input_pointer, float *np_weight_pointer,
float *np_output_pointer, int64_t pattern_dim,
int64_t feature_dim, int64_t x_dim, int64_t y_dim,
int64_t input_channel_dim, int64_t id_pattern,
bool approximation_enable, int64_t number_of_trunc_bits,
int64_t number_of_frac);
bool update_gpu(float *input_pointer, float *weight_pointer,
float *output_pointer, uint64_t pattern_dim,
uint64_t feature_dim, uint64_t x_dim, uint64_t y_dim,
uint64_t input_channel_dim,
bool approximation_enable, uint64_t number_of_trunc_bits,
uint64_t number_of_frac);
bool update_with_init_vector_multi_pattern(
int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr,
int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim,
int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
int64_t number_of_processes, bool approximation_enable,
int64_t number_of_trunc_bits, int64_t number_of_frac);
private:
};
#endif /* SRC_MultiApp_H_ */

View file

@ -0,0 +1,14 @@
#include <pybind11/pybind11.h>
#include "HDynamicCNNManyIP.h"
namespace py = pybind11;
PYBIND11_MODULE(PyHDynamicCNNManyIP, m)
{
m.doc() = "HDynamicCNNManyIP Module";
py::class_<HDynamicCNNManyIP>(m, "HDynamicCNNManyIP")
.def(py::init<>())
.def("update",
&HDynamicCNNManyIP::update_entrypoint);
}

View file

@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#
"""HDynamicCNNManyIP Module"""
from __future__ import annotations
import PyHDynamicCNNManyIP
import typing
__all__ = [
"HDynamicCNNManyIP"
]
class HDynamicCNNManyIP():
def __init__(self) -> None: ...
def update(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int, arg16: int, arg17: int, arg18: int, arg19: int, arg20: int, arg21: int, arg22: float, arg23: int) -> bool: ...
pass

View file

@ -0,0 +1,14 @@
#include <pybind11/pybind11.h>
#include "MultiApp.h"
namespace py = pybind11;
PYBIND11_MODULE(PyMultiApp, m) {
m.doc() = "MultiApp Module";
py::class_<MultiApp>(m, "MultiApp")
.def(py::init<>())
.def("update_with_init_vector_multi_pattern",
&MultiApp::update_with_init_vector_multi_pattern);
}

View file

@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#
"""MultiApp Module"""
from __future__ import annotations
import PyMultiApp
import typing
__all__ = [
"MultiApp"
]
class MultiApp():
def __init__(self) -> None: ...
def update_with_init_vector_multi_pattern(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: bool, arg10: int, arg11: int) -> bool: ...
pass

View file

@ -0,0 +1,15 @@
#include <pybind11/pybind11.h>
#include "SpikeGeneration2DManyIP.h"
namespace py = pybind11;
PYBIND11_MODULE(PySpikeGeneration2DManyIP, m)
{
m.doc() = "SpikeGeneration2DManyIP Module";
py::class_<SpikeGeneration2DManyIP>(m, "SpikeGeneration2DManyIP")
.def(py::init<>())
.def("spike_generation",
&SpikeGeneration2DManyIP::spike_generation_entrypoint);
}

View file

@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#
"""SpikeGeneration2DManyIP Module"""
from __future__ import annotations
import PySpikeGeneration2DManyIP
import typing
__all__ = [
"SpikeGeneration2DManyIP"
]
class SpikeGeneration2DManyIP():
def __init__(self) -> None: ...
def spike_generation(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int) -> bool: ...
pass

View file

@ -0,0 +1,390 @@
#include "SpikeGeneration2DManyIP.h"
#include <omp.h>
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <cassert>
#include <iostream>
SpikeGeneration2DManyIP::SpikeGeneration2DManyIP()
{
};
SpikeGeneration2DManyIP::~SpikeGeneration2DManyIP()
{
};
bool SpikeGeneration2DManyIP::spike_generation_entrypoint(
int64_t input_pointer_addr, int64_t input_dim_0,
int64_t input_dim_1, int64_t input_dim_2, int64_t input_dim_3,
int64_t random_values_pointer_addr, int64_t random_values_dim_0,
int64_t random_values_dim_1, int64_t random_values_dim_2,
int64_t random_values_dim_3, int64_t output_pointer_addr,
int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2,
int64_t output_dim_3, int64_t number_of_cpu_processes)
{
float* input_pointer = (float*)input_pointer_addr;
float* random_values_pointer = (float*)random_values_pointer_addr;
int64_t* output_pointer = (int64_t*)output_pointer_addr;
// Input
assert((input_pointer != nullptr));
assert((input_dim_0 > 0));
assert((input_dim_1 > 0));
assert((input_dim_2 > 0));
assert((input_dim_3 > 0));
// Random
assert((random_values_pointer != nullptr));
assert((random_values_dim_0 > 0));
assert((random_values_dim_1 > 0));
assert((random_values_dim_2 > 0));
assert((random_values_dim_3 > 0));
// Output
assert((output_pointer != nullptr));
assert((output_dim_0 > 0));
assert((output_dim_1 > 0));
assert((output_dim_2 > 0));
assert((output_dim_3 > 0));
// Input
size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
size_t input_dim_c1 = input_dim_2 * input_dim_3;
size_t input_dim_c2 = input_dim_3;
// Random
size_t random_values_dim_c0 =
random_values_dim_1 * random_values_dim_2 * random_values_dim_3;
size_t random_values_dim_c1 =
random_values_dim_2 * random_values_dim_3;
size_t random_values_dim_c2 = random_values_dim_3;
// Output
size_t output_dim_c0 =
output_dim_1 * output_dim_2 * output_dim_3;
size_t output_dim_c1 = output_dim_2 * output_dim_3;
size_t output_dim_c2 = output_dim_3;
size_t number_of_pattern = input_dim_0;
size_t h_dim = input_dim_1;
size_t spike_dim = output_dim_1;
size_t x_dim = output_dim_2;
size_t y_dim = output_dim_2;
if (number_of_cpu_processes > 0)
{
omp_set_num_threads(number_of_cpu_processes);
// DEBUG:
// omp_set_num_threads(1);
size_t pattern_id;
#pragma omp parallel for
for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
{
spike_generation(
input_pointer,
input_dim_c0,
input_dim_c1,
input_dim_c2,
random_values_pointer,
random_values_dim_c0,
random_values_dim_c1,
random_values_dim_c2,
output_pointer,
output_dim_c0,
output_dim_c1,
output_dim_c2,
x_dim,
y_dim,
spike_dim,
h_dim,
pattern_id);
}
}
else
{
gpu_spike_generation(
input_pointer,
input_dim_c0,
input_dim_c1,
input_dim_c2,
random_values_pointer,
random_values_dim_c0,
random_values_dim_c1,
random_values_dim_c2,
output_pointer,
output_dim_c0,
output_dim_c1,
output_dim_c2,
x_dim,
y_dim,
spike_dim,
h_dim,
number_of_pattern);
}
return true;
};
bool SpikeGeneration2DManyIP::spike_generation(
float* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
float* random_values_pointer,
size_t random_values_dim_c0,
size_t random_values_dim_c1,
size_t random_values_dim_c2,
int64_t* output_pointer,
size_t output_dim_c0,
size_t output_dim_c1,
size_t output_dim_c2,
size_t x_dim,
size_t y_dim,
size_t spike_dim,
size_t h_dim,
size_t pattern_id)
{
size_t counter;
size_t counter_x = 0;
size_t counter_y = 0;
float* p_ptr = nullptr;
int64_t* out_ptr = nullptr;
float* rand_ptr = nullptr;
for (counter_x = 0; counter_x < x_dim; counter_x++)
{
for (counter_y = 0; counter_y < y_dim; counter_y++)
{
p_ptr = input_pointer + pattern_id * input_dim_c0 +
counter_x * input_dim_c2 + counter_y;
// + counter * input_dim_c1
out_ptr = output_pointer + pattern_id * output_dim_c0 +
counter_x * output_dim_c2 + counter_y;
// + counter * output_dim_c1
rand_ptr = random_values_pointer +
pattern_id * random_values_dim_c0 +
counter_x * random_values_dim_c2 + counter_y;
// + counter * random_values_dim_c1
for (counter = 0; counter < spike_dim; counter++)
{
out_ptr[counter * output_dim_c1] = lower_bound(p_ptr,
h_dim,
input_dim_c1,
rand_ptr[counter * random_values_dim_c1]);
}
}
}
return true;
};
// algorithmic idea stolen from libc++
size_t SpikeGeneration2DManyIP::lower_bound(float* data_ptr,
size_t data_length,
size_t data_ptr_stride,
float compare_to_value)
{
size_t start_of_range = 0;
size_t length_of_range = data_length;
while (length_of_range != 0)
{
size_t half_length = length_of_range >> 1;
size_t actual_position = start_of_range + half_length;
if (data_ptr[actual_position * data_ptr_stride] < compare_to_value)
{
start_of_range = ++actual_position;
length_of_range -= half_length + 1;
}
else
length_of_range = half_length;
}
return start_of_range;
};
__device__ size_t gpu_lower_bound(float* __restrict__ data_ptr,
size_t data_length,
size_t data_ptr_stride,
float compare_to_value)
{
size_t start_of_range = 0;
size_t length_of_range = data_length;
while (length_of_range != 0)
{
size_t half_length = length_of_range >> 1;
size_t actual_position = start_of_range + half_length;
if (data_ptr[actual_position * data_ptr_stride] < compare_to_value)
{
start_of_range = ++actual_position;
length_of_range -= half_length + 1;
}
else
length_of_range = half_length;
}
return start_of_range;
};
__global__ void kernel_spike_generation(
float* __restrict__ input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
float* __restrict__ random_values_pointer,
size_t random_values_dim_c0,
size_t random_values_dim_c1,
size_t random_values_dim_c2,
int64_t* __restrict__ output_pointer,
size_t output_dim_c0,
size_t output_dim_c1,
size_t output_dim_c2,
size_t x_dim,
size_t y_dim,
size_t spike_dim,
size_t h_dim,
size_t max_threadable_tasks)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < max_threadable_tasks)
{
size_t pattern_id = idx / spike_dim;
size_t position_spike = idx - (pattern_id * spike_dim);
size_t position_x = blockIdx.y;
size_t position_y = blockIdx.z;
float* p_ptr = input_pointer + pattern_id * input_dim_c0 +
position_x * input_dim_c2 + position_y;
int64_t* out_ptr = output_pointer + pattern_id * output_dim_c0 +
position_x * output_dim_c2 + position_y
+ position_spike * output_dim_c1;
float* rand_ptr = random_values_pointer +
pattern_id * random_values_dim_c0 +
position_x * random_values_dim_c2 + position_y
+ position_spike * random_values_dim_c1;
*out_ptr = gpu_lower_bound(p_ptr,
h_dim,
input_dim_c1,
*rand_ptr);
}
};
bool SpikeGeneration2DManyIP::gpu_spike_generation(
float* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
float* random_values_pointer,
size_t random_values_dim_c0,
size_t random_values_dim_c1,
size_t random_values_dim_c2,
int64_t* output_pointer,
size_t output_dim_c0,
size_t output_dim_c1,
size_t output_dim_c2,
size_t x_dim,
size_t y_dim,
size_t spike_dim,
size_t h_dim,
size_t number_of_pattern)
{
cudaError_t status;
assert((x_dim < 65535));
assert((y_dim < 65535));
// // //////////////////////////////////////
// // Get infos about the device
// // //////////////////////////////////////
// int device;
// cudaDeviceProp prop;
// status = cudaGetDevice(&device);
// assert((status == cudaSuccess));
// // std::cout << "Device ID: " << device << std::endl;
// status = cudaGetDeviceProperties(&prop, device);
// assert((status == cudaSuccess));
// // std::cout << "Device name: " << prop.name << std::endl;
// //////////////////////////////////////
// Calculate the distribution on the GPU
// //////////////////////////////////////
int min_grid_size;
int block_size;
int grid_size;
size_t dynamic_s_mem_size = 0;
size_t max_threadable_tasks = number_of_pattern * spike_dim * x_dim * y_dim;
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=blocksize#occupancy-calculator
status = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
(void*)kernel_spike_generation,
dynamic_s_mem_size, max_threadable_tasks);
assert((status == cudaSuccess));
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
// Maximum dimensionality of grid of thread blocks: 3
// Maximum x -dimension of a grid of thread blocks: (2^31)-1
// Maximum y- or z-dimension of a grid of thread blocks: 65535
// Round up according to array size
// (I will separate x and y into other grid dimentsions soon)
grid_size = ((number_of_pattern * spike_dim) + block_size - 1) / block_size;
// std::cout << min_grid_size << std::endl;
// std::cout << grid_size << std::endl;
// std::cout << block_size << std::endl;
// std::cout << max_threadable_tasks << std::endl;
dim3 grid(grid_size, x_dim, y_dim);
kernel_spike_generation<<<grid, block_size >>>(
input_pointer,
input_dim_c0,
input_dim_c1,
input_dim_c2,
random_values_pointer,
random_values_dim_c0,
random_values_dim_c1,
random_values_dim_c2,
output_pointer,
output_dim_c0,
output_dim_c1,
output_dim_c2,
x_dim,
y_dim,
spike_dim,
h_dim,
(number_of_pattern * spike_dim));
cudaDeviceSynchronize();
return true;
};

View file

@ -0,0 +1,68 @@
#ifndef SRC_SPIKEGENERATION2DMANYIP_H_
#define SRC_SPIKEGENERATION2DMANYIP_H_
#include <unistd.h>
#include <cctype>
#include <iostream>
class SpikeGeneration2DManyIP
{
public:
SpikeGeneration2DManyIP();
~SpikeGeneration2DManyIP();
bool spike_generation_entrypoint(
int64_t input_pointer_addr, int64_t input_dim_0,
int64_t input_dim_1, int64_t input_dim_2, int64_t input_dim_3,
int64_t random_values_pointer_addr, int64_t random_values_dim_0,
int64_t random_values_dim_1, int64_t random_values_dim_2,
int64_t random_values_dim_3, int64_t output_pointer_addr,
int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2,
int64_t output_dim_3, int64_t number_of_cpu_processes);
bool spike_generation(
float* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
float* random_values_pointer,
size_t random_values_dim_c0,
size_t random_values_dim_c1,
size_t random_values_dim_c2,
int64_t* output_pointer,
size_t output_dim_c0,
size_t output_dim_c1,
size_t output_dim_c2,
size_t x_dim,
size_t y_dim,
size_t spike_dim,
size_t h_dim,
size_t pattern_id);
bool gpu_spike_generation(
float* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
float* random_values_pointer,
size_t random_values_dim_c0,
size_t random_values_dim_c1,
size_t random_values_dim_c2,
int64_t* output_pointer,
size_t output_dim_c0,
size_t output_dim_c1,
size_t output_dim_c2,
size_t x_dim,
size_t y_dim,
size_t spike_dim,
size_t h_dim,
size_t number_of_pattern);
private:
size_t lower_bound(float* data_ptr, size_t data_length,
size_t data_ptr_stride,
float compare_to_value);
};
#endif /* SRC_SPIKEGENERATION2DMANYIP_H_ */

View file

@ -0,0 +1,138 @@
#include <unistd.h>
#include <bitset>
#include <cassert>
#include <cctype>
#include "error_term.cpp"
// Best way to plot the bits
// std::cout << std::bitset<32>(ap_y_ptr[1]) << "\n";
// The result needs to be written back into h_pointer (which contains h)
// Don't write to w_pointer.
void approximation_multiplication_function(
float *h_pointer, float *w_pointer, int64_t pattern_length,
uint64_t number_of_trunc_bits, uint64_t number_of_frac_bits,
uint32_t *ap_x_ptr, uint32_t *ap_y_ptr, uint32_t *ap_x_exponent_ptr,
uint32_t *ap_y_exponent_ptr, uint32_t *ap_h_exponent_ptr, uint32_t ap_mask,
uint64_t *ap_res_ptr, uint32_t *sign_temp_ptr, bool approximation_enable) {
uint64_t counter;
uint32_t *w_pointer_mod = (uint32_t *)w_pointer;
uint32_t *h_pointer_mod = (uint32_t *)h_pointer;
// Calculate the new sign
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
sign_temp_ptr[counter] = (w_pointer_mod[counter] & 0x80000000) ^
(h_pointer_mod[counter] & 0x80000000);
}
// Extract the exponent
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_x_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
}
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_y_exponent_ptr[counter] = (w_pointer_mod[counter] << 1) >> 24;
}
// Cast and "normalize"
uint64_t shift_value = 32 - number_of_frac_bits;
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_x_ptr[counter] =
((h_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
}
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_y_ptr[counter] =
((w_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
}
// Make the zero -g-r-e-a-t- correct again
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
if (h_pointer[counter] == 0) {
ap_x_ptr[counter] = 0;
}
}
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
if (w_pointer[counter] == 0) {
ap_y_ptr[counter] = 0;
}
}
// res = x*y
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_res_ptr[counter] = static_cast<uint64_t>(ap_x_ptr[counter]) * static_cast<uint64_t>(ap_y_ptr[counter]);
}
uint32_t temp;
if (approximation_enable == true){
// Go through the vector values
for (counter = 0; counter < pattern_length; counter++) {
temp = error_term(ap_y_ptr[counter], ap_x_ptr[counter], ap_mask,
number_of_trunc_bits);
if (temp > ap_res_ptr[counter]) {
ap_res_ptr[counter] = 0;
} else {
ap_res_ptr[counter] -= temp;
}
}
}
// Cast from int to float
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
h_pointer[counter] = static_cast<float>(ap_res_ptr[counter]);
}
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_h_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
}
// devide by the 2^number_of_frac_bits
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_h_exponent_ptr[counter] -= 2 * number_of_frac_bits;
}
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
temp = ap_x_exponent_ptr[counter] + ap_y_exponent_ptr[counter] +
ap_h_exponent_ptr[counter];
if (temp > 252) {
ap_h_exponent_ptr[counter] = temp - 252;
} else {
// Here I try to catch the case that the new exponent is too small
ap_h_exponent_ptr[counter] = 0;
}
}
// Remove the old exponent
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
h_pointer_mod[counter] = (h_pointer_mod[counter] << 9) >> 9;
}
// Install the new exponent
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
h_pointer_mod[counter] += ap_h_exponent_ptr[counter] << 23;
}
// Add the sign back
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
h_pointer_mod[counter] += sign_temp_ptr[counter];
}
return;
}

View file

@ -0,0 +1,28 @@
#include <unistd.h>
#include <cassert>
#include <cctype>
uint32_t error_term(uint32_t a, uint32_t b, uint32_t ap_mask,
uint32_t number_of_trunc_bits) {
uint32_t error_value = 0;
uint32_t temp_shift_a = a;
uint32_t temp_shift_b = b & ap_mask;
uint32_t counter_trunc;
uint32_t temp;
// Go through the bits
for (counter_trunc = 0; counter_trunc < number_of_trunc_bits;
counter_trunc++) {
temp = temp_shift_a & 1;
if (temp == 1) {
error_value += temp_shift_b & ap_mask;
}
temp_shift_a >>= 1;
temp_shift_b <<= 1;
}
return error_value;
}

View file

@ -0,0 +1,102 @@
#include "gpu_error_term.cu"
__device__ float gpu_approximation_multiplication_function(
float weight,
float input,
uint64_t number_of_frac_bits,
bool approximation_enable,
uint64_t number_of_trunc_bits,
uint32_t ap_mask)
{
float weight_copy = weight;
float input_copy = input;
uint32_t *weight_pointer_mod = (uint32_t *)&weight_copy;
uint32_t *input_pointer_mod = (uint32_t *)&input_copy;
// Calculate the new sign
uint32_t sign_temp = (*weight_pointer_mod & 0x80000000) ^
(*input_pointer_mod & 0x80000000);
// Extract the exponent
uint32_t ap_input_exponent = (*input_pointer_mod << 1) >> 24;
uint32_t ap_weight_exponent = (*weight_pointer_mod << 1) >> 24;
// Cast and "normalize"
uint64_t shift_value = 32 - number_of_frac_bits;
uint32_t ap_input_mantissa =
((*input_pointer_mod << 8) | 0x80000000) >> shift_value;
uint32_t ap_weight_mantissa =
((*weight_pointer_mod << 8) | 0x80000000) >> shift_value;
// Make the zero -g-r-e-a-t- correct again
if (input == 0)
{
ap_input_mantissa = 0;
}
if (weight == 0)
{
ap_weight_mantissa = 0;
}
// res = x*y
uint64_t ap_result = static_cast<uint64_t>(ap_input_mantissa) * static_cast<uint64_t>(ap_weight_mantissa);
uint32_t temp;
// --------------------------------------------
// Approx
// --------------------------------------------
if (approximation_enable == true)
{
// Go through the vector values
temp = gpu_error_term(ap_weight_mantissa, ap_input_mantissa, ap_mask,
number_of_trunc_bits);
if (temp > ap_result)
{
ap_result = 0;
}
else
{
ap_result -= temp;
}
}
// Cast from int to float
float output = static_cast<float>(ap_result);
if (ap_result == 0)
{
output = 0.0;
}
else
{
uint32_t *output_pointer_mod = (uint32_t *)&output;
uint32_t ap_output_exponent = (*output_pointer_mod << 1) >> 24;
ap_output_exponent -= 2 * number_of_frac_bits;
temp = ap_input_exponent + ap_weight_exponent + ap_output_exponent;
if (temp > 252)
{
ap_output_exponent = temp - 252;
}
else
{
// Here I try to catch the case that the new exponent is too small
ap_output_exponent = 0;
}
// Remove the old exponent
*output_pointer_mod = (*output_pointer_mod << 9) >> 9;
// Install the new exponent
*output_pointer_mod += ap_output_exponent << 23;
// Add the sign back
*output_pointer_mod += sign_temp;
}
return output;
};

View file

@ -0,0 +1,28 @@
__device__ uint32_t gpu_error_term(uint32_t ap_weight_mantissa,
uint32_t ap_input_mantissa,
uint32_t ap_mask,
uint32_t number_of_trunc_bits)
{
uint32_t error_value = 0;
uint32_t temp_shift_a = ap_weight_mantissa;
uint32_t temp_shift_b = ap_input_mantissa & ap_mask;
uint32_t counter_trunc;
uint32_t temp;
// Go through the bits
for (counter_trunc = 0; counter_trunc < number_of_trunc_bits;
counter_trunc++)
{
temp = temp_shift_a & 1;
if (temp == 1)
{
error_value += temp_shift_b & ap_mask;
}
temp_shift_a >>= 1;
temp_shift_b <<= 1;
}
return error_value;
}

View file

@ -0,0 +1,23 @@
# %%
# pip install pybind11-stubgen
from pybind11_stubgen import ModuleStubsGenerator # type: ignore
import glob
def process(module_name: str) -> None:
module = ModuleStubsGenerator(module_name)
module.parse()
module.write_setup_py = False
with open(module_name + ".pyi", "w") as fp:
fp.write("#\n# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!\n#\n\n")
fp.write("\n".join(module.to_lines()))
Files = glob.glob("*.so")
for fid in Files:
Idx: int = fid.find(".")
module_name: str = fid[:Idx]
print("Processing: " + module_name)
process(module_name)

View file

@ -0,0 +1,339 @@
#include "HDynamicCNNManyIP.h"
#include <omp.h>
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <cassert>
#include <iostream>
HDynamicCNNManyIP::HDynamicCNNManyIP()
{
};
HDynamicCNNManyIP::~HDynamicCNNManyIP()
{
};
bool HDynamicCNNManyIP::update_entrypoint(
int64_t h_pointer_addr,
int64_t h_dim_0,
int64_t h_dim_1,
int64_t h_dim_2,
int64_t h_dim_3,
int64_t epsilon_xy_pointer_addr,
int64_t epsilon_xy_dim_0,
int64_t epsilon_xy_dim_1,
int64_t epsilon_xy_dim_2,
int64_t epsilon_t_pointer_addr,
int64_t epsilon_t_dim_0,
int64_t weights_pointer_addr,
int64_t weights_dim_0,
int64_t weights_dim_1,
int64_t input_pointer_addr,
int64_t input_dim_0,
int64_t input_dim_1,
int64_t input_dim_2,
int64_t input_dim_3,
int64_t init_vector_pointer_addr,
int64_t init_vector_dim_0,
int64_t number_of_processes,
float forgetting_offset,
int64_t gpu_tuning_factor)
{
size_t number_of_pattern = input_dim_0;
size_t h_dim = init_vector_dim_0;
float* h_init_ptr = (float*)init_vector_pointer_addr;
assert((h_init_ptr != nullptr));
assert((h_dim > 0));
float* h_pointer = (float*)h_pointer_addr;
assert((h_pointer != nullptr));
assert((h_dim_0 > 0));
assert((h_dim_1 > 0));
assert((h_dim_2 > 0));
assert((h_dim_3 > 0));
size_t h_dim_c0 = h_dim_1 * h_dim_2 * h_dim_3;
size_t h_dim_c1 = h_dim_2 * h_dim_3;
size_t h_dim_c2 = h_dim_3;
float* epsilon_xy_pointer = (float*)epsilon_xy_pointer_addr;
assert((epsilon_xy_pointer != nullptr));
assert((epsilon_xy_dim_0 > 0));
assert((epsilon_xy_dim_1 > 0));
size_t epsilon_xy_dim_c0 = epsilon_xy_dim_2 * epsilon_xy_dim_1;
size_t epsilon_xy_dim_c1 = epsilon_xy_dim_2;
float* epsilon_t_pointer = (float*)epsilon_t_pointer_addr;
assert((epsilon_t_pointer != nullptr));
assert((epsilon_t_dim_0 > 0));
float* weights_pointer = (float*)weights_pointer_addr;
assert((weights_pointer != nullptr));
assert((weights_dim_0 > 0));
assert((weights_dim_1 > 0));
size_t weights_dim_c0 = weights_dim_1;
int64_t* input_pointer = (int64_t*)input_pointer_addr;
assert((input_pointer != nullptr));
assert((input_dim_0 > 0));
assert((input_dim_1 > 0));
assert((input_dim_2 > 0));
assert((input_dim_3 > 0));
size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
size_t input_dim_c1 = input_dim_2 * input_dim_3;
size_t input_dim_c2 = input_dim_3;
assert((h_dim == weights_dim_1));
size_t number_of_spikes = input_dim_1;
size_t dim_x = input_dim_2;
size_t dim_y = input_dim_3;
float forgetting_offset_local = forgetting_offset / static_cast<float>(h_dim);
// --------------------
if (number_of_processes > 0)
{
omp_set_num_threads(number_of_processes);
size_t pattern_id;
#pragma omp parallel for
for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
{
update(
h_init_ptr,
h_pointer,
h_dim_c0,
h_dim_c1,
h_dim_c2,
h_dim,
epsilon_xy_pointer,
epsilon_xy_dim_c0,
epsilon_xy_dim_c1,
epsilon_t_pointer,
weights_pointer,
weights_dim_c0,
input_pointer,
input_dim_c0,
input_dim_c1,
input_dim_c2,
number_of_spikes,
dim_x,
dim_y,
forgetting_offset,
forgetting_offset_local,
pattern_id);
}
}
else
{
std::cout << "Error: number_of_processes <= 0" << std::endl;
return false;
}
return true;
};
bool HDynamicCNNManyIP::update(
float* h_init_ptr,
float* h_pointer,
size_t h_dim_c0,
size_t h_dim_c1,
size_t h_dim_c2,
size_t h_dim,
float* epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
size_t epsilon_xy_dim_c1,
float* epsilon_t_pointer,
float* weights_pointer,
size_t weights_dim_c0,
int64_t* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
size_t number_of_spikes,
size_t dim_x,
size_t dim_y,
float forgetting_offset,
float forgetting_offset_local,
size_t pattern_id)
{
float* h_ptr;
float* epsilon_xy_ptr;
int64_t* input_ptr;
size_t counter_x;
size_t counter_y;
for (counter_x = 0; counter_x < dim_x; counter_x++)
{
for (counter_y = 0; counter_y < dim_y; counter_y++)
{
epsilon_xy_ptr = epsilon_xy_pointer +
counter_x * epsilon_xy_dim_c1 + counter_y;
h_ptr = h_pointer +
pattern_id * h_dim_c0 + counter_x * h_dim_c2 + counter_y;
input_ptr = input_pointer +
pattern_id * input_dim_c0 + counter_x * input_dim_c2 + counter_y;
update_one_ip(
h_init_ptr,
h_ptr,
h_dim_c1,
h_dim,
weights_pointer,
weights_dim_c0,
input_ptr,
input_dim_c1,
epsilon_xy_ptr,
epsilon_xy_dim_c0,
epsilon_t_pointer,
number_of_spikes,
forgetting_offset,
forgetting_offset_local);
}
}
return true;
};
void HDynamicCNNManyIP::update_one_ip(
float* h_init_ptr,
float* h_pointer,
size_t h_dim_c1,
size_t h_dim,
float* weights_pointer,
size_t weights_dim_c0,
int64_t* input_pointer,
size_t input_dim_c1,
float* epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
float* epsilon_t_pointer,
size_t number_of_spikes,
float forgetting_offset,
float forgetting_offset_local)
{
float* h_temp = new float[h_dim];
float* h_subsegment = new float[h_dim];
memcpy(h_subsegment, h_init_ptr, sizeof(float) * h_dim);
size_t counter_spike;
size_t counter;
float h_temp_sum;
float temp_value;
float epsilon_subsegment;
float epsilon_scale = 1.0;
int64_t* spike;
float* w_ptr;
for (counter_spike = 0; counter_spike < number_of_spikes; counter_spike++)
{
if (epsilon_scale > 1E10)
{
temp_value = 1.0 / epsilon_scale;
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_subsegment[counter] *= temp_value;
}
epsilon_scale = 1.0;
}
spike = input_pointer + counter_spike * input_dim_c1;
if (*spike >= 0)
{
epsilon_subsegment =
epsilon_xy_pointer[*spike *epsilon_xy_dim_c0] * epsilon_t_pointer[counter_spike];
w_ptr = weights_pointer + *spike * weights_dim_c0;
memcpy(h_temp, h_subsegment, sizeof(float) * h_dim);
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_temp[counter] *= w_ptr[counter];
}
h_temp_sum = 0.0;
#pragma omp simd reduction(+ : h_temp_sum)
for (counter = 0; counter < h_dim; counter++)
{
h_temp_sum += h_temp[counter];
}
if (h_temp_sum > 1E-10)
{
temp_value = epsilon_scale * epsilon_subsegment / h_temp_sum;
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_temp[counter] *= temp_value;
}
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_subsegment[counter] += h_temp[counter];
}
if (forgetting_offset_local > 0.0)
{
temp_value =
epsilon_scale * epsilon_subsegment * forgetting_offset_local;
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_subsegment[counter] += temp_value;
}
epsilon_scale *=
1.0 + epsilon_subsegment * (1.0 + forgetting_offset);
}
else
{
epsilon_scale *= 1.0 + epsilon_subsegment * 1.0;
}
}
}
}
temp_value = 1.0 / epsilon_scale;
#pragma omp simd
for (counter = 0; counter < h_dim; counter++)
{
h_pointer[counter * h_dim_c1] =
h_subsegment[counter] * temp_value;
}
delete[] h_temp;
delete[] h_subsegment;
return;
};

View file

@ -0,0 +1,85 @@
#ifndef SRC_HDYNAMICCNNMANYIP_H_
#define SRC_HDYNAMICCNNMANYIP_H_
#include <unistd.h>
#include <cctype>
#include <iostream>
class HDynamicCNNManyIP
{
public:
HDynamicCNNManyIP();
~HDynamicCNNManyIP();
bool update_entrypoint(
int64_t h_pointer_addr,
int64_t h_dim_0,
int64_t h_dim_1,
int64_t h_dim_2,
int64_t h_dim_3,
int64_t epsilon_xy_pointer_addr,
int64_t epsilon_xy_dim_0,
int64_t epsilon_xy_dim_1,
int64_t epsilon_xy_dim_2,
int64_t epsilon_t_pointer_addr,
int64_t epsilon_t_dim_0,
int64_t weights_pointer_addr,
int64_t weights_dim_0,
int64_t weights_dim_1,
int64_t input_pointer_addr,
int64_t input_dim_0,
int64_t input_dim_1,
int64_t input_dim_2,
int64_t input_dim_3,
int64_t init_vector_pointer_addr,
int64_t init_vector_dim_0,
int64_t number_of_processes,
float forgetting_offset,
int64_t gpu_tuning_factor);
private:
bool update(
float* h_init_ptr,
float* h_pointer,
size_t h_dim_c0,
size_t h_dim_c1,
size_t h_dim_c2,
size_t h_dim,
float* epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
size_t epsilon_xy_dim_c1,
float* epsilon_t_pointer,
float* weights_pointer,
size_t weights_dim_c0,
int64_t* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
size_t number_of_spikes,
size_t dim_x,
size_t dim_y,
float forgetting_offset,
float forgetting_offset_local,
size_t pattern_id);
void update_one_ip(
float* h_init_ptr,
float* h_pointer,
size_t h_dim_c1,
size_t h_dim,
float* weights_pointer,
size_t weights_dim_c0,
int64_t* input_pointer,
size_t input_dim_c1,
float* epsilon_xy_pointer,
size_t epsilon_xy_dim_c0,
float* epsilon_t_pointer,
size_t number_of_spikes,
float forgetting_offset,
float forgetting_offset_local);
};
#endif /* SRC_HDYNAMICCNNMANYIP_H_ */

View file

@ -0,0 +1,64 @@
# Change to your python bin directory (tested with Python 3.10.4)
PYBIN=~/P3.10GPU/bin/
CC=/usr/lib64/ccache/clang++
PYBIND11INCLUDE=`$(PYBIN)python3 -m pybind11 --includes`
PARAMETERS_O= -O3 -std=c++14 $(PYBIND11INCLUDE) -fPIC -Wall -fopenmp=libomp
PARAMETERS_Linker=-shared -lm -lomp -lstdc++ -Wall
PYPOSTFIX=`$(PYBIN)python3-config --extension-suffix`
all: PyHDynamicCNNManyIP \
PySpikeGeneration2DManyIP \
PyMultiApp
#######################
HDynamicCNNManyIP.o: HDynamicCNNManyIP.h HDynamicCNNManyIP.cpp
$(CC) $(PARAMETERS_O) -c HDynamicCNNManyIP.cpp -o HDynamicCNNManyIP.o
PyHDynamicCNNManyIP.o: HDynamicCNNManyIP.h PyHDynamicCNNManyIP.cpp
$(CC) $(PARAMETERS_O) -c PyHDynamicCNNManyIP.cpp -o PyHDynamicCNNManyIP.o
PyHDynamicCNNManyIP: HDynamicCNNManyIP.o PyHDynamicCNNManyIP.o
$(CC) $(PARAMETERS_Linker) -o PyHDynamicCNNManyIP HDynamicCNNManyIP.o PyHDynamicCNNManyIP.o
cp PyHDynamicCNNManyIP PyHDynamicCNNManyIP$(PYPOSTFIX)
$(PYBIN)python3 pybind11_auto_pyi.py
#######################
SpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h SpikeGeneration2DManyIP.cpp
$(CC) $(PARAMETERS_O) -c SpikeGeneration2DManyIP.cpp -o SpikeGeneration2DManyIP.o
PySpikeGeneration2DManyIP.o: SpikeGeneration2DManyIP.h PySpikeGeneration2DManyIP.cpp
$(CC) $(PARAMETERS_O) -c PySpikeGeneration2DManyIP.cpp -o PySpikeGeneration2DManyIP.o
PySpikeGeneration2DManyIP: SpikeGeneration2DManyIP.o PySpikeGeneration2DManyIP.o
$(CC) $(PARAMETERS_Linker) -o PySpikeGeneration2DManyIP SpikeGeneration2DManyIP.o PySpikeGeneration2DManyIP.o
cp PySpikeGeneration2DManyIP PySpikeGeneration2DManyIP$(PYPOSTFIX)
$(PYBIN)python3 pybind11_auto_pyi.py
#######################
MultiApp.o: MultiApp.h MultiApp.cpp approximation_multiplication_function.cpp \
error_term.cpp
$(CC) $(PARAMETERS_O) -c MultiApp.cpp -o MultiApp.o
PyMultiApp.o: MultiApp.h PyMultiApp.cpp
$(CC) $(PARAMETERS_O) -c PyMultiApp.cpp -o PyMultiApp.o
PyMultiApp: MultiApp.o PyMultiApp.o
$(CC) $(PARAMETERS_Linker) -o PyMultiApp MultiApp.o PyMultiApp.o
cp PyMultiApp PyMultiApp$(PYPOSTFIX)
$(PYBIN)python3 pybind11_auto_pyi.py
#######################
clean:
rm -f PyHDynamicCNNManyIP
rm -f PySpikeGeneration2DManyIP
rm -f PyMultiApp
rm -f *.o
rm -f *.so

View file

@ -0,0 +1,187 @@
#include "MultiApp.h"
#include <omp.h>
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <vector>
#include "approximation_multiplication_function.cpp"
MultiApp::MultiApp()
{
};
MultiApp::~MultiApp()
{
};
bool MultiApp::update(float* np_input_pointer,
float* np_weight_pointer,
float* np_output_pointer, int64_t pattern_dim,
int64_t feature_dim, int64_t x_dim, int64_t y_dim,
int64_t input_channel_dim, int64_t id_pattern,
bool approximation_enable, int64_t number_of_trunc_bits,
int64_t number_of_frac_bits)
{
assert((id_pattern >= 0));
assert((id_pattern < pattern_dim));
float* np_input_pointer_pattern;
float* np_output_pointer_pattern;
float* input_ptr;
float* output_ptr;
float* w_ptr;
uint64_t pattern_size = input_channel_dim;
std::vector<float> ap_h_vector;
ap_h_vector.resize(pattern_size);
float* ap_h_ptr = ap_h_vector.data();
std::vector<uint32_t> ap_x_vector;
ap_x_vector.resize(pattern_size);
uint32_t* ap_x_ptr = ap_x_vector.data();
std::vector<uint32_t> ap_y_vector;
ap_y_vector.resize(pattern_size);
uint32_t* ap_y_ptr = ap_y_vector.data();
std::vector<uint32_t> ap_x_exponent_vector;
ap_x_exponent_vector.resize(pattern_size);
uint32_t* ap_x_exponent_ptr = ap_x_exponent_vector.data();
std::vector<uint32_t> ap_y_exponent_vector;
ap_y_exponent_vector.resize(pattern_size);
uint32_t* ap_y_exponent_ptr = ap_y_exponent_vector.data();
std::vector<uint32_t> ap_h_exponent_vector;
ap_h_exponent_vector.resize(pattern_size);
uint32_t* ap_h_exponent_ptr = ap_h_exponent_vector.data();
std::vector<uint64_t> ap_res_vector;
ap_res_vector.resize(pattern_size);
uint64_t* ap_res_ptr = ap_res_vector.data();
uint32_t ap_mask = static_cast<uint64_t>(pow(2, number_of_trunc_bits)) - 1;
std::vector<uint32_t> sign_temp_vector;
sign_temp_vector.resize(pattern_size);
uint32_t* sign_temp_ptr = sign_temp_vector.data();
uint64_t input_pattern_size = input_channel_dim * x_dim * y_dim;
uint64_t output_pattern_size = feature_dim * x_dim * y_dim;
np_input_pointer_pattern = np_input_pointer + id_pattern * input_pattern_size;
np_output_pointer_pattern =
np_output_pointer + id_pattern * output_pattern_size;
uint64_t counter;
uint64_t counter_x;
uint64_t counter_y;
uint64_t counter_feature;
uint64_t pos_xy;
uint64_t pos_xy_if;
float temp_sum;
uint64_t pattern_c_2 = x_dim * y_dim;
for (counter_x = 0; counter_x < x_dim; counter_x++)
{
for (counter_y = 0; counter_y < y_dim; counter_y++)
{
pos_xy = counter_y + counter_x * y_dim;
for (counter_feature = 0; counter_feature < feature_dim;
counter_feature++)
{
pos_xy_if = counter_feature * pattern_c_2 + pos_xy;
input_ptr = np_input_pointer_pattern + pos_xy;
output_ptr = np_output_pointer_pattern + pos_xy_if;
w_ptr = np_weight_pointer + counter_feature * input_channel_dim;
#pragma omp simd
for (counter = 0; counter < pattern_size; counter++)
{
ap_h_ptr[counter] = input_ptr[counter * pattern_c_2];
}
approximation_multiplication_function(
ap_h_ptr, w_ptr, pattern_size, number_of_trunc_bits,
number_of_frac_bits, ap_x_ptr, ap_y_ptr, ap_x_exponent_ptr,
ap_y_exponent_ptr, ap_h_exponent_ptr, ap_mask, ap_res_ptr,
sign_temp_ptr, approximation_enable);
temp_sum = 0.0;
#pragma omp simd reduction(+ \
: temp_sum)
for (counter = 0; counter < pattern_size; counter++)
{
temp_sum += ap_h_ptr[counter];
}
output_ptr[0] = temp_sum;
}
}
}
return true;
};
bool MultiApp::update_with_init_vector_multi_pattern(
int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr,
int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim,
int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
int64_t number_of_processes, bool approximation_enable,
int64_t number_of_trunc_bits, int64_t number_of_frac)
{
int64_t number_of_pattern = pattern_dim;
int64_t pattern_id;
float* np_input_pointer = (float*)np_input_pointer_addr;
float* np_weight_pointer = (float*)np_weight_pointer_addr;
float* np_output_pointer = (float*)np_output_pointer_addr;
assert((np_input_pointer != nullptr));
assert((np_output_pointer != nullptr));
assert((np_weight_pointer != nullptr));
assert((pattern_dim > 0));
assert((feature_dim > 0));
assert((x_dim > 0));
assert((y_dim > 0));
assert((input_channel_dim > 0));
if (number_of_processes > 0)
{
omp_set_num_threads(number_of_processes);
// For debugging: Only one thread
// omp_set_num_threads(1);
#pragma omp parallel for
for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
{
update(np_input_pointer, np_weight_pointer,
np_output_pointer, pattern_dim, feature_dim, x_dim, y_dim,
input_channel_dim, pattern_id, approximation_enable,
number_of_trunc_bits, number_of_frac);
}
}
else
{
std::cout << "Error: number_of_processes <= 0" << std::endl;
return false;
}
return true;
};

View file

@ -0,0 +1,32 @@
#ifndef SRC_MultiApp_H_
#define SRC_MultiApp_H_
#include <unistd.h>
#include <cctype>
#include <iostream>
class MultiApp
{
public:
MultiApp();
~MultiApp();
bool update(float* np_input_pointer, float* np_weight_pointer,
float* np_output_pointer, int64_t pattern_dim,
int64_t feature_dim, int64_t x_dim, int64_t y_dim,
int64_t input_channel_dim, int64_t id_pattern,
bool approximation_enable, int64_t number_of_trunc_bits,
int64_t number_of_frac);
bool update_with_init_vector_multi_pattern(
int64_t np_input_pointer_addr, int64_t np_weight_pointer_addr,
int64_t np_output_pointer_addr, int64_t pattern_dim, int64_t feature_dim,
int64_t x_dim, int64_t y_dim, int64_t input_channel_dim,
int64_t number_of_processes, bool approximation_enable,
int64_t number_of_trunc_bits, int64_t number_of_frac);
private:
};
#endif /* SRC_MultiApp_H_ */

View file

@ -0,0 +1,14 @@
#include <pybind11/pybind11.h>
#include "HDynamicCNNManyIP.h"
namespace py = pybind11;
PYBIND11_MODULE(PyHDynamicCNNManyIP, m)
{
m.doc() = "HDynamicCNNManyIP Module";
py::class_<HDynamicCNNManyIP>(m, "HDynamicCNNManyIP")
.def(py::init<>())
.def("update",
&HDynamicCNNManyIP::update_entrypoint);
}

View file

@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#
"""HDynamicCNNManyIP Module"""
from __future__ import annotations
import PyHDynamicCNNManyIP
import typing
__all__ = [
"HDynamicCNNManyIP"
]
class HDynamicCNNManyIP():
def __init__(self) -> None: ...
def update(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int, arg16: int, arg17: int, arg18: int, arg19: int, arg20: int, arg21: int, arg22: float, arg23: int) -> bool: ...
pass

View file

@ -0,0 +1,14 @@
#include <pybind11/pybind11.h>
#include "MultiApp.h"
namespace py = pybind11;
PYBIND11_MODULE(PyMultiApp, m) {
m.doc() = "MultiApp Module";
py::class_<MultiApp>(m, "MultiApp")
.def(py::init<>())
.def("update_with_init_vector_multi_pattern",
&MultiApp::update_with_init_vector_multi_pattern);
}

View file

@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#
"""MultiApp Module"""
from __future__ import annotations
import PyMultiApp
import typing
__all__ = [
"MultiApp"
]
class MultiApp():
def __init__(self) -> None: ...
def update_with_init_vector_multi_pattern(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: bool, arg10: int, arg11: int) -> bool: ...
pass

View file

@ -0,0 +1,15 @@
#include <pybind11/pybind11.h>
#include "SpikeGeneration2DManyIP.h"
namespace py = pybind11;
PYBIND11_MODULE(PySpikeGeneration2DManyIP, m)
{
m.doc() = "SpikeGeneration2DManyIP Module";
py::class_<SpikeGeneration2DManyIP>(m, "SpikeGeneration2DManyIP")
.def(py::init<>())
.def("spike_generation",
&SpikeGeneration2DManyIP::spike_generation_entrypoint);
}

View file

@ -0,0 +1,18 @@
#
# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!
#
"""SpikeGeneration2DManyIP Module"""
from __future__ import annotations
import PySpikeGeneration2DManyIP
import typing
__all__ = [
"SpikeGeneration2DManyIP"
]
class SpikeGeneration2DManyIP():
def __init__(self) -> None: ...
def spike_generation(self, arg0: int, arg1: int, arg2: int, arg3: int, arg4: int, arg5: int, arg6: int, arg7: int, arg8: int, arg9: int, arg10: int, arg11: int, arg12: int, arg13: int, arg14: int, arg15: int) -> bool: ...
pass

View file

@ -0,0 +1,205 @@
#include "SpikeGeneration2DManyIP.h"
#include <omp.h>
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <cassert>
#include <iostream>
SpikeGeneration2DManyIP::SpikeGeneration2DManyIP()
{
};
SpikeGeneration2DManyIP::~SpikeGeneration2DManyIP()
{
};
bool SpikeGeneration2DManyIP::spike_generation_entrypoint(
int64_t input_pointer_addr, int64_t input_dim_0,
int64_t input_dim_1, int64_t input_dim_2, int64_t input_dim_3,
int64_t random_values_pointer_addr, int64_t random_values_dim_0,
int64_t random_values_dim_1, int64_t random_values_dim_2,
int64_t random_values_dim_3, int64_t output_pointer_addr,
int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2,
int64_t output_dim_3, int64_t number_of_cpu_processes)
{
float* input_pointer = (float*)input_pointer_addr;
float* random_values_pointer = (float*)random_values_pointer_addr;
int64_t* output_pointer = (int64_t*)output_pointer_addr;
// Input
assert((input_pointer != nullptr));
assert((input_dim_0 > 0));
assert((input_dim_1 > 0));
assert((input_dim_2 > 0));
assert((input_dim_3 > 0));
// Random
assert((random_values_pointer != nullptr));
assert((random_values_dim_0 > 0));
assert((random_values_dim_1 > 0));
assert((random_values_dim_2 > 0));
assert((random_values_dim_3 > 0));
// Output
assert((output_pointer != nullptr));
assert((output_dim_0 > 0));
assert((output_dim_1 > 0));
assert((output_dim_2 > 0));
assert((output_dim_3 > 0));
// Input
size_t input_dim_c0 = input_dim_1 * input_dim_2 * input_dim_3;
size_t input_dim_c1 = input_dim_2 * input_dim_3;
size_t input_dim_c2 = input_dim_3;
// Random
size_t random_values_dim_c0 =
random_values_dim_1 * random_values_dim_2 * random_values_dim_3;
size_t random_values_dim_c1 =
random_values_dim_2 * random_values_dim_3;
size_t random_values_dim_c2 = random_values_dim_3;
// Output
size_t output_dim_c0 =
output_dim_1 * output_dim_2 * output_dim_3;
size_t output_dim_c1 = output_dim_2 * output_dim_3;
size_t output_dim_c2 = output_dim_3;
size_t number_of_pattern = input_dim_0;
size_t h_dim = input_dim_1;
size_t spike_dim = output_dim_1;
size_t x_dim = output_dim_2;
size_t y_dim = output_dim_2;
if (number_of_cpu_processes > 0)
{
omp_set_num_threads(number_of_cpu_processes);
// DEBUG:
// omp_set_num_threads(1);
size_t pattern_id;
#pragma omp parallel for
for (pattern_id = 0; pattern_id < number_of_pattern; pattern_id++)
{
spike_generation(
input_pointer,
input_dim_c0,
input_dim_c1,
input_dim_c2,
random_values_pointer,
random_values_dim_c0,
random_values_dim_c1,
random_values_dim_c2,
output_pointer,
output_dim_c0,
output_dim_c1,
output_dim_c2,
x_dim,
y_dim,
spike_dim,
h_dim,
pattern_id);
}
}
else
{
std::cout << "Error: number_of_processes <= 0" << std::endl;
return false;
}
return true;
};
bool SpikeGeneration2DManyIP::spike_generation(
float* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
float* random_values_pointer,
size_t random_values_dim_c0,
size_t random_values_dim_c1,
size_t random_values_dim_c2,
int64_t* output_pointer,
size_t output_dim_c0,
size_t output_dim_c1,
size_t output_dim_c2,
size_t x_dim,
size_t y_dim,
size_t spike_dim,
size_t h_dim,
size_t pattern_id)
{
size_t counter;
size_t counter_x = 0;
size_t counter_y = 0;
float* p_ptr = nullptr;
int64_t* out_ptr = nullptr;
float* rand_ptr = nullptr;
for (counter_x = 0; counter_x < x_dim; counter_x++)
{
for (counter_y = 0; counter_y < y_dim; counter_y++)
{
p_ptr = input_pointer + pattern_id * input_dim_c0 +
counter_x * input_dim_c2 + counter_y;
// + counter * input_dim_c1
out_ptr = output_pointer + pattern_id * output_dim_c0 +
counter_x * output_dim_c2 + counter_y;
// + counter * output_dim_c1
rand_ptr = random_values_pointer +
pattern_id * random_values_dim_c0 +
counter_x * random_values_dim_c2 + counter_y;
// + counter * random_values_dim_c1
for (counter = 0; counter < spike_dim; counter++)
{
out_ptr[counter * output_dim_c1] = lower_bound(p_ptr,
h_dim,
input_dim_c1,
rand_ptr[counter * random_values_dim_c1]);
}
}
}
return true;
};
// algorithmic idea stolen from libc++
size_t SpikeGeneration2DManyIP::lower_bound(float* data_ptr,
size_t data_length,
size_t data_ptr_stride,
float compare_to_value)
{
size_t start_of_range = 0;
size_t length_of_range = data_length;
while (length_of_range != 0)
{
size_t half_length = length_of_range >> 1;
size_t actual_position = start_of_range + half_length;
if (data_ptr[actual_position * data_ptr_stride] < compare_to_value)
{
start_of_range = ++actual_position;
length_of_range -= half_length + 1;
}
else
length_of_range = half_length;
}
return start_of_range;
};

View file

@ -0,0 +1,49 @@
#ifndef SRC_SPIKEGENERATION2DMANYIP_H_
#define SRC_SPIKEGENERATION2DMANYIP_H_
#include <unistd.h>
#include <cctype>
#include <iostream>
class SpikeGeneration2DManyIP
{
public:
SpikeGeneration2DManyIP();
~SpikeGeneration2DManyIP();
bool spike_generation_entrypoint(
int64_t input_pointer_addr, int64_t input_dim_0,
int64_t input_dim_1, int64_t input_dim_2, int64_t input_dim_3,
int64_t random_values_pointer_addr, int64_t random_values_dim_0,
int64_t random_values_dim_1, int64_t random_values_dim_2,
int64_t random_values_dim_3, int64_t output_pointer_addr,
int64_t output_dim_0, int64_t output_dim_1, int64_t output_dim_2,
int64_t output_dim_3, int64_t number_of_cpu_processes);
bool spike_generation(
float* input_pointer,
size_t input_dim_c0,
size_t input_dim_c1,
size_t input_dim_c2,
float* random_values_pointer,
size_t random_values_dim_c0,
size_t random_values_dim_c1,
size_t random_values_dim_c2,
int64_t* output_pointer,
size_t output_dim_c0,
size_t output_dim_c1,
size_t output_dim_c2,
size_t x_dim,
size_t y_dim,
size_t spike_dim,
size_t h_dim,
size_t pattern_id);
private:
size_t lower_bound(float* data_ptr, size_t data_length,
size_t data_ptr_stride,
float compare_to_value);
};
#endif /* SRC_SPIKEGENERATION2DMANYIP_H_ */

View file

@ -0,0 +1,138 @@
#include <unistd.h>
#include <bitset>
#include <cassert>
#include <cctype>
#include "error_term.cpp"
// Best way to plot the bits
// std::cout << std::bitset<32>(ap_y_ptr[1]) << "\n";
// The result needs to be written back into h_pointer (which contains h)
// Don't write to w_pointer.
void approximation_multiplication_function(
float *h_pointer, float *w_pointer, int64_t pattern_length,
uint64_t number_of_trunc_bits, uint64_t number_of_frac_bits,
uint32_t *ap_x_ptr, uint32_t *ap_y_ptr, uint32_t *ap_x_exponent_ptr,
uint32_t *ap_y_exponent_ptr, uint32_t *ap_h_exponent_ptr, uint32_t ap_mask,
uint64_t *ap_res_ptr, uint32_t *sign_temp_ptr, bool approximation_enable) {
uint64_t counter;
uint32_t *w_pointer_mod = (uint32_t *)w_pointer;
uint32_t *h_pointer_mod = (uint32_t *)h_pointer;
// Calculate the new sign
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
sign_temp_ptr[counter] = (w_pointer_mod[counter] & 0x80000000) ^
(h_pointer_mod[counter] & 0x80000000);
}
// Extract the exponent
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_x_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
}
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_y_exponent_ptr[counter] = (w_pointer_mod[counter] << 1) >> 24;
}
// Cast and "normalize"
uint64_t shift_value = 32 - number_of_frac_bits;
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_x_ptr[counter] =
((h_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
}
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_y_ptr[counter] =
((w_pointer_mod[counter] << 8) | 0x80000000) >> shift_value;
}
// Make the zero -g-r-e-a-t- correct again
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
if (h_pointer[counter] == 0) {
ap_x_ptr[counter] = 0;
}
}
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
if (w_pointer[counter] == 0) {
ap_y_ptr[counter] = 0;
}
}
// res = x*y
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_res_ptr[counter] = static_cast<uint64_t>(ap_x_ptr[counter]) * static_cast<uint64_t>(ap_y_ptr[counter]);
}
uint32_t temp;
if (approximation_enable == true){
// Go through the vector values
for (counter = 0; counter < pattern_length; counter++) {
temp = error_term(ap_y_ptr[counter], ap_x_ptr[counter], ap_mask,
number_of_trunc_bits);
if (temp > ap_res_ptr[counter]) {
ap_res_ptr[counter] = 0;
} else {
ap_res_ptr[counter] -= temp;
}
}
}
// Cast from int to float
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
h_pointer[counter] = static_cast<float>(ap_res_ptr[counter]);
}
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_h_exponent_ptr[counter] = (h_pointer_mod[counter] << 1) >> 24;
}
// devide by the 2^number_of_frac_bits
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
ap_h_exponent_ptr[counter] -= 2 * number_of_frac_bits;
}
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
temp = ap_x_exponent_ptr[counter] + ap_y_exponent_ptr[counter] +
ap_h_exponent_ptr[counter];
if (temp > 252) {
ap_h_exponent_ptr[counter] = temp - 252;
} else {
// Here I try to catch the case that the new exponent is too small
ap_h_exponent_ptr[counter] = 0;
}
}
// Remove the old exponent
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
h_pointer_mod[counter] = (h_pointer_mod[counter] << 9) >> 9;
}
// Install the new exponent
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
h_pointer_mod[counter] += ap_h_exponent_ptr[counter] << 23;
}
// Add the sign back
#pragma omp simd
for (counter = 0; counter < pattern_length; counter++) {
h_pointer_mod[counter] += sign_temp_ptr[counter];
}
return;
}

View file

@ -0,0 +1,28 @@
#include <unistd.h>
#include <cassert>
#include <cctype>
uint32_t error_term(uint32_t a, uint32_t b, uint32_t ap_mask,
uint32_t number_of_trunc_bits) {
uint32_t error_value = 0;
uint32_t temp_shift_a = a;
uint32_t temp_shift_b = b & ap_mask;
uint32_t counter_trunc;
uint32_t temp;
// Go through the bits
for (counter_trunc = 0; counter_trunc < number_of_trunc_bits;
counter_trunc++) {
temp = temp_shift_a & 1;
if (temp == 1) {
error_value += temp_shift_b & ap_mask;
}
temp_shift_a >>= 1;
temp_shift_b <<= 1;
}
return error_value;
}

View file

@ -0,0 +1,23 @@
# %%
# pip install pybind11-stubgen
from pybind11_stubgen import ModuleStubsGenerator # type: ignore
import glob
def process(module_name: str) -> None:
module = ModuleStubsGenerator(module_name)
module.parse()
module.write_setup_py = False
with open(module_name + ".pyi", "w") as fp:
fp.write("#\n# AUTOMATICALLY GENERATED FILE, DO NOT EDIT!\n#\n\n")
fp.write("\n".join(module.to_lines()))
Files = glob.glob("*.so")
for fid in Files:
Idx: int = fid.find(".")
module_name: str = fid[:Idx]
print("Processing: " + module_name)
process(module_name)

View file

@ -0,0 +1,318 @@
import torch
import math
from network.CPP.PyMultiApp import MultiApp
class Conv2dApproximation(torch.nn.Module):
in_channels: int | None = None
out_channels: int | None = None
kernel_size: list[int] | None = None
stride: list[int] = [1, 1]
padding: list[int] = [0, 0]
dilation: list[int] = [1, 1]
use_bias: bool = False
approximation_enable: bool = False
number_of_trunc_bits: int = -1
number_of_frac: int = -1
number_of_processes: int = 1
weights: torch.nn.parameter.Parameter
bias: torch.nn.parameter.Parameter | None
device: torch.device
dtype: torch.dtype
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: list[int],
stride: list[int] = [1, 1],
padding: list[int] = [0, 0],
dilation: list[int] = [1, 1],
bias: bool = True,
approximation_enable: bool = False,
number_of_trunc_bits: int = -1,
number_of_frac: int = -1,
number_of_processes: int = 1,
device: torch.device | None = None,
dtype: torch.dtype | None = None,
) -> None:
super().__init__()
assert device is not None
self.device = device
assert dtype is not None
self.dtype = dtype
assert len(kernel_size) == 2
assert len(stride) == 2
assert len(padding) == 2
assert len(dilation) == 2
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.dilation = dilation
self.use_bias = bias
self.number_of_processes = number_of_processes
self.approximation_enable = approximation_enable
self.number_of_trunc_bits = number_of_trunc_bits
self.number_of_frac = number_of_frac
if self.use_bias is True:
self.bias: torch.nn.parameter.Parameter | None = (
torch.nn.parameter.Parameter(
torch.empty(
(out_channels),
dtype=self.dtype,
device=self.device,
)
)
)
else:
self.bias = None
self.weights: torch.nn.parameter.Parameter = torch.nn.parameter.Parameter(
torch.empty(
(out_channels, in_channels, *kernel_size),
dtype=self.dtype,
device=self.device,
)
)
self.functional_multi = FunctionalMultiConv2d.apply
self.reset_parameters()
def reset_parameters(self) -> None:
# Stolen from original torch conv2 code
torch.nn.init.kaiming_uniform_(self.weights, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weights)
if fan_in != 0:
bound = 1 / math.sqrt(fan_in)
torch.nn.init.uniform_(self.bias, -bound, bound)
def calculate_output_size(self, value: torch.Tensor) -> None:
coordinates_0, coordinates_1 = self._get_coordinates(value)
self.output_size: torch.Tensor = torch.tensor(
[
coordinates_0.shape[1],
coordinates_1.shape[1],
],
dtype=torch.int64,
)
self.output_size.requires_grad_(False)
def _get_coordinates(
self, value: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor]:
"""Function converts parameter in coordinates
for the convolution window"""
assert value is not None
assert torch.is_tensor(value) is True
assert value.dim() == 1
assert torch.numel(value) == 2
assert value.dtype == torch.int64
assert value[0] > 0
assert value[1] > 0
assert self.kernel_size is not None
assert len(self.kernel_size) == 2
assert len(self.stride) == 2
assert len(self.dilation) == 2
assert len(self.padding) == 2
unfold_0: torch.nn.Unfold = torch.nn.Unfold(
kernel_size=(int(self.kernel_size[0]), 1),
dilation=int(self.dilation[0]),
padding=int(self.padding[0]),
stride=int(self.stride[0]),
)
unfold_1: torch.nn.Unfold = torch.nn.Unfold(
kernel_size=(1, int(self.kernel_size[1])),
dilation=int(self.dilation[1]),
padding=int(self.padding[1]),
stride=int(self.stride[1]),
)
coordinates_0: torch.Tensor = (
unfold_0(
torch.unsqueeze(
torch.unsqueeze(
torch.unsqueeze(
torch.arange(0, int(value[0]), dtype=torch.float32),
1,
),
0,
),
0,
)
)
.squeeze(0)
.type(torch.int64)
)
coordinates_1: torch.Tensor = (
unfold_1(
torch.unsqueeze(
torch.unsqueeze(
torch.unsqueeze(
torch.arange(0, int(value[1]), dtype=torch.float32),
0,
),
0,
),
0,
)
)
.squeeze(0)
.type(torch.int64)
)
return coordinates_0, coordinates_1
def forward(self, input: torch.Tensor) -> torch.Tensor:
assert input.dim() == 4
assert self.kernel_size is not None
input_size = torch.Tensor([int(input.shape[-2]), int(input.shape[-1])]).type(
dtype=torch.int64
)
self.calculate_output_size(input_size)
input_fold = torch.nn.functional.fold(
torch.nn.functional.unfold(
input.requires_grad_(True),
tuple(self.kernel_size),
tuple(self.dilation),
tuple(self.padding),
tuple(self.stride),
),
output_size=(int(self.output_size[0]), int(self.output_size[1])),
kernel_size=(1, 1),
dilation=(1, 1),
padding=(0, 0),
stride=(1, 1),
)
weights_fold = torch.nn.functional.unfold(
self.weights.requires_grad_(True),
tuple(self.kernel_size),
tuple(self.dilation),
tuple(self.padding),
tuple(self.stride),
).squeeze(-1)
if input.device == torch.device("cpu"):
number_of_cpu_processes: int = int(self.number_of_processes)
else:
number_of_cpu_processes = -1
# Here...
parameter_list = torch.tensor(
[
int(self.approximation_enable), # 0
int(self.number_of_trunc_bits), # 1
int(self.number_of_frac), # 2
int(number_of_cpu_processes), # 3
],
dtype=torch.int64,
)
output = self.functional_multi(input_fold, weights_fold, parameter_list)
if self.bias is not None:
output += self.bias.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
return output
class FunctionalMultiConv2d(torch.autograd.Function):
@staticmethod
def forward( # type: ignore
ctx,
input: torch.Tensor,
weights: torch.Tensor,
parameter_list: torch.Tensor,
) -> torch.Tensor:
assert input.ndim == 4
assert input.dtype is torch.float32
assert input.is_contiguous() is True
assert weights.ndim == 2
assert weights.dtype is torch.float32
assert weights.is_contiguous() is True
assert input.shape[1] == weights.shape[1]
approximation_enable = bool(parameter_list[0])
number_of_trunc_bits = int(parameter_list[1])
number_of_frac = int(parameter_list[2])
number_of_processes = int(parameter_list[3])
assert input.device == weights.device
output = torch.empty(
(input.shape[0], weights.shape[0], input.shape[2], input.shape[3]),
dtype=weights.dtype,
device=weights.device,
requires_grad=True,
)
assert output.is_contiguous() is True
multiplier: MultiApp = MultiApp()
multiplier.update_with_init_vector_multi_pattern(
input.data_ptr(),
weights.data_ptr(),
output.data_ptr(),
int(output.shape[0]), # pattern
int(output.shape[1]), # feature channel
int(output.shape[2]), # x
int(output.shape[3]), # y
int(input.shape[1]), # input channel
int(number_of_processes),
bool(approximation_enable),
int(number_of_trunc_bits),
int(number_of_frac),
)
ctx.save_for_backward(
input.detach(),
weights.detach(),
)
return output
@staticmethod
def backward(ctx, grad_output):
(input, weights) = ctx.saved_tensors
grad_input = (
grad_output.unsqueeze(2) * weights.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
).sum(1)
grad_weights = (
(grad_output.unsqueeze(2) * input.unsqueeze(1)).sum(0).sum(-1).sum(-1)
)
grad_parameter_list = None
return (grad_input, grad_weights, grad_parameter_list)

317
network/Dataset.py Normal file
View file

@ -0,0 +1,317 @@
from abc import ABC, abstractmethod
import torch
import numpy as np
import torchvision as tv # type: ignore
from network.Parameter import Config
class DatasetMaster(torch.utils.data.Dataset, ABC):
path_label: str
label_storage: np.ndarray
pattern_storage: np.ndarray
number_of_pattern: int
mean: list[float]
# Initialize
def __init__(
self,
train: bool = False,
path_pattern: str = "./",
path_label: str = "./",
) -> None:
super().__init__()
if train is True:
self.label_storage = np.load(path_label + "/TrainLabelStorage.npy")
else:
self.label_storage = np.load(path_label + "/TestLabelStorage.npy")
if train is True:
self.pattern_storage = np.load(path_pattern + "/TrainPatternStorage.npy")
else:
self.pattern_storage = np.load(path_pattern + "/TestPatternStorage.npy")
self.number_of_pattern = self.label_storage.shape[0]
self.mean = []
def __len__(self) -> int:
return self.number_of_pattern
# Get one pattern at position index
@abstractmethod
def __getitem__(self, index: int) -> tuple[torch.Tensor, int]:
pass
@abstractmethod
def pattern_filter_test(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
pass
@abstractmethod
def pattern_filter_train(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
pass
class DatasetMNIST(DatasetMaster):
"""Contstructor"""
# Initialize
def __init__(
self,
train: bool = False,
path_pattern: str = "./",
path_label: str = "./",
) -> None:
super().__init__(train, path_pattern, path_label)
self.pattern_storage = np.ascontiguousarray(
self.pattern_storage[:, np.newaxis, :, :].astype(dtype=np.float32)
)
self.pattern_storage /= np.max(self.pattern_storage)
mean = self.pattern_storage.mean(3).mean(2).mean(0)
self.mean = [*mean]
def __getitem__(self, index: int) -> tuple[torch.Tensor, int]:
image = self.pattern_storage[index, 0:1, :, :]
target = int(self.label_storage[index])
return torch.tensor(image), target
def pattern_filter_test(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
"""0. The test image comes in
1. is center cropped
2. returned.
This is a 1 channel version (e.g. one gray channel).
"""
assert len(cfg.image_statistics.mean) == 1
assert len(cfg.image_statistics.the_size) == 2
assert cfg.image_statistics.the_size[0] > 0
assert cfg.image_statistics.the_size[1] > 0
# Transformation chain
my_transforms: torch.nn.Sequential = torch.nn.Sequential(
tv.transforms.CenterCrop(size=cfg.image_statistics.the_size),
)
scripted_transforms = torch.jit.script(my_transforms)
# Preprocess the input data
pattern = scripted_transforms(pattern)
gray = pattern[:, 0:1, :, :] + 1e-20
return gray
def pattern_filter_train(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
"""0. The training image comes in
1. is cropped from a random position
2. returned.
This is a 1 channel version (e.g. one gray channel).
"""
assert len(cfg.image_statistics.mean) == 1
assert len(cfg.image_statistics.the_size) == 2
assert cfg.image_statistics.the_size[0] > 0
assert cfg.image_statistics.the_size[1] > 0
# Transformation chain
my_transforms: torch.nn.Sequential = torch.nn.Sequential(
tv.transforms.RandomCrop(size=cfg.image_statistics.the_size),
)
scripted_transforms = torch.jit.script(my_transforms)
# Preprocess the input data
pattern = scripted_transforms(pattern)
gray = pattern[:, 0:1, :, :] + 1e-20
return gray
class DatasetFashionMNIST(DatasetMaster):
"""Contstructor"""
# Initialize
def __init__(
self,
train: bool = False,
path_pattern: str = "./",
path_label: str = "./",
) -> None:
super().__init__(train, path_pattern, path_label)
self.pattern_storage = np.ascontiguousarray(
self.pattern_storage[:, np.newaxis, :, :].astype(dtype=np.float32)
)
self.pattern_storage /= np.max(self.pattern_storage)
mean = self.pattern_storage.mean(3).mean(2).mean(0)
self.mean = [*mean]
def __getitem__(self, index: int) -> tuple[torch.Tensor, int]:
image = self.pattern_storage[index, 0:1, :, :]
target = int(self.label_storage[index])
return torch.tensor(image), target
def pattern_filter_test(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
"""0. The test image comes in
1. is center cropped
2. returned.
This is a 1 channel version (e.g. one gray channel).
"""
assert len(cfg.image_statistics.mean) == 1
assert len(cfg.image_statistics.the_size) == 2
assert cfg.image_statistics.the_size[0] > 0
assert cfg.image_statistics.the_size[1] > 0
# Transformation chain
my_transforms: torch.nn.Sequential = torch.nn.Sequential(
tv.transforms.CenterCrop(size=cfg.image_statistics.the_size),
)
scripted_transforms = torch.jit.script(my_transforms)
# Preprocess the input data
pattern = scripted_transforms(pattern)
gray = pattern[:, 0:1, :, :] + 1e-20
return gray
def pattern_filter_train(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
"""0. The training image comes in
1. is cropped from a random position
2. returned.
This is a 1 channel version (e.g. one gray channel).
"""
assert len(cfg.image_statistics.mean) == 1
assert len(cfg.image_statistics.the_size) == 2
assert cfg.image_statistics.the_size[0] > 0
assert cfg.image_statistics.the_size[1] > 0
# Transformation chain
my_transforms: torch.nn.Sequential = torch.nn.Sequential(
tv.transforms.RandomCrop(size=cfg.image_statistics.the_size),
tv.transforms.RandomHorizontalFlip(p=cfg.augmentation.flip_p),
tv.transforms.ColorJitter(
brightness=cfg.augmentation.jitter_brightness,
contrast=cfg.augmentation.jitter_contrast,
saturation=cfg.augmentation.jitter_saturation,
hue=cfg.augmentation.jitter_hue,
),
)
scripted_transforms = torch.jit.script(my_transforms)
# Preprocess the input data
pattern = scripted_transforms(pattern)
gray = pattern[:, 0:1, :, :] + 1e-20
return gray
class DatasetCIFAR(DatasetMaster):
"""Contstructor"""
# Initialize
def __init__(
self,
train: bool = False,
path_pattern: str = "./",
path_label: str = "./",
) -> None:
super().__init__(train, path_pattern, path_label)
self.pattern_storage = np.ascontiguousarray(
np.moveaxis(self.pattern_storage.astype(dtype=np.float32), 3, 1)
)
self.pattern_storage /= np.max(self.pattern_storage)
mean = self.pattern_storage.mean(3).mean(2).mean(0)
self.mean = [*mean]
def __getitem__(self, index: int) -> tuple[torch.Tensor, int]:
image = self.pattern_storage[index, :, :, :]
target = int(self.label_storage[index])
return torch.tensor(image), target
def pattern_filter_test(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
"""0. The test image comes in
1. is center cropped
2. returned.
This is a 3 channel version (e.g. r,g,b channels).
"""
assert len(cfg.image_statistics.mean) == 3
assert len(cfg.image_statistics.the_size) == 2
assert cfg.image_statistics.the_size[0] > 0
assert cfg.image_statistics.the_size[1] > 0
# Transformation chain
my_transforms: torch.nn.Sequential = torch.nn.Sequential(
tv.transforms.CenterCrop(size=cfg.image_statistics.the_size),
)
scripted_transforms = torch.jit.script(my_transforms)
# Preprocess the input data
pattern = scripted_transforms(pattern)
r = pattern[:, 0:1, :, :] + 1e-20
g = pattern[:, 1:2, :, :] + 1e-20
b = pattern[:, 2:3, :, :] + 1e-20
new_tensor: torch.Tensor = torch.cat((r, g, b), dim=1)
return new_tensor
def pattern_filter_train(self, pattern: torch.Tensor, cfg: Config) -> torch.Tensor:
"""0. The training image comes in
1. is cropped from a random position
2. is randomly horizontally flipped
3. is randomly color jitteres
4. returned.
This is a 3 channel version (e.g. r,g,b channels).
"""
assert len(cfg.image_statistics.mean) == 3
assert len(cfg.image_statistics.the_size) == 2
assert cfg.image_statistics.the_size[0] > 0
assert cfg.image_statistics.the_size[1] > 0
# Transformation chain
my_transforms: torch.nn.Sequential = torch.nn.Sequential(
tv.transforms.RandomCrop(size=cfg.image_statistics.the_size),
tv.transforms.RandomHorizontalFlip(p=cfg.augmentation.flip_p),
tv.transforms.ColorJitter(
brightness=cfg.augmentation.jitter_brightness,
contrast=cfg.augmentation.jitter_contrast,
saturation=cfg.augmentation.jitter_saturation,
hue=cfg.augmentation.jitter_hue,
),
)
scripted_transforms = torch.jit.script(my_transforms)
# Preprocess the input data
pattern = scripted_transforms(pattern)
r = pattern[:, 0:1, :, :] + 1e-20
g = pattern[:, 1:2, :, :] + 1e-20
b = pattern[:, 2:3, :, :] + 1e-20
new_tensor: torch.Tensor = torch.cat((r, g, b), dim=1)
return new_tensor
if __name__ == "__main__":
pass

185
network/Parameter.py Normal file
View file

@ -0,0 +1,185 @@
# %%
from dataclasses import dataclass, field
import numpy as np
import torch
import os
@dataclass
class Network:
"""Parameters of the network. The details about
its layers and the number of output neurons."""
layer_type: list[str] = field(default_factory=list)
forward_neuron_numbers: list[list[int]] = field(default_factory=list)
forward_kernel_size: list[list[int]] = field(default_factory=list)
strides: list[list[int]] = field(default_factory=list)
dilation: list[list[int]] = field(default_factory=list)
padding: list[list[int]] = field(default_factory=list)
number_of_output_neurons: int = field(default=0)
@dataclass
class LearningParameters:
"""Parameter required for training"""
learning_active: bool = field(default=True)
loss_mode: int = field(default=0)
loss_coeffs_mse: float = field(default=0.5)
loss_coeffs_kldiv: float = field(default=1.0)
optimizer_name: str = field(default="Adam")
learning_rate_gamma_w: float = field(default=-1.0)
learning_rate_threshold_w: float = field(default=0.00001)
lr_schedule_name: str = field(default="ReduceLROnPlateau")
lr_scheduler_use_performance: bool = field(default=False)
lr_scheduler_factor_w: float = field(default=0.75)
lr_scheduler_patience_w: int = field(default=-1)
lr_scheduler_tau_w: int = field(default=10)
number_of_batches_for_one_update: int = field(default=1)
overload_path: str = field(default="Previous")
weight_noise_range: list[float] = field(default_factory=list)
eps_xy_intitial: float = field(default=0.1)
# disable_scale_grade: bool = field(default=False)
# kepp_last_grad_scale: bool = field(default=True)
sbs_skip_gradient_calculation: list[bool] = field(default_factory=list)
adapt_learning_rate_after_minibatch: bool = field(default=True)
w_trainable: list[bool] = field(default_factory=list)
@dataclass
class Augmentation:
"""Parameters used for data augmentation."""
crop_width_in_pixel: int = field(default=2)
flip_p: float = field(default=0.5)
jitter_brightness: float = field(default=0.5)
jitter_contrast: float = field(default=0.1)
jitter_saturation: float = field(default=0.1)
jitter_hue: float = field(default=0.15)
@dataclass
class ImageStatistics:
"""(Statistical) information about the input. i.e.
mean values and the x and y size of the input"""
mean: list[float] = field(default_factory=list)
the_size: list[int] = field(default_factory=list)
@dataclass
class ApproximationSetting:
# Approximation CONV2D Layer
approximation_enable: list[bool] = field(default_factory=list)
number_of_trunc_bits: list[int] = field(default_factory=list)
number_of_frac_bits: list[int] = field(default_factory=list)
@dataclass
class Config:
"""Master config class."""
# Sub classes
network_structure: Network = field(default_factory=Network)
learning_parameters: LearningParameters = field(default_factory=LearningParameters)
augmentation: Augmentation = field(default_factory=Augmentation)
image_statistics: ImageStatistics = field(default_factory=ImageStatistics)
approximation_setting: ApproximationSetting = field(
default_factory=ApproximationSetting
)
# For labeling simulations
# (not actively used)
simulation_id: int = field(default=0)
stage_id: int = field(default=-1)
# Size of one sub-mini-batch
# (the number of pattern processed at the same time)
batch_size: int = field(default=500)
# The data set
# Identifier for Dataset.oy
data_mode: str = field(default="")
# The path to the data set
data_path: str = field(default="")
# The epochs identifier
epoch_id: int = field(default=0)
# Maximum number of epochs
epoch_id_max: int = field(default=10000)
# Number of cpu threads
number_of_cpu_processes: int = field(default=-1)
# Adjust the number of pattern processed in
# one step to the amount of core or with HT threads
# of the cpu
enable_cpu_thread_balacing: bool = field(default=True)
# Path for storing information
weight_path: str = field(default="Parameters")
log_path: str = field(default="Log")
# Other SbS Settings
number_of_spikes: list[int] = field(default_factory=list)
cooldown_after_number_of_spikes: int = field(default=-1)
reduction_cooldown: float = field(default=25.0)
epsilon_0: float = field(default=1.0)
forgetting_offset: float = field(default=-1.0)
def __post_init__(self) -> None:
"""Post init determines the number of cores.
Creates the required directory and gives us an optimized
(for the amount of cores) batch size."""
number_of_cpu_processes_temp = os.cpu_count()
if self.number_of_cpu_processes < 1:
if number_of_cpu_processes_temp is None:
self.number_of_cpu_processes = 1
else:
self.number_of_cpu_processes = number_of_cpu_processes_temp
os.makedirs(self.weight_path, exist_ok=True)
if self.enable_cpu_thread_balacing is True:
self.batch_size = (
self.batch_size // self.number_of_cpu_processes
) * self.number_of_cpu_processes
self.batch_size = np.max((self.batch_size, self.number_of_cpu_processes))
self.batch_size = int(self.batch_size)
def get_epsilon_t(self, number_of_spikes: int):
"""Generates the time series of the basic epsilon."""
t = np.arange(0, number_of_spikes, dtype=np.float32) + 1
np_epsilon_t: np.ndarray = t ** (
-1.0 / 2.0
) # np.ones((number_of_spikes), dtype=np.float32)
if (self.cooldown_after_number_of_spikes < number_of_spikes) and (
self.cooldown_after_number_of_spikes >= 0
):
np_epsilon_t[
self.cooldown_after_number_of_spikes : number_of_spikes
] /= self.reduction_cooldown
return torch.tensor(np_epsilon_t)
def get_update_after_x_pattern(self):
"""Tells us after how many pattern we need to update the weights."""
return (
self.batch_size * self.learning_parameters.number_of_batches_for_one_update
)

714
network/SbS.py Normal file
View file

@ -0,0 +1,714 @@
import torch
from network.CPP.PySpikeGeneration2DManyIP import SpikeGeneration2DManyIP
from network.CPP.PyHDynamicCNNManyIP import HDynamicCNNManyIP
from network.calculate_output_size import calculate_output_size
class SbS(torch.nn.Module):
_epsilon_xy: torch.Tensor | None = None
_epsilon_0: float
_epsilon_t: torch.Tensor | None = None
_weights: torch.nn.parameter.Parameter
_weights_exists: bool = False
_kernel_size: list[int]
_stride: list[int]
_dilation: list[int]
_padding: list[int]
_output_size: torch.Tensor
_number_of_spikes: int
_number_of_cpu_processes: int
_number_of_neurons: int
_number_of_input_neurons: int
_epsilon_xy_intitial: float
_h_initial: torch.Tensor | None = None
_w_trainable: bool
# _last_grad_scale: torch.nn.parameter.Parameter
# _keep_last_grad_scale: bool
# _disable_scale_grade: bool
_forgetting_offset: torch.Tensor | None = None
_weight_noise_range: list[float]
_skip_gradient_calculation: bool
_is_pooling_layer: bool
_input_size: list[int]
_output_layer: bool = False
_local_learning: bool = False
device: torch.device
default_dtype: torch.dtype
_gpu_tuning_factor: int
_max_grad_weights: torch.Tensor | None = None
_number_of_grad_weight_contributions: float = 0.0
def __init__(
self,
number_of_input_neurons: int,
number_of_neurons: int,
input_size: list[int],
forward_kernel_size: list[int],
number_of_spikes: int,
epsilon_t: torch.Tensor,
epsilon_xy_intitial: float = 0.1,
epsilon_0: float = 1.0,
weight_noise_range: list[float] = [0.0, 1.0],
is_pooling_layer: bool = False,
strides: list[int] = [1, 1],
dilation: list[int] = [0, 0],
padding: list[int] = [0, 0],
number_of_cpu_processes: int = 1,
w_trainable: bool = False,
# keep_last_grad_scale: bool = False,
# disable_scale_grade: bool = True,
forgetting_offset: float = -1.0,
skip_gradient_calculation: bool = False,
device: torch.device | None = None,
default_dtype: torch.dtype | None = None,
gpu_tuning_factor: int = 5,
) -> None:
super().__init__()
assert device is not None
assert default_dtype is not None
self.device = device
self.default_dtype = default_dtype
self._w_trainable = bool(w_trainable)
# self._keep_last_grad_scale = bool(keep_last_grad_scale)
self._skip_gradient_calculation = bool(skip_gradient_calculation)
# self._disable_scale_grade = bool(disable_scale_grade)
self._epsilon_xy_intitial = float(epsilon_xy_intitial)
self._stride = strides
self._dilation = dilation
self._padding = padding
self._kernel_size = forward_kernel_size
self._number_of_input_neurons = int(number_of_input_neurons)
self._number_of_neurons = int(number_of_neurons)
self._epsilon_0 = float(epsilon_0)
self._number_of_cpu_processes = int(number_of_cpu_processes)
self._number_of_spikes = int(number_of_spikes)
self._weight_noise_range = weight_noise_range
self._is_pooling_layer = bool(is_pooling_layer)
assert len(input_size) == 2
self._input_size = input_size
# The GPU hates me...
# Too many SbS threads == bad
# Thus I need to limit them...
# (Reminder: We cannot access the mini-batch size here,
# which is part of the GPU thread size calculation...)
if (self._input_size[0] * self._input_size[1]) > gpu_tuning_factor:
self._gpu_tuning_factor = gpu_tuning_factor
else:
self._gpu_tuning_factor = 0
# self._last_grad_scale = torch.nn.parameter.Parameter(
# torch.tensor(-1.0, dtype=self.default_dtype),
# requires_grad=True,
# )
self._forgetting_offset = torch.tensor(
forgetting_offset, dtype=self.default_dtype, device=self.device
)
self.epsilon_t = epsilon_t.type(dtype=self.default_dtype).to(device=self.device)
self._output_size = calculate_output_size(
value=input_size,
kernel_size=self._kernel_size,
stride=self._stride,
dilation=self._dilation,
padding=self._padding,
)
self.set_h_init_to_uniform()
self.functional_sbs = FunctionalSbS.apply
# ###############################################################
# Initialize the weights
# ###############################################################
if self._is_pooling_layer is True:
self.weights = self._make_pooling_weights()
else:
assert len(self._weight_noise_range) == 2
weights = torch.empty(
(
int(self._kernel_size[0])
* int(self._kernel_size[1])
* int(self._number_of_input_neurons),
int(self._number_of_neurons),
),
dtype=self.default_dtype,
device=self.device,
)
torch.nn.init.uniform_(
weights,
a=float(self._weight_noise_range[0]),
b=float(self._weight_noise_range[1]),
)
self.weights = weights
####################################################################
# Variables in and out #
####################################################################
@property
def epsilon_t(self) -> torch.Tensor | None:
return self._epsilon_t
@epsilon_t.setter
def epsilon_t(self, value: torch.Tensor):
assert value is not None
assert torch.is_tensor(value) is True
assert value.dim() == 1
assert value.dtype == self.default_dtype
self._epsilon_t = (
value.detach()
.clone(memory_format=torch.contiguous_format)
.type(dtype=self.default_dtype)
.to(device=self.device)
.requires_grad_(False)
)
@property
def weights(self) -> torch.Tensor | None:
if self._weights_exists is False:
return None
else:
return self._weights
@weights.setter
def weights(self, value: torch.Tensor):
assert value is not None
assert torch.is_tensor(value) is True
assert value.dim() == 2
temp: torch.Tensor = (
value.detach()
.clone(memory_format=torch.contiguous_format)
.type(dtype=self.default_dtype)
.to(device=self.device)
)
temp /= temp.sum(dim=0, keepdim=True, dtype=self.default_dtype)
if self._weights_exists is False:
self._weights = torch.nn.parameter.Parameter(temp, requires_grad=True)
self._weights_exists = True
else:
self._weights.data = temp
@property
def h_initial(self) -> torch.Tensor | None:
return self._h_initial
@h_initial.setter
def h_initial(self, value: torch.Tensor):
assert value is not None
assert torch.is_tensor(value) is True
assert value.dim() == 1
assert value.dtype == self.default_dtype
self._h_initial = (
value.detach()
.clone(memory_format=torch.contiguous_format)
.type(dtype=self.default_dtype)
.to(device=self.device)
.requires_grad_(False)
)
def update_pre_care(self):
if self._weights.grad is not None:
assert self._number_of_grad_weight_contributions > 0
self._weights.grad /= self._number_of_grad_weight_contributions
self._number_of_grad_weight_contributions = 0.0
def update_after_care(self, threshold_weight: float):
if self._w_trainable is True:
self.norm_weights()
self.threshold_weights(threshold_weight)
self.norm_weights()
# def after_batch(self, new_state: bool = False):
# if self._keep_last_grad_scale is True:
# self._last_grad_scale.data = self._last_grad_scale.grad
# self._keep_last_grad_scale = new_state
# self._last_grad_scale.grad = torch.zeros_like(self._last_grad_scale.grad)
####################################################################
# Helper functions #
####################################################################
def _make_pooling_weights(self) -> torch.Tensor:
"""For generating the pooling weights."""
assert self._number_of_neurons is not None
assert self._kernel_size is not None
weights: torch.Tensor = torch.zeros(
(
int(self._kernel_size[0]),
int(self._kernel_size[1]),
int(self._number_of_neurons),
int(self._number_of_neurons),
),
dtype=self.default_dtype,
device=self.device,
)
for i in range(0, int(self._number_of_neurons)):
weights[:, :, i, i] = 1.0
weights = weights.moveaxis(-1, 0).moveaxis(-1, 1)
weights = torch.nn.functional.unfold(
input=weights,
kernel_size=(int(self._kernel_size[0]), int(self._kernel_size[1])),
dilation=(1, 1),
padding=(0, 0),
stride=(1, 1),
).squeeze()
weights = torch.moveaxis(weights, 0, 1)
return weights
def set_h_init_to_uniform(self) -> None:
assert self._number_of_neurons > 2
self.h_initial: torch.Tensor = torch.full(
(self._number_of_neurons,),
(1.0 / float(self._number_of_neurons)),
dtype=self.default_dtype,
device=self.device,
)
def norm_weights(self) -> None:
assert self._weights_exists is True
temp: torch.Tensor = (
self._weights.data.detach()
.clone(memory_format=torch.contiguous_format)
.type(dtype=self.default_dtype)
.to(device=self.device)
)
temp /= temp.sum(dim=0, keepdim=True, dtype=self.default_dtype)
self._weights.data = temp
def threshold_weights(self, threshold: float) -> None:
assert self._weights_exists is True
assert threshold >= 0
torch.clamp(
self._weights.data,
min=float(threshold),
max=None,
out=self._weights.data,
)
####################################################################
# Forward #
####################################################################
def forward(
self, input: torch.Tensor, labels: torch.Tensor | None = None
) -> torch.Tensor:
# Are we happy with the input?
assert input is not None
assert torch.is_tensor(input) is True
assert input.dim() == 4
assert input.dtype == self.default_dtype
assert input.shape[1] == self._number_of_input_neurons
assert input.shape[2] == self._input_size[0]
assert input.shape[3] == self._input_size[1]
# Are we happy with the rest of the network?
assert self._epsilon_0 is not None
assert self._epsilon_t is not None
assert self._h_initial is not None
assert self._forgetting_offset is not None
assert self._weights_exists is True
assert self._weights is not None
input_convolved = torch.nn.functional.fold(
torch.nn.functional.unfold(
input.requires_grad_(True),
kernel_size=(int(self._kernel_size[0]), int(self._kernel_size[1])),
dilation=(int(self._dilation[0]), int(self._dilation[1])),
padding=(int(self._padding[0]), int(self._padding[1])),
stride=(int(self._stride[0]), int(self._stride[1])),
),
output_size=tuple(self._output_size.tolist()),
kernel_size=(1, 1),
dilation=(1, 1),
padding=(0, 0),
stride=(1, 1),
)
epsilon_t_0: torch.Tensor = (
(self._epsilon_t * self._epsilon_0).type(input.dtype).to(input.device)
)
parameter_list = torch.tensor(
[
int(self._w_trainable), # 0
int(0), # int(self._disable_scale_grade), # 1
int(0), # int(self._keep_last_grad_scale), # 2
int(self._skip_gradient_calculation), # 3
int(self._number_of_spikes), # 4
int(self._number_of_cpu_processes), # 5
int(self._output_size[0]), # 6
int(self._output_size[1]), # 7
int(self._gpu_tuning_factor), # 8
int(self._output_layer), # 9
int(self._local_learning), # 10
],
dtype=torch.int64,
)
if self._epsilon_xy is None:
self._epsilon_xy = torch.full(
(
input_convolved.shape[1],
input_convolved.shape[2],
input_convolved.shape[3],
),
float(self._epsilon_xy_intitial),
dtype=self.default_dtype,
device=self.device,
)
assert self._epsilon_xy is not None
# In the case somebody tried to replace the matrix with wrong dimensions
assert self._epsilon_xy.shape[0] == input_convolved.shape[1]
assert self._epsilon_xy.shape[1] == input_convolved.shape[2]
assert self._epsilon_xy.shape[2] == input_convolved.shape[3]
# SbS forward functional
output = self.functional_sbs(
input_convolved,
self._epsilon_xy,
epsilon_t_0,
self._weights,
self._h_initial,
parameter_list,
# self._last_grad_scale,
self._forgetting_offset,
)
self._number_of_grad_weight_contributions += (
output.shape[0] * output.shape[-2] * output.shape[-1]
)
return output
class FunctionalSbS(torch.autograd.Function):
@staticmethod
def forward( # type: ignore
ctx,
input: torch.Tensor,
epsilon_xy: torch.Tensor,
epsilon_t_0: torch.Tensor,
weights: torch.Tensor,
h_initial: torch.Tensor,
parameter_list: torch.Tensor,
# grad_output_scale: torch.Tensor,
forgetting_offset: torch.Tensor,
) -> torch.Tensor:
assert input.dim() == 4
number_of_spikes: int = int(parameter_list[4])
if input.device == torch.device("cpu"):
spike_number_of_cpu_processes: int = int(parameter_list[5])
else:
spike_number_of_cpu_processes = -1
if input.device == torch.device("cpu"):
hdyn_number_of_cpu_processes: int = int(parameter_list[5])
else:
hdyn_number_of_cpu_processes = -1
output_size_0: int = int(parameter_list[6])
output_size_1: int = int(parameter_list[7])
gpu_tuning_factor: int = int(parameter_list[8])
# ###########################################################
# Spike generation
# ###########################################################
# ############################################
# Normalized cumsum
# (beware of the pytorch bug! Thus .clone()!)
# ############################################
input_cumsum: torch.Tensor = torch.cumsum(input, dim=1, dtype=input.dtype)
input_cumsum_last: torch.Tensor = input_cumsum[:, -1, :, :].unsqueeze(1).clone()
input_cumsum /= input_cumsum_last
# ############################################
# Get the required random numbers
# ############################################
random_values = torch.rand(
size=[
input_cumsum.shape[0],
number_of_spikes,
input_cumsum.shape[2],
input_cumsum.shape[3],
],
dtype=input.dtype,
device=input.device,
)
# ############################################
# Make space for the results
# ############################################
spikes = torch.empty_like(random_values, dtype=torch.int64, device=input.device)
assert input_cumsum.is_contiguous() is True
assert random_values.is_contiguous() is True
assert spikes.is_contiguous() is True
# time_start: float = time.perf_counter()
spike_generation: SpikeGeneration2DManyIP = SpikeGeneration2DManyIP()
spike_generation.spike_generation(
input_cumsum.data_ptr(),
int(input_cumsum.shape[0]),
int(input_cumsum.shape[1]),
int(input_cumsum.shape[2]),
int(input_cumsum.shape[3]),
random_values.data_ptr(),
int(random_values.shape[0]),
int(random_values.shape[1]),
int(random_values.shape[2]),
int(random_values.shape[3]),
spikes.data_ptr(),
int(spikes.shape[0]),
int(spikes.shape[1]),
int(spikes.shape[2]),
int(spikes.shape[3]),
int(spike_number_of_cpu_processes),
)
del random_values
del input_cumsum
# ###########################################################
# H dynamic
# ###########################################################
assert epsilon_t_0.ndim == 1
assert epsilon_t_0.shape[0] >= number_of_spikes
# ############################################
# Make space for the results
# ############################################
output = torch.empty(
(
int(input.shape[0]),
int(weights.shape[1]),
output_size_0,
output_size_1,
),
dtype=input.dtype,
device=input.device,
)
assert output.is_contiguous() is True
assert epsilon_xy.is_contiguous() is True
assert epsilon_t_0.is_contiguous() is True
assert weights.is_contiguous() is True
assert spikes.is_contiguous() is True
assert h_initial.is_contiguous() is True
assert epsilon_xy.ndim == 3
assert weights.ndim == 2
assert h_initial.ndim == 1
h_dynamic: HDynamicCNNManyIP = HDynamicCNNManyIP()
h_dynamic.update(
output.data_ptr(),
int(output.shape[0]),
int(output.shape[1]),
int(output.shape[2]),
int(output.shape[3]),
epsilon_xy.data_ptr(),
int(epsilon_xy.shape[0]),
int(epsilon_xy.shape[1]),
int(epsilon_xy.shape[2]),
epsilon_t_0.data_ptr(),
int(epsilon_t_0.shape[0]),
weights.data_ptr(),
int(weights.shape[0]),
int(weights.shape[1]),
spikes.data_ptr(),
int(spikes.shape[0]),
int(spikes.shape[1]),
int(spikes.shape[2]),
int(spikes.shape[3]),
h_initial.data_ptr(),
int(h_initial.shape[0]),
hdyn_number_of_cpu_processes,
float(forgetting_offset.item()),
int(gpu_tuning_factor),
)
del spikes
# ###########################################################
# Save the necessary data for the backward pass
# ###########################################################
ctx.save_for_backward(
input,
weights,
output,
parameter_list,
# grad_output_scale,
)
return output
@staticmethod
def backward(ctx, grad_output):
# ##############################################
# Get the variables back
# ##############################################
(
input,
weights,
output,
parameter_list,
# last_grad_scale,
) = ctx.saved_tensors
# ##############################################
# Default output
# ##############################################
grad_input = None
grad_eps_xy = None
grad_epsilon_t_0 = None
grad_weights = None
grad_h_initial = None
grad_parameter_list = None
grad_forgetting_offset = None
# ##############################################
# Parameters
# ##############################################
parameter_w_trainable: bool = bool(parameter_list[0])
# parameter_disable_scale_grade: bool = bool(parameter_list[1])
# parameter_keep_last_grad_scale: bool = bool(parameter_list[2])
parameter_skip_gradient_calculation: bool = bool(parameter_list[3])
parameter_output_layer: bool = bool(parameter_list[9])
parameter_local_learning: bool = bool(parameter_list[10])
# ##############################################
# Dealing with overall scale of the gradient
# ##############################################
# if parameter_disable_scale_grade is False:
# if parameter_keep_last_grad_scale is True:
# last_grad_scale = torch.tensor(
# [torch.abs(grad_output).max(), last_grad_scale]
# ).max()
# grad_output /= last_grad_scale
# grad_output_scale = last_grad_scale.clone()
input /= input.sum(dim=1, keepdim=True, dtype=weights.dtype)
# #################################################
# User doesn't want us to calculate the gradients
# #################################################
if parameter_skip_gradient_calculation is True:
return (
grad_input,
grad_eps_xy,
grad_epsilon_t_0,
grad_weights,
grad_h_initial,
grad_parameter_list,
# grad_output_scale,
grad_forgetting_offset,
)
# #################################################
# Calculate backprop error (grad_input)
# #################################################
backprop_r: torch.Tensor = weights.unsqueeze(0).unsqueeze(-1).unsqueeze(
-1
) * output.unsqueeze(1)
backprop_bigr: torch.Tensor = backprop_r.sum(dim=2)
backprop_z: torch.Tensor = backprop_r * (
1.0 / (backprop_bigr + 1e-20)
).unsqueeze(2)
grad_input: torch.Tensor = (backprop_z * grad_output.unsqueeze(1)).sum(2)
del backprop_z
# #################################################
# Calculate weight gradient (grad_weights)
# #################################################
if parameter_w_trainable is False:
# #################################################
# We don't train this weight
# #################################################
grad_weights = None
elif (parameter_output_layer is False) and (parameter_local_learning is True):
# #################################################
# Local learning
# #################################################
grad_weights = (
(-2 * (input - backprop_bigr).unsqueeze(2) * output.unsqueeze(1))
.sum(0)
.sum(-1)
.sum(-1)
)
else:
# #################################################
# Backprop
# #################################################
backprop_f: torch.Tensor = output.unsqueeze(1) * (
input / (backprop_bigr**2 + 1e-20)
).unsqueeze(2)
result_omega: torch.Tensor = backprop_bigr.unsqueeze(
2
) * grad_output.unsqueeze(1)
result_omega -= (backprop_r * grad_output.unsqueeze(1)).sum(2).unsqueeze(2)
result_omega *= backprop_f
del backprop_f
grad_weights = result_omega.sum(0).sum(-1).sum(-1)
del result_omega
del backprop_bigr
del backprop_r
return (
grad_input,
grad_eps_xy,
grad_epsilon_t_0,
grad_weights,
grad_h_initial,
grad_parameter_list,
# grad_output_scale,
grad_forgetting_offset,
)

106
network/SbSLRScheduler.py Normal file
View file

@ -0,0 +1,106 @@
import torch
class SbSLRScheduler(torch.optim.lr_scheduler.ReduceLROnPlateau):
def __init__(
self,
optimizer,
mode: str = "min",
factor: float = 0.1,
patience: int = 10,
threshold: float = 1e-4,
threshold_mode: str = "rel",
cooldown: int = 0,
min_lr: float = 0,
eps: float = 1e-8,
verbose: bool = False,
tau: float = 10,
) -> None:
super().__init__(
optimizer=optimizer,
mode=mode,
factor=factor,
patience=patience,
threshold=threshold,
threshold_mode=threshold_mode,
cooldown=cooldown,
min_lr=min_lr,
eps=eps,
verbose=verbose,
)
self.lowpass_tau: float = tau
self.lowpass_decay_value: float = 1.0 - (1.0 / self.lowpass_tau)
self.lowpass_number_of_steps: int = 0
self.loss_maximum_over_time: float | None = None
self.lowpass_memory: float = 0.0
self.lowpass_learning_rate_minimum_over_time: float | None = None
self.lowpass_learning_rate_minimum_over_time_past_step: float | None = None
self.previous_learning_rate: float | None = None
self.loss_normalized_past_step: float | None = None
def step(self, metrics, epoch=None) -> None:
loss = float(metrics)
if self.loss_maximum_over_time is None:
self.loss_maximum_over_time = loss
if self.loss_normalized_past_step is None:
self.loss_normalized_past_step = loss / self.loss_maximum_over_time
if self.previous_learning_rate is None:
self.previous_learning_rate = self.optimizer.param_groups[-1]["lr"] # type: ignore
# The parent lr scheduler controlls the basic learn rate
self.previous_learning_rate = self.optimizer.param_groups[-1]["lr"] # type: ignore
super().step(metrics=self.loss_normalized_past_step, epoch=epoch)
# If the parent changes the base learning rate,
# then we reset the adaptive part
if self.optimizer.param_groups[-1]["lr"] != self.previous_learning_rate: # type: ignore
self.previous_learning_rate = self.optimizer.param_groups[-1]["lr"] # type: ignore
self.lowpass_number_of_steps = 0
self.loss_maximum_over_time = None
self.lowpass_memory = 0.0
self.lowpass_learning_rate_minimum_over_time = None
self.lowpass_learning_rate_minimum_over_time_past_step = None
if self.loss_maximum_over_time is None:
self.loss_maximum_over_time = loss
else:
self.loss_maximum_over_time = max(self.loss_maximum_over_time, loss)
self.lowpass_number_of_steps += 1
self.lowpass_memory = self.lowpass_memory * self.lowpass_decay_value + (
loss / self.loss_maximum_over_time
) * (1.0 / self.lowpass_tau)
loss_normalized: float = self.lowpass_memory / (
1.0 - self.lowpass_decay_value ** float(self.lowpass_number_of_steps)
)
if self.lowpass_learning_rate_minimum_over_time is None:
self.lowpass_learning_rate_minimum_over_time = loss_normalized
else:
self.lowpass_learning_rate_minimum_over_time = min(
self.lowpass_learning_rate_minimum_over_time, loss_normalized
)
if self.lowpass_learning_rate_minimum_over_time_past_step is None:
self.lowpass_learning_rate_minimum_over_time_past_step = (
self.lowpass_learning_rate_minimum_over_time
)
self.optimizer.param_groups[-1]["lr"] *= ( # type: ignore
self.lowpass_learning_rate_minimum_over_time
/ self.lowpass_learning_rate_minimum_over_time_past_step
)
self.lowpass_learning_rate_minimum_over_time_past_step = (
self.lowpass_learning_rate_minimum_over_time
)
self.loss_normalized_past_step = loss_normalized

View file

@ -0,0 +1,54 @@
import torch
class SplitOnOffLayer(torch.nn.Module):
device: torch.device
default_dtype: torch.dtype
mean: torch.Tensor | None = None
epsilon: float = 0.01
def __init__(
self,
device: torch.device | None = None,
default_dtype: torch.dtype | None = None,
) -> None:
super().__init__()
assert device is not None
assert default_dtype is not None
self.device = device
self.default_dtype = default_dtype
####################################################################
# Forward #
####################################################################
def forward(self, input: torch.Tensor) -> torch.Tensor:
assert input.ndim == 4
# self.training is switched by network.eval() and network.train()
if self.training is True:
mean_temp = (
input.mean(dim=0, keepdim=True)
.mean(dim=1, keepdim=True)
.detach()
.clone()
)
if self.mean is None:
self.mean = mean_temp
else:
self.mean = (1.0 - self.epsilon) * self.mean + self.epsilon * mean_temp
assert self.mean is not None
temp = input - self.mean.detach().clone()
temp_a = torch.nn.functional.relu(temp)
temp_b = torch.nn.functional.relu(-temp)
output = torch.cat((temp_a, temp_b), dim=1)
output /= output.sum(dim=1, keepdim=True) + 1e-20
return output

68
network/build_datasets.py Normal file
View file

@ -0,0 +1,68 @@
# %%
import torch
from network.Dataset import (
DatasetMaster,
DatasetCIFAR,
DatasetMNIST,
DatasetFashionMNIST,
)
from network.Parameter import Config
def build_datasets(
cfg: Config,
) -> tuple[
DatasetMaster,
DatasetMaster,
torch.utils.data.DataLoader,
torch.utils.data.DataLoader,
]:
# Load the input data
the_dataset_train: DatasetMaster
the_dataset_test: DatasetMaster
if cfg.data_mode == "CIFAR10":
the_dataset_train = DatasetCIFAR(
train=True, path_pattern=cfg.data_path, path_label=cfg.data_path
)
the_dataset_test = DatasetCIFAR(
train=False, path_pattern=cfg.data_path, path_label=cfg.data_path
)
elif cfg.data_mode == "MNIST":
the_dataset_train = DatasetMNIST(
train=True, path_pattern=cfg.data_path, path_label=cfg.data_path
)
the_dataset_test = DatasetMNIST(
train=False, path_pattern=cfg.data_path, path_label=cfg.data_path
)
elif cfg.data_mode == "MNIST_FASHION":
the_dataset_train = DatasetFashionMNIST(
train=True, path_pattern=cfg.data_path, path_label=cfg.data_path
)
the_dataset_test = DatasetFashionMNIST(
train=False, path_pattern=cfg.data_path, path_label=cfg.data_path
)
else:
raise Exception("data_mode unknown")
if len(cfg.image_statistics.mean) == 0:
cfg.image_statistics.mean = the_dataset_train.mean
# The basic size
cfg.image_statistics.the_size = [
the_dataset_train.pattern_storage.shape[2],
the_dataset_train.pattern_storage.shape[3],
]
# Minus the stuff we cut away in the pattern filter
cfg.image_statistics.the_size[0] -= 2 * cfg.augmentation.crop_width_in_pixel
cfg.image_statistics.the_size[1] -= 2 * cfg.augmentation.crop_width_in_pixel
my_loader_test: torch.utils.data.DataLoader = torch.utils.data.DataLoader(
the_dataset_test, batch_size=cfg.batch_size, shuffle=False
)
my_loader_train: torch.utils.data.DataLoader = torch.utils.data.DataLoader(
the_dataset_train, batch_size=cfg.batch_size, shuffle=True
)
return the_dataset_train, the_dataset_test, my_loader_test, my_loader_train

View file

@ -0,0 +1,86 @@
# %%
import torch
from network.Parameter import Config
try:
from network.SbSLRScheduler import SbSLRScheduler
sbs_lr_scheduler: bool = True
except Exception:
sbs_lr_scheduler = False
def build_lr_scheduler(
optimizer, cfg: Config, logging
) -> list[torch.optim.lr_scheduler.ReduceLROnPlateau | SbSLRScheduler | None]:
assert len(optimizer) > 0
lr_scheduler_list: list[
torch.optim.lr_scheduler.ReduceLROnPlateau | SbSLRScheduler | None
] = []
for id_optimizer in range(0, len(optimizer)):
if cfg.learning_parameters.lr_schedule_name == "None":
logging.info(f"Using lr scheduler for optimizer {id_optimizer} : None")
lr_scheduler_list.append(None)
elif cfg.learning_parameters.lr_schedule_name == "ReduceLROnPlateau":
logging.info(
f"Using lr scheduler for optimizer {id_optimizer}: ReduceLROnPlateau"
)
if optimizer[id_optimizer] is None:
lr_scheduler_list.append(None)
elif (cfg.learning_parameters.lr_scheduler_factor_w <= 0) or (
cfg.learning_parameters.lr_scheduler_patience_w <= 0
):
lr_scheduler_list.append(
torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer[id_optimizer],eps=1e-14,
)
)
else:
lr_scheduler_list.append(
torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer[id_optimizer],
factor=cfg.learning_parameters.lr_scheduler_factor_w,
patience=cfg.learning_parameters.lr_scheduler_patience_w,
eps=1e-14,
)
)
elif cfg.learning_parameters.lr_schedule_name == "SbSLRScheduler":
logging.info(
f"Using lr scheduler for optimizer {id_optimizer}: SbSLRScheduler"
)
if sbs_lr_scheduler is False:
raise Exception(
f"lr_scheduler for optimizer {id_optimizer}: SbSLRScheduler.py missing"
)
if optimizer[id_optimizer] is None:
lr_scheduler_list.append(None)
elif (
(cfg.learning_parameters.lr_scheduler_factor_w <= 0)
or (cfg.learning_parameters.lr_scheduler_patience_w <= 0)
or (cfg.learning_parameters.lr_scheduler_tau_w <= 0)
):
lr_scheduler_list.append(None)
else:
lr_scheduler_list.append(
SbSLRScheduler(
optimizer[id_optimizer],
factor=cfg.learning_parameters.lr_scheduler_factor_w,
patience=cfg.learning_parameters.lr_scheduler_patience_w,
tau=cfg.learning_parameters.lr_scheduler_tau_w,
)
)
else:
raise Exception("lr_scheduler not implemented")
return lr_scheduler_list

354
network/build_network.py Normal file
View file

@ -0,0 +1,354 @@
# %%
import torch
from network.calculate_output_size import calculate_output_size
from network.Parameter import Config
from network.SbS import SbS
from network.SplitOnOffLayer import SplitOnOffLayer
from network.Conv2dApproximation import Conv2dApproximation
def build_network(
cfg: Config, device: torch.device, default_dtype: torch.dtype, logging
) -> torch.nn.Sequential:
network = torch.nn.Sequential()
input_size: list[list[int]] = []
input_size.append(cfg.image_statistics.the_size)
for layer_id in range(0, len(cfg.network_structure.layer_type)):
# #############################################################
# Show infos about the layer:
# #############################################################
logging.info("")
logging.info(f"Layer ID: {layer_id}")
logging.info(f"Layer type: {cfg.network_structure.layer_type[layer_id]}")
# #############################################################
# Fill in the default values
# #############################################################
kernel_size: list[int] = [1, 1]
if len(cfg.network_structure.forward_kernel_size) > layer_id:
kernel_size = cfg.network_structure.forward_kernel_size[layer_id]
padding: list[int] = [0, 0]
if len(cfg.network_structure.padding) > layer_id:
padding = cfg.network_structure.padding[layer_id]
dilation: list[int] = [1, 1]
if len(cfg.network_structure.dilation) > layer_id:
dilation = cfg.network_structure.dilation[layer_id]
strides: list[int] = [1, 1]
if len(cfg.network_structure.strides) > layer_id:
if len(cfg.network_structure.strides[layer_id]) == 2:
strides = cfg.network_structure.strides[layer_id]
in_channels: int = -1
out_channels: int = -1
if len(cfg.network_structure.forward_neuron_numbers) > layer_id:
if len(cfg.network_structure.forward_neuron_numbers[layer_id]) == 2:
in_channels = cfg.network_structure.forward_neuron_numbers[layer_id][0]
out_channels = cfg.network_structure.forward_neuron_numbers[layer_id][1]
weight_noise_range: list[float] = [1.0, 1.1]
if len(cfg.learning_parameters.weight_noise_range) == 2:
weight_noise_range = [
float(cfg.learning_parameters.weight_noise_range[0]),
float(cfg.learning_parameters.weight_noise_range[1]),
]
logging.info(f"Input channels: {in_channels}")
logging.info(f"Output channels: {out_channels}")
logging.info(f"Kernel size: {kernel_size}")
logging.info(f"Stride: {strides}")
logging.info(f"Dilation: {dilation}")
logging.info(f"Padding: {padding}")
# Conv2D
bias: bool = True
# Approx settings
approximation_enable: bool = False
if len(cfg.approximation_setting.approximation_enable) > layer_id:
approximation_enable = cfg.approximation_setting.approximation_enable[
layer_id
]
logging.info(f"Approximation Enable: {approximation_enable}")
elif len(cfg.approximation_setting.approximation_enable) == 1:
approximation_enable = cfg.approximation_setting.approximation_enable[0]
logging.info(f"Approximation Enable: {approximation_enable}")
number_of_trunc_bits: int = -1
if len(cfg.approximation_setting.number_of_trunc_bits) > layer_id:
number_of_trunc_bits = cfg.approximation_setting.number_of_trunc_bits[
layer_id
]
logging.info(f"Number of trunc bits: {number_of_trunc_bits}")
elif len(cfg.approximation_setting.number_of_trunc_bits) == 1:
number_of_trunc_bits = cfg.approximation_setting.number_of_trunc_bits[0]
logging.info(f"Number of trunc bits: {number_of_trunc_bits}")
number_of_frac_bits: int = -1
if len(cfg.approximation_setting.number_of_frac_bits) > layer_id:
number_of_frac_bits = cfg.approximation_setting.number_of_frac_bits[
layer_id
]
logging.info(f"Number of frac bits: {number_of_trunc_bits}")
elif len(cfg.approximation_setting.number_of_frac_bits) == 1:
number_of_frac_bits = cfg.approximation_setting.number_of_frac_bits[0]
logging.info(f"Number of frac bits: {number_of_trunc_bits}")
# Weights: Trainable?
w_trainable: bool = False
if len(cfg.learning_parameters.w_trainable) > layer_id:
w_trainable = cfg.learning_parameters.w_trainable[layer_id]
elif len(cfg.learning_parameters.w_trainable) == 1:
w_trainable = cfg.learning_parameters.w_trainable[0]
logging.info(f"W trainable?: {w_trainable}")
# SbS Setting
sbs_skip_gradient_calculation: bool = False
if len(cfg.learning_parameters.sbs_skip_gradient_calculation) > layer_id:
sbs_skip_gradient_calculation = (
cfg.learning_parameters.sbs_skip_gradient_calculation[layer_id]
)
elif len(cfg.learning_parameters.sbs_skip_gradient_calculation) == 1:
sbs_skip_gradient_calculation = (
cfg.learning_parameters.sbs_skip_gradient_calculation[0]
)
# #############################################################
# SbS layer:
# #############################################################
if cfg.network_structure.layer_type[layer_id].upper().startswith("SBS") is True:
assert in_channels > 0
assert out_channels > 0
number_of_spikes: int = -1
if len(cfg.number_of_spikes) > layer_id:
number_of_spikes = cfg.number_of_spikes[layer_id]
elif len(cfg.number_of_spikes) == 1:
number_of_spikes = cfg.number_of_spikes[0]
assert number_of_spikes > 0
logging.info(
f"Layer: {layer_id} -> SbS Layer with {number_of_spikes} spikes"
)
is_pooling_layer: bool = False
if cfg.network_structure.layer_type[layer_id].upper().find("POOLING") != -1:
is_pooling_layer = True
network.append(
SbS(
number_of_input_neurons=in_channels,
number_of_neurons=out_channels,
input_size=input_size[-1],
forward_kernel_size=kernel_size,
number_of_spikes=number_of_spikes,
epsilon_t=cfg.get_epsilon_t(number_of_spikes),
epsilon_xy_intitial=cfg.learning_parameters.eps_xy_intitial,
epsilon_0=cfg.epsilon_0,
weight_noise_range=weight_noise_range,
is_pooling_layer=is_pooling_layer,
strides=strides,
dilation=dilation,
padding=padding,
number_of_cpu_processes=cfg.number_of_cpu_processes,
w_trainable=w_trainable,
# keep_last_grad_scale=cfg.learning_parameters.kepp_last_grad_scale,
# disable_scale_grade=cfg.learning_parameters.disable_scale_grade,
forgetting_offset=cfg.forgetting_offset,
skip_gradient_calculation=sbs_skip_gradient_calculation,
device=device,
default_dtype=default_dtype,
)
)
# Adding the x,y output dimensions
input_size.append(network[-1]._output_size.tolist())
network[-1]._output_layer = False
if layer_id == len(cfg.network_structure.layer_type) - 1:
network[-1]._output_layer = True
network[-1]._local_learning = False
if cfg.network_structure.layer_type[layer_id].upper().find("LOCAL") != -1:
network[-1]._local_learning = True
# #############################################################
# Split On Off Layer:
# #############################################################
elif (
cfg.network_structure.layer_type[layer_id].upper().startswith("ONOFF")
is True
):
logging.info(f"Layer: {layer_id} -> Split On Off Layer")
network.append(
SplitOnOffLayer(
device=device,
default_dtype=default_dtype,
)
)
input_size.append(input_size[-1])
# #############################################################
# PyTorch CONV2D layer:
# #############################################################
elif (
cfg.network_structure.layer_type[layer_id].upper().startswith("CONV2D")
is True
):
assert in_channels > 0
assert out_channels > 0
logging.info(f"Layer: {layer_id} -> CONV2D Layer")
network.append(
torch.nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(int(kernel_size[0]), int(kernel_size[1])),
stride=(int(strides[0]), int(strides[1])),
dilation=(int(dilation[0]), int(dilation[1])),
bias=bias,
padding=(int(padding[0]), int(padding[1])),
device=device,
dtype=default_dtype,
)
)
# I need this later...
network[-1]._w_trainable = w_trainable
# Calculate the x,y output dimensions
input_size_temp = calculate_output_size(
value=input_size[-1],
kernel_size=kernel_size,
stride=strides,
dilation=dilation,
padding=padding,
).tolist()
input_size.append(input_size_temp)
# #############################################################
# PyTorch RELU layer:
# #############################################################
elif (
cfg.network_structure.layer_type[layer_id].upper().startswith("RELU")
is True
):
logging.info(f"Layer: {layer_id} -> RELU Layer")
network.append(torch.nn.ReLU())
input_size.append(input_size[-1])
# #############################################################
# PyTorch MAX Pooling layer:
# #############################################################
elif (
cfg.network_structure.layer_type[layer_id].upper().startswith("MAX POOLING")
is True
):
logging.info(f"Layer: {layer_id} -> MAX POOLING Layer")
network.append(
torch.nn.MaxPool2d(
kernel_size=(int(kernel_size[0]), int(kernel_size[1])),
stride=(int(strides[0]), int(strides[1])),
padding=(int(padding[0]), int(padding[1])),
dilation=(int(dilation[0]), int(dilation[1])),
)
)
# Calculate the x,y output dimensions
input_size_temp = calculate_output_size(
value=input_size[-1],
kernel_size=kernel_size,
stride=strides,
dilation=dilation,
padding=padding,
).tolist()
input_size.append(input_size_temp)
# #############################################################
# PyTorch Average Pooling layer:
# #############################################################
elif (
cfg.network_structure.layer_type[layer_id]
.upper()
.startswith("AVERAGE POOLING")
is True
):
logging.info(f"Layer: {layer_id} -> AVERAGE POOLING Layer")
network.append(
torch.nn.AvgPool2d(
kernel_size=(int(kernel_size[0]), int(kernel_size[1])),
stride=(int(strides[0]), int(strides[1])),
padding=(int(padding[0]), int(padding[1])),
)
)
# Calculate the x,y output dimensions
input_size_temp = calculate_output_size(
value=input_size[-1],
kernel_size=kernel_size,
stride=strides,
dilation=dilation,
padding=padding,
).tolist()
input_size.append(input_size_temp)
# #############################################################
# Approx CONV2D layer:
# #############################################################
elif (
cfg.network_structure.layer_type[layer_id]
.upper()
.startswith("APPROX CONV2D")
is True
):
assert in_channels > 0
assert out_channels > 0
logging.info(f"Layer: {layer_id} -> Approximation CONV2D Layer")
network.append(
Conv2dApproximation(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(int(kernel_size[0]), int(kernel_size[1])),
stride=(int(strides[0]), int(strides[1])),
dilation=(int(dilation[0]), int(dilation[1])),
bias=bias,
padding=(int(padding[0]), int(padding[1])),
device=device,
dtype=default_dtype,
approximation_enable=approximation_enable,
number_of_trunc_bits=number_of_trunc_bits,
number_of_frac=number_of_frac_bits,
number_of_processes=cfg.number_of_cpu_processes,
)
)
# I need this later...
network[-1]._w_trainable = w_trainable
# Calculate the x,y output dimensions
input_size_temp = calculate_output_size(
value=input_size[-1],
kernel_size=kernel_size,
stride=strides,
dilation=dilation,
padding=padding,
).tolist()
input_size.append(input_size_temp)
# #############################################################
# Failure becaue we didn't found the selection of layer
# #############################################################
else:
raise Exception(
f"Unknown layer type: {cfg.network_structure.layer_type[layer_id]}"
)
return network

View file

@ -0,0 +1,83 @@
# %%
import torch
from network.Parameter import Config
from network.SbS import SbS
from network.Conv2dApproximation import Conv2dApproximation
from network.Adam import Adam
def build_optimizer(
network: torch.nn.Sequential, cfg: Config, logging
) -> list[torch.optim.Optimizer | None]:
parameter_list_weights: list = []
parameter_list_sbs: list = []
# ###############################################
# Put all parameter that needs to be learned
# in a parameter list.
# ###############################################
for id in range(0, len(network)):
if (isinstance(network[id], SbS) is True) and (
network[id]._w_trainable is True
):
parameter_list_weights.append(network[id]._weights)
parameter_list_sbs.append(True)
if (isinstance(network[id], torch.nn.modules.conv.Conv2d) is True) and (
network[id]._w_trainable is True
):
for id_parameter in network[id].parameters():
parameter_list_weights.append(id_parameter)
parameter_list_sbs.append(False)
if (isinstance(network[id], Conv2dApproximation) is True) and (
network[id]._w_trainable is True
):
for id_parameter in network[id].parameters():
parameter_list_weights.append(id_parameter)
parameter_list_sbs.append(False)
logging.info(
f"Number of parameters found to optimize: {len(parameter_list_weights)}"
)
# ###############################################
# Connect the parameters to an optimizer
# ###############################################
if cfg.learning_parameters.optimizer_name == "Adam":
logging.info("Using optimizer: Adam")
if len(parameter_list_weights) == 0:
optimizer_wf: torch.optim.Optimizer | None = None
elif cfg.learning_parameters.learning_rate_gamma_w > 0:
optimizer_wf = Adam(
parameter_list_weights,
parameter_list_sbs,
lr=cfg.learning_parameters.learning_rate_gamma_w,
)
else:
optimizer_wf = Adam(parameter_list_weights, parameter_list_sbs)
elif cfg.learning_parameters.optimizer_name == "SGD":
logging.info("Using optimizer: SGD")
if len(parameter_list_weights) == 0:
optimizer_wf = None
elif cfg.learning_parameters.learning_rate_gamma_w > 0:
optimizer_wf = torch.optim.SGD(
parameter_list_weights,
lr=cfg.learning_parameters.learning_rate_gamma_w,
)
else:
assert cfg.learning_parameters.learning_rate_gamma_w > 0
else:
raise Exception("Optimizer not implemented")
optimizer = []
optimizer.append(optimizer_wf)
return optimizer

View file

@ -0,0 +1,95 @@
# %%
import torch
def calculate_output_size(
value: list[int],
kernel_size: list[int],
stride: list[int],
dilation: list[int],
padding: list[int],
) -> torch.Tensor:
assert len(value) == 2
assert len(kernel_size) == 2
assert len(stride) == 2
assert len(dilation) == 2
assert len(padding) == 2
coordinates_0, coordinates_1 = get_coordinates(
value=value,
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding,
)
output_size: torch.Tensor = torch.tensor(
[
coordinates_0.shape[1],
coordinates_1.shape[1],
],
dtype=torch.int64,
)
return output_size
def get_coordinates(
value: list[int],
kernel_size: list[int],
stride: list[int],
dilation: list[int],
padding: list[int],
) -> tuple[torch.Tensor, torch.Tensor]:
"""Function converts parameter in coordinates
for the convolution window"""
unfold_0: torch.nn.Unfold = torch.nn.Unfold(
kernel_size=(int(kernel_size[0]), 1),
dilation=int(dilation[0]),
padding=int(padding[0]),
stride=int(stride[0]),
)
unfold_1: torch.nn.Unfold = torch.nn.Unfold(
kernel_size=(1, int(kernel_size[1])),
dilation=int(dilation[1]),
padding=int(padding[1]),
stride=int(stride[1]),
)
coordinates_0: torch.Tensor = (
unfold_0(
torch.unsqueeze(
torch.unsqueeze(
torch.unsqueeze(
torch.arange(0, int(value[0]), dtype=torch.float32),
1,
),
0,
),
0,
)
)
.squeeze(0)
.type(torch.int64)
)
coordinates_1: torch.Tensor = (
unfold_1(
torch.unsqueeze(
torch.unsqueeze(
torch.unsqueeze(
torch.arange(0, int(value[1]), dtype=torch.float32),
0,
),
0,
),
0,
)
)
.squeeze(0)
.type(torch.int64)
)
return coordinates_0, coordinates_1

View file

@ -0,0 +1,126 @@
# MIT License
# Copyright 2022 University of Bremen
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#
# David Rotermund ( davrot@uni-bremen.de )
#
#
# Release history:
# ================
# 1.0.0 -- 01.05.2022: first release
#
#
import numpy as np
import pickle
def give_filenames(id: int) -> tuple[str, str, int]:
if id == 0:
start_id: int = 0
prefix: str = "Test"
filename: str = "cifar-10-batches-py/test_batch"
if id == 1:
start_id = 0
prefix = "Train"
filename = "cifar-10-batches-py/data_batch_1"
if id == 2:
start_id = 10000
prefix = "Train"
filename = "cifar-10-batches-py/data_batch_2"
if id == 3:
start_id = 20000
prefix = "Train"
filename = "cifar-10-batches-py/data_batch_3"
if id == 4:
start_id = 30000
prefix = "Train"
filename = "cifar-10-batches-py/data_batch_4"
if id == 5:
start_id = 40000
prefix = "Train"
filename = "cifar-10-batches-py/data_batch_5"
return filename, prefix, start_id
def load_data(filename: str) -> tuple[np.ndarray, np.ndarray]:
fo = open(filename, "rb")
dict_data = pickle.load(fo, encoding="bytes")
_, labels_temp, data_temp, _ = dict_data.items()
data: np.ndarray = np.array(data_temp[1])
labels: np.ndarray = np.array(labels_temp[1])
return data, labels
def split_into_three_color_channels(
image: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
channel_r = image[0:1024].astype(np.float32)
channel_r = channel_r.reshape(32, 32)
channel_g = image[1024:2048].astype(np.float32)
channel_g = channel_g.reshape(32, 32)
channel_b = image[2048:3072].astype(np.float32)
channel_b = channel_b.reshape(32, 32)
return channel_r, channel_g, channel_b
def process_data_set(test_data_mode: bool) -> None:
if test_data_mode is True:
filename_out_pattern: str = "TestPatternStorage.npy"
filename_out_label: str = "TestLabelStorage.npy"
number_of_pictures: int = 10000
start_id: int = 0
end_id: int = 0
else:
filename_out_pattern = "TrainPatternStorage.npy"
filename_out_label = "TrainLabelStorage.npy"
number_of_pictures = 50000
start_id = 1
end_id = 5
np_data: np.ndarray = np.zeros((number_of_pictures, 32, 32, 3), dtype=np.float32)
np_label: np.ndarray = np.zeros((number_of_pictures), dtype=np.uint64)
for id in range(start_id, end_id + 1):
filename, _, start_id_pattern = give_filenames(id)
pictures, labels = load_data(filename)
for i in range(0, pictures.shape[0]):
channel_r, channel_g, channel_b = split_into_three_color_channels(
pictures[i, :]
)
np_data[i + start_id_pattern, :, :, 0] = channel_r
np_data[i + start_id_pattern, :, :, 1] = channel_g
np_data[i + start_id_pattern, :, :, 2] = channel_b
np_label[i + start_id_pattern] = labels[i]
np_data /= np.max(np_data)
label_storage: np.ndarray = np_label.astype(dtype=np.uint64)
pattern_storage: np.ndarray = np_data.astype(dtype=np.float32)
np.save(filename_out_pattern, pattern_storage)
np.save(filename_out_label, label_storage)
process_data_set(True)
process_data_set(False)

View file

@ -0,0 +1,8 @@
https://www.cs.toronto.edu/~kriz/cifar.html
Download the CIFAR-10 python version
https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Then
tar -xvzf cifar-10-python.tar.gz
python convert.py

View file

@ -0,0 +1,4 @@
{
"data_path": "./DATA_CIFAR10/",
"data_mode": "CIFAR10"
}

View file

@ -0,0 +1,161 @@
# MIT License
# Copyright 2022 University of Bremen
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#
# David Rotermund ( davrot@uni-bremen.de )
#
#
# Release history:
# ================
# 1.0.0 -- 01.05.2022: first release
#
#
import numpy as np
# [offset] [type] [value] [description]
# 0000 32 bit integer 0x00000801(2049) magic number (MSB first)
# 0004 32 bit integer 60000 number of items
# 0008 unsigned byte ?? label
# 0009 unsigned byte ?? label
# ........
# xxxx unsigned byte ?? label
# The labels values are 0 to 9.
class ReadLabel:
"""Class for reading the labels from an MNIST label file"""
def __init__(self, filename):
self.filename: str = filename
self.data = self.read_from_file(filename)
def read_from_file(self, filename):
int32_data = np.dtype(np.uint32)
int32_data = int32_data.newbyteorder(">")
file = open(filename, "rb")
magic_flag = np.frombuffer(file.read(4), int32_data)[0]
if magic_flag != 2049:
data = np.zeros(0)
number_of_elements = 0
else:
number_of_elements = np.frombuffer(file.read(4), int32_data)[0]
if number_of_elements < 1:
data = np.zeros(0)
else:
data = np.frombuffer(file.read(number_of_elements), dtype=np.uint8)
file.close()
return data
# [offset] [type] [value] [description]
# 0000 32 bit integer 0x00000803(2051) magic number
# 0004 32 bit integer 60000 number of images
# 0008 32 bit integer 28 number of rows
# 0012 32 bit integer 28 number of columns
# 0016 unsigned byte ?? pixel
# 0017 unsigned byte ?? pixel
# ........
# xxxx unsigned byte ?? pixel
# Pixels are organized row-wise.
# Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).
class ReadPicture:
"""Class for reading the images from an MNIST image file"""
def __init__(self, filename):
self.filename: str = filename
self.data = self.read_from_file(filename)
def read_from_file(self, filename):
int32_data = np.dtype(np.uint32)
int32_data = int32_data.newbyteorder(">")
file = open(filename, "rb")
magic_flag = np.frombuffer(file.read(4), int32_data)[0]
if magic_flag != 2051:
data = np.zeros(0)
number_of_elements = 0
else:
number_of_elements = np.frombuffer(file.read(4), int32_data)[0]
if number_of_elements < 1:
data = np.zeros(0)
number_of_rows = 0
else:
number_of_rows = np.frombuffer(file.read(4), int32_data)[0]
if number_of_rows != 28:
data = np.zeros(0)
number_of_columns = 0
else:
number_of_columns = np.frombuffer(file.read(4), int32_data)[0]
if number_of_columns != 28:
data = np.zeros(0)
else:
data = np.frombuffer(
file.read(number_of_elements * number_of_rows * number_of_columns),
dtype=np.uint8,
)
data = data.reshape(number_of_elements, number_of_columns, number_of_rows)
file.close()
return data
def proprocess_data_set(test_mode):
if test_mode is True:
filename_out_pattern: str = "TestPatternStorage.npy"
filename_out_label: str = "TestLabelStorage.npy"
filename_in_image: str = "t10k-images-idx3-ubyte"
filename_in_label = "t10k-labels-idx1-ubyte"
else:
filename_out_pattern = "TrainPatternStorage.npy"
filename_out_label = "TrainLabelStorage.npy"
filename_in_image = "train-images-idx3-ubyte"
filename_in_label = "train-labels-idx1-ubyte"
pictures = ReadPicture(filename_in_image)
labels = ReadLabel(filename_in_label)
# Down to 0 ... 1.0
max_value = np.max(pictures.data.astype(np.float32))
d = np.float32(pictures.data.astype(np.float32) / max_value)
label_storage = np.uint64(labels.data)
pattern_storage = d.astype(np.float32)
np.save(filename_out_pattern, pattern_storage)
np.save(filename_out_label, label_storage)
proprocess_data_set(True)
proprocess_data_set(False)

View file

@ -0,0 +1,8 @@
https://github.com/zalandoresearch/fashion-mnist
We need:
t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz
Then
gzip -d *.gz
python convert.py

View file

@ -0,0 +1,4 @@
{
"data_path": "./DATA_FASHION_MNIST/",
"data_mode": "MNIST_FASHION"
}

View file

@ -0,0 +1,161 @@
# MIT License
# Copyright 2022 University of Bremen
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#
# David Rotermund ( davrot@uni-bremen.de )
#
#
# Release history:
# ================
# 1.0.0 -- 01.05.2022: first release
#
#
import numpy as np
# [offset] [type] [value] [description]
# 0000 32 bit integer 0x00000801(2049) magic number (MSB first)
# 0004 32 bit integer 60000 number of items
# 0008 unsigned byte ?? label
# 0009 unsigned byte ?? label
# ........
# xxxx unsigned byte ?? label
# The labels values are 0 to 9.
class ReadLabel:
"""Class for reading the labels from an MNIST label file"""
def __init__(self, filename):
self.filename: str = filename
self.data = self.read_from_file(filename)
def read_from_file(self, filename):
int32_data = np.dtype(np.uint32)
int32_data = int32_data.newbyteorder(">")
file = open(filename, "rb")
magic_flag = np.frombuffer(file.read(4), int32_data)[0]
if magic_flag != 2049:
data = np.zeros(0)
number_of_elements = 0
else:
number_of_elements = np.frombuffer(file.read(4), int32_data)[0]
if number_of_elements < 1:
data = np.zeros(0)
else:
data = np.frombuffer(file.read(number_of_elements), dtype=np.uint8)
file.close()
return data
# [offset] [type] [value] [description]
# 0000 32 bit integer 0x00000803(2051) magic number
# 0004 32 bit integer 60000 number of images
# 0008 32 bit integer 28 number of rows
# 0012 32 bit integer 28 number of columns
# 0016 unsigned byte ?? pixel
# 0017 unsigned byte ?? pixel
# ........
# xxxx unsigned byte ?? pixel
# Pixels are organized row-wise.
# Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).
class ReadPicture:
"""Class for reading the images from an MNIST image file"""
def __init__(self, filename):
self.filename: str = filename
self.data = self.read_from_file(filename)
def read_from_file(self, filename):
int32_data = np.dtype(np.uint32)
int32_data = int32_data.newbyteorder(">")
file = open(filename, "rb")
magic_flag = np.frombuffer(file.read(4), int32_data)[0]
if magic_flag != 2051:
data = np.zeros(0)
number_of_elements = 0
else:
number_of_elements = np.frombuffer(file.read(4), int32_data)[0]
if number_of_elements < 1:
data = np.zeros(0)
number_of_rows = 0
else:
number_of_rows = np.frombuffer(file.read(4), int32_data)[0]
if number_of_rows != 28:
data = np.zeros(0)
number_of_columns = 0
else:
number_of_columns = np.frombuffer(file.read(4), int32_data)[0]
if number_of_columns != 28:
data = np.zeros(0)
else:
data = np.frombuffer(
file.read(number_of_elements * number_of_rows * number_of_columns),
dtype=np.uint8,
)
data = data.reshape(number_of_elements, number_of_columns, number_of_rows)
file.close()
return data
def proprocess_data_set(test_mode):
if test_mode is True:
filename_out_pattern: str = "TestPatternStorage.npy"
filename_out_label: str = "TestLabelStorage.npy"
filename_in_image: str = "t10k-images-idx3-ubyte"
filename_in_label = "t10k-labels-idx1-ubyte"
else:
filename_out_pattern = "TrainPatternStorage.npy"
filename_out_label = "TrainLabelStorage.npy"
filename_in_image = "train-images-idx3-ubyte"
filename_in_label = "train-labels-idx1-ubyte"
pictures = ReadPicture(filename_in_image)
labels = ReadLabel(filename_in_label)
# Down to 0 ... 1.0
max_value = np.max(pictures.data.astype(np.float32))
d = np.float32(pictures.data.astype(np.float32) / max_value)
label_storage = np.uint64(labels.data)
pattern_storage = d.astype(np.float32)
np.save(filename_out_pattern, pattern_storage)
np.save(filename_out_label, label_storage)
proprocess_data_set(True)
proprocess_data_set(False)

View file

@ -0,0 +1,8 @@
http://yann.lecun.com/exdb/mnist/
We need:
t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz
Then
gzip -d *.gz
python convert.py

View file

@ -0,0 +1,4 @@
{
"data_path": "./DATA_MNIST/",
"data_mode": "MNIST"
}

View file

@ -0,0 +1,144 @@
# %%
import torch
import glob
import numpy as np
from network.SbS import SbS
from network.SplitOnOffLayer import SplitOnOffLayer
from network.Conv2dApproximation import Conv2dApproximation
def load_previous_weights(
network: torch.nn.Sequential,
overload_path: str,
logging,
device: torch.device,
default_dtype: torch.dtype,
) -> None:
for id in range(0, len(network)):
# #################################################
# SbS
# #################################################
if isinstance(network[id], SbS) is True:
# Are there weights that overwrite the initial weights?
file_to_load = glob.glob(overload_path + "/Weight_L" + str(id) + "_*.npy")
if len(file_to_load) > 1:
raise Exception(
f"Too many previous weights files {overload_path}/Weight_L{id}*.npy"
)
if len(file_to_load) == 1:
network[id].weights = torch.tensor(
np.load(file_to_load[0]),
dtype=default_dtype,
device=device,
)
logging.info(f"Weights file used for layer {id} : {file_to_load[0]}")
if isinstance(network[id], torch.nn.modules.conv.Conv2d) is True:
# #################################################
# Conv2d weights
# #################################################
# Are there weights that overwrite the initial weights?
file_to_load = glob.glob(overload_path + "/Weight_L" + str(id) + "_*.npy")
if len(file_to_load) > 1:
raise Exception(
f"Too many previous weights files {overload_path}/Weight_L{id}*.npy"
)
if len(file_to_load) == 1:
network[id]._parameters["weight"].data = torch.tensor(
np.load(file_to_load[0]),
dtype=default_dtype,
device=device,
)
logging.info(f"Weights file used for layer {id} : {file_to_load[0]}")
# #################################################
# Conv2d bias
# #################################################
# Are there biases that overwrite the initial weights?
file_to_load = glob.glob(overload_path + "/Bias_L" + str(id) + "_*.npy")
if len(file_to_load) > 1:
raise Exception(
f"Too many previous weights files {overload_path}/Weight_L{id}*.npy"
)
if len(file_to_load) == 1:
network[id]._parameters["bias"].data = torch.tensor(
np.load(file_to_load[0]),
dtype=default_dtype,
device=device,
)
logging.info(f"Bias file used for layer {id} : {file_to_load[0]}")
if isinstance(network[id], Conv2dApproximation) is True:
# #################################################
# Approximate Conv2d weights
# #################################################
# Are there weights that overwrite the initial weights?
file_to_load = glob.glob(overload_path + "/Weight_L" + str(id) + "_*.npy")
if len(file_to_load) > 1:
raise Exception(
f"Too many previous weights files {overload_path}/Weight_L{id}*.npy"
)
if len(file_to_load) == 1:
network[id].weights.data = torch.tensor(
np.load(file_to_load[0]),
dtype=default_dtype,
device=device,
)
logging.info(f"Weights file used for layer {id} : {file_to_load[0]}")
# #################################################
# Approximate Conv2d bias
# #################################################
# Are there biases that overwrite the initial weights?
file_to_load = glob.glob(overload_path + "/Bias_L" + str(id) + "_*.npy")
if len(file_to_load) > 1:
raise Exception(
f"Too many previous weights files {overload_path}/Weight_L{id}*.npy"
)
if len(file_to_load) == 1:
network[id].bias.data = torch.tensor(
np.load(file_to_load[0]),
dtype=default_dtype,
device=device,
)
logging.info(f"Bias file used for layer {id} : {file_to_load[0]}")
# #################################################
# SplitOnOffLayer
# #################################################
if isinstance(network[id], SplitOnOffLayer) is True:
# Are there weights that overwrite the initial weights?
file_to_load = glob.glob(overload_path + "/Mean_L" + str(id) + "_*.npy")
if len(file_to_load) > 1:
raise Exception(
f"Too many previous mean files {overload_path}/Mean_L{id}*.npy"
)
if len(file_to_load) == 1:
network[id].mean = torch.tensor(
np.load(file_to_load[0]),
dtype=default_dtype,
device=device,
)
logging.info(f"Meanfile used for layer {id} : {file_to_load[0]}")

512
network/loop_train_test.py Normal file
View file

@ -0,0 +1,512 @@
import torch
import time
from network.Parameter import Config
from torch.utils.tensorboard import SummaryWriter
from network.SbS import SbS
from network.save_weight_and_bias import save_weight_and_bias
def add_weight_and_bias_to_histogram(
network: torch.nn.modules.container.Sequential,
tb: SummaryWriter,
iteration_number: int,
) -> None:
for id in range(0, len(network)):
# ################################################
# Log the SbS Weights
# ################################################
if isinstance(network[id], SbS) is True:
if network[id]._w_trainable is True:
try:
tb.add_histogram(
f"Weights Layer {id}",
network[id].weights,
iteration_number,
)
except ValueError:
pass
# ################################################
# Log the Conv2 Weights and Biases
# ################################################
if isinstance(network[id], torch.nn.modules.conv.Conv2d) is True:
if network[id]._w_trainable is True:
try:
tb.add_histogram(
f"Weights Layer {id}",
network[id]._parameters["weight"].data,
iteration_number,
)
except ValueError:
pass
try:
tb.add_histogram(
f"Bias Layer {id}",
network[id]._parameters["bias"].data,
iteration_number,
)
except ValueError:
pass
tb.flush()
# loss_mode == 0: "normal" SbS loss function mixture
# loss_mode == 1: cross_entropy
def loss_function(
h: torch.Tensor,
labels: torch.Tensor,
device: torch.device,
default_dtype: torch.dtype,
loss_mode: int = 0,
number_of_output_neurons: int = 10,
loss_coeffs_mse: float = 0.0,
loss_coeffs_kldiv: float = 0.0,
) -> torch.Tensor | None:
assert loss_mode >= 0
assert loss_mode <= 1
h = h.squeeze(-1).squeeze(-1)
assert h.ndim == 2
if loss_mode == 0:
# Convert label into one hot
target_one_hot: torch.Tensor = torch.zeros(
(
labels.shape[0],
number_of_output_neurons,
),
device=device,
dtype=default_dtype,
)
target_one_hot.scatter_(
1,
labels.to(device).unsqueeze(1),
torch.ones(
(labels.shape[0], 1),
device=device,
dtype=default_dtype,
),
).unsqueeze(-1).unsqueeze(-1)
h_y1 = torch.log(h + 1e-20)
my_loss: torch.Tensor = (
torch.nn.functional.mse_loss(
h,
target_one_hot,
reduction="sum",
)
* loss_coeffs_mse
+ torch.nn.functional.kl_div(h_y1, target_one_hot + 1e-20, reduction="sum")
* loss_coeffs_kldiv
) / (loss_coeffs_kldiv + loss_coeffs_mse)
return my_loss
elif loss_mode == 1:
my_loss = torch.nn.functional.cross_entropy(
h.squeeze(-1).squeeze(-1), labels.to(device)
)
return my_loss
else:
return None
def forward_pass_train(
input: torch.Tensor,
labels: torch.Tensor,
the_dataset_train,
cfg: Config,
network: torch.nn.modules.container.Sequential,
device: torch.device,
default_dtype: torch.dtype,
) -> list[torch.Tensor]:
h_collection = []
h_collection.append(
the_dataset_train.pattern_filter_train(input, cfg)
.type(dtype=default_dtype)
.to(device=device)
)
for id in range(0, len(network)):
if isinstance(network[id], SbS) is True:
h_collection.append(network[id](h_collection[-1], labels))
else:
h_collection.append(network[id](h_collection[-1]))
return h_collection
def forward_pass_test(
input: torch.Tensor,
the_dataset_test,
cfg: Config,
network: torch.nn.modules.container.Sequential,
device: torch.device,
default_dtype: torch.dtype,
) -> list[torch.Tensor]:
h_collection = []
h_collection.append(
the_dataset_test.pattern_filter_test(input, cfg)
.type(dtype=default_dtype)
.to(device=device)
)
for id in range(0, len(network)):
h_collection.append(network[id](h_collection[-1]))
return h_collection
def run_optimizer(
network: torch.nn.modules.container.Sequential,
optimizer: list,
cfg: Config,
) -> None:
for id in range(0, len(network)):
if isinstance(network[id], SbS) is True:
network[id].update_pre_care()
for optimizer_item in optimizer:
if optimizer_item is not None:
optimizer_item.step()
for id in range(0, len(network)):
if isinstance(network[id], SbS) is True:
network[id].update_after_care(
cfg.learning_parameters.learning_rate_threshold_w
/ float(
network[id]._number_of_input_neurons
# * network[id]._kernel_size[0]
# * network[id]._kernel_size[1]
),
)
# ####################################
# Update the learning rate
# ####################################
def run_lr_scheduler(
cfg: Config,
lr_scheduler,
optimizer,
performance_for_batch: float,
my_loss_for_batch: float,
tb,
logging,
) -> None:
# Inter-epoch learning rate adaptation
for lr_scheduler_item in lr_scheduler:
if (
(lr_scheduler_item is not None)
and (performance_for_batch >= 0.0)
and (my_loss_for_batch >= 0.0)
):
if cfg.learning_parameters.lr_scheduler_use_performance is True:
lr_scheduler_item.step(100.0 - performance_for_batch)
else:
lr_scheduler_item.step(my_loss_for_batch)
tb.add_scalar(
"Train Error",
100.0 - performance_for_batch,
cfg.epoch_id,
)
tb.add_scalar("Train Loss", my_loss_for_batch, cfg.epoch_id)
tb.add_scalar(
"Learning Rate Scale WF",
optimizer[0].param_groups[-1]["lr"],
cfg.epoch_id,
)
tb.flush()
# def deal_with_gradient_scale(epoch_id: int, mini_batch_number: int, network):
# if (epoch_id == 0) and (mini_batch_number == 0):
# for id in range(0, len(network)):
# if isinstance(network[id], SbS) is True:
# network[id].after_batch(True)
# else:
# for id in range(0, len(network)):
# if isinstance(network[id], SbS) is True:
# network[id].after_batch()
def loop_train(
cfg: Config,
network: torch.nn.modules.container.Sequential,
my_loader_train: torch.utils.data.dataloader.DataLoader,
the_dataset_train,
optimizer: list,
device: torch.device,
default_dtype: torch.dtype,
logging,
adapt_learning_rate: bool,
tb: SummaryWriter,
lr_scheduler,
last_test_performance: float,
) -> tuple[float, float, float, float]:
correct_in_minibatch: int = 0
loss_in_minibatch: float = 0.0
number_of_pattern_in_minibatch: int = 0
mini_batch_number: int = -1
full_loss: float = 0.0
full_correct: float = 0.0
full_count: float = 0.0
epoch_id: int = cfg.epoch_id
my_loss_for_batch: float = -1.0
performance_for_batch: float = -1.0
time_forward: float = 0.0
time_backward: float = 0.0
with torch.enable_grad():
for h_x, h_x_labels in my_loader_train:
time_mini_batch_start: float = time.perf_counter()
# ############################################################
# Reset the gradient after an update (or the first loop pass)
# ############################################################
if number_of_pattern_in_minibatch == 0:
# Reset the gradient of the torch optimizers
for optimizer_item in optimizer:
if optimizer_item is not None:
optimizer_item.zero_grad()
loss_in_minibatch = 0.0
mini_batch_number += 1
correct_in_minibatch = 0
time_forward = 0.0
time_backward = 0.0
# ####################################
# Update the learning rate
# ####################################
if adapt_learning_rate is True:
run_lr_scheduler(
cfg=cfg,
lr_scheduler=lr_scheduler,
optimizer=optimizer,
performance_for_batch=performance_for_batch,
my_loss_for_batch=my_loss_for_batch,
tb=tb,
logging=logging,
)
logging.info(
(
f"\t\t\tLearning rate: "
f"weights:{optimizer[0].param_groups[-1]['lr']:^15.3e} "
)
)
if last_test_performance < 0:
logging.info("")
else:
logging.info(
(
f"\t\t\tLast test performance: "
f"{last_test_performance/100.0:^6.2%}"
)
)
logging.info("----------------")
number_of_pattern_in_minibatch += h_x_labels.shape[0]
full_count += h_x_labels.shape[0]
# #####################################################
# The network does the forward pass (training)
# #####################################################
h_collection = forward_pass_train(
input=h_x,
labels=h_x_labels,
the_dataset_train=the_dataset_train,
cfg=cfg,
network=network,
device=device,
default_dtype=default_dtype,
)
# #####################################################
# Calculate the loss function
# #####################################################
my_loss: torch.Tensor | None = loss_function(
h=h_collection[-1],
labels=h_x_labels,
device=device,
default_dtype=default_dtype,
loss_mode=cfg.learning_parameters.loss_mode,
number_of_output_neurons=int(
cfg.network_structure.number_of_output_neurons
),
loss_coeffs_mse=float(cfg.learning_parameters.loss_coeffs_mse),
loss_coeffs_kldiv=float(cfg.learning_parameters.loss_coeffs_kldiv),
)
assert my_loss is not None
time_after_forward_and_loss: float = time.perf_counter()
# #####################################################
# Backward pass
# #####################################################
my_loss.backward()
loss_in_minibatch += my_loss.item()
full_loss += my_loss.item()
time_after_backward: float = time.perf_counter()
# #####################################################
# Performance measures
# #####################################################
correct_in_minibatch += (
(h_collection[-1].argmax(dim=1).squeeze().cpu() == h_x_labels)
.sum()
.item()
)
full_correct += (
(h_collection[-1].argmax(dim=1).squeeze().cpu() == h_x_labels)
.sum()
.item()
)
# We measure the scale of the propagated error
# during the first minibatch
# then we remember this size and scale
# the future error with it
# Kind of deals with the vanishing /
# exploding gradients
# deal_with_gradient_scale(
# epoch_id=epoch_id,
# mini_batch_number=mini_batch_number,
# network=network,
# )
# Measure the time for one mini-batch
time_forward += time_after_forward_and_loss - time_mini_batch_start
time_backward += time_after_backward - time_after_forward_and_loss
if number_of_pattern_in_minibatch >= cfg.get_update_after_x_pattern():
logging.info(
(
f"{epoch_id:^6}=>{mini_batch_number:^6} "
f"\t\tTraining {number_of_pattern_in_minibatch^6} pattern "
f"with {correct_in_minibatch/number_of_pattern_in_minibatch:^6.2%} "
f"\tForward time: \t{time_forward:^6.2f}sec"
)
)
logging.info(
(
f"\t\t\tLoss: {loss_in_minibatch/number_of_pattern_in_minibatch:^15.3e} "
f"\t\t\tBackward time: \t{time_backward:^6.2f}sec "
)
)
my_loss_for_batch = loss_in_minibatch / number_of_pattern_in_minibatch
performance_for_batch = (
100.0 * correct_in_minibatch / number_of_pattern_in_minibatch
)
# ################################################
# Update the weights and biases
# ################################################
run_optimizer(network=network, optimizer=optimizer, cfg=cfg)
# ################################################
# Save the Weights and Biases
# ################################################
save_weight_and_bias(
cfg=cfg, network=network, iteration_number=epoch_id
)
# ################################################
# Log the Weights and Biases
# ################################################
add_weight_and_bias_to_histogram(
network=network,
tb=tb,
iteration_number=epoch_id,
)
# ################################################
# Mark mini batch as done
# ################################################
number_of_pattern_in_minibatch = 0
return (
my_loss_for_batch,
performance_for_batch,
(full_loss / full_count),
(100.0 * full_correct / full_count),
)
def loop_test(
epoch_id: int,
cfg: Config,
network: torch.nn.modules.container.Sequential,
my_loader_test: torch.utils.data.dataloader.DataLoader,
the_dataset_test,
device: torch.device,
default_dtype: torch.dtype,
logging,
tb: SummaryWriter,
) -> float:
test_correct = 0
test_count = 0
test_complete: int = the_dataset_test.__len__()
logging.info("")
logging.info("Testing:")
for h_x, h_x_labels in my_loader_test:
time_0 = time.perf_counter()
h_collection = forward_pass_test(
input=h_x,
the_dataset_test=the_dataset_test,
cfg=cfg,
network=network,
device=device,
default_dtype=default_dtype,
)
h_h: torch.Tensor = h_collection[-1].detach().clone().cpu()
test_correct += (h_h.argmax(dim=1).squeeze() == h_x_labels).sum().numpy()
test_count += h_h.shape[0]
performance = 100.0 * test_correct / test_count
time_1 = time.perf_counter()
time_measure_a = time_1 - time_0
logging.info(
(
f"\t\t{test_count} of {test_complete}"
f" with {performance/100:^6.2%} \t Time used: {time_measure_a:^6.2f}sec"
)
)
logging.info("")
tb.add_scalar("Test Error", 100.0 - performance, epoch_id)
tb.flush()
return performance

View file

@ -0,0 +1,71 @@
import torch
from network.Parameter import Config
import numpy as np
from network.SbS import SbS
from network.SplitOnOffLayer import SplitOnOffLayer
from network.Conv2dApproximation import Conv2dApproximation
def save_weight_and_bias(
cfg: Config, network: torch.nn.modules.container.Sequential, iteration_number: int
) -> None:
for id in range(0, len(network)):
# ################################################
# Save the SbS Weights
# ################################################
if isinstance(network[id], SbS) is True:
if network[id]._w_trainable is True:
np.save(
f"{cfg.weight_path}/Weight_L{id}_S{iteration_number}.npy",
network[id].weights.detach().cpu().numpy(),
)
# ################################################
# Save the Conv2 Weights and Biases
# ################################################
if isinstance(network[id], torch.nn.modules.conv.Conv2d) is True:
if network[id]._w_trainable is True:
# Save the new values
np.save(
f"{cfg.weight_path}/Weight_L{id}_S{iteration_number}.npy",
network[id]._parameters["weight"].data.detach().cpu().numpy(),
)
# Save the new values
np.save(
f"{cfg.weight_path}/Bias_L{id}_S{iteration_number}.npy",
network[id]._parameters["bias"].data.detach().cpu().numpy(),
)
# ################################################
# Save the Approximate Conv2 Weights and Biases
# ################################################
if isinstance(network[id], Conv2dApproximation) is True:
if network[id]._w_trainable is True:
# Save the new values
np.save(
f"{cfg.weight_path}/Weight_L{id}_S{iteration_number}.npy",
network[id].weights.data.detach().cpu().numpy(),
)
# Save the new values
if network[id].bias is not None:
np.save(
f"{cfg.weight_path}/Bias_L{id}_S{iteration_number}.npy",
network[id].bias.data.detach().cpu().numpy(),
)
if isinstance(network[id], SplitOnOffLayer) is True:
np.save(
f"{cfg.weight_path}/Mean_L{id}_S{iteration_number}.npy",
network[id].mean.detach().cpu().numpy(),
)

View file

@ -0,0 +1,4 @@
{
"data_path": "./DATA_FASHION_MNIST/",
"data_mode": "MNIST_FASHION"
}

View file

@ -0,0 +1,29 @@
{
"epoch_id_max": 200,
"batch_size": 24,
"stage_id": 0,
"simulation_id": 0,
"learning_parameters": {
"loss_mode": 1, // X-Entropy
"number_of_batches_for_one_update": 20,
"learning_rate_gamma_w": 0.001,
"lr_scheduler_patience_w": -1,
"adapt_learning_rate_after_minibatch": false,
"w_trainable": [
true
]
},
"augmentation": {},
"image_statistics": {},
"approximation_setting": {
"number_of_frac_bits": [
30
], // Quantization of the mantissa
"approximation_enable": [
true
], // In addition to the quantization more approx
"number_of_trunc_bits": [
1
]
}
}

View file

@ -0,0 +1,19 @@
{
"epoch_id_max": 200,
"batch_size": 24,
"stage_id": 0,
"simulation_id": 0,
"learning_parameters": {
"loss_mode": 1, // X-Entropy
"number_of_batches_for_one_update": 20,
"learning_rate_gamma_w": 0.001,
"lr_scheduler_patience_w": -1,
"adapt_learning_rate_after_minibatch": false,
"w_trainable": [
true
]
},
"augmentation": {},
"image_statistics": {},
"approximation_setting": {}
}

View file

@ -0,0 +1,97 @@
{
"network_structure": {
"number_of_output_neurons": 10,
"layer_type": [
"APPROX CONV2D",
"RELU",
"MAX POOLING",
"APPROX CONV2D",
"RELU",
"MAX POOLING",
"APPROX CONV2D",
"RELU",
"APPROX CONV2D"
],
"strides": [
[
1,
1
], // "CONV2D"
[], // "RELU",
[
2,
2
], // "MAX POOLING",
[
1,
1
], // "CONV2D"
[], // "RELU",
[
2,
2
], // "MAX POOLING",
[
1,
1
], // "CONV2D"
[], // "RELU",
[
1,
1
] // "CONV2D"
],
"forward_neuron_numbers": [
[
1,
32
], // "CONV2D",
[], // "RELU",
[], // "MAX POOLING",
[
32,
64
], // "CONV2D",
[], // "RELU",
[], // "MAX POOLING",
[
64,
96
], // "CONV2D",
[], // "RELU",
[
96,
10
] // "CONV2D",
],
"forward_kernel_size": [
[
5,
5
], // "CONV2D",
[], // "RELU",
[
2,
2
], // "MAX POOLING",
[
5,
5
], // "CONV2D",
[], // "RELU",
[
2,
2
], // "MAX POOLING",
[
3,
3
], // "CONV2D",
[], // "RELU",
[
1,
1
] // "CONV2D",
]
}
}

View file

@ -0,0 +1,97 @@
{
"network_structure": {
"number_of_output_neurons": 10,
"layer_type": [
"CONV2D",
"RELU",
"MAX POOLING",
"CONV2D",
"RELU",
"MAX POOLING",
"CONV2D",
"RELU",
"CONV2D"
],
"strides": [
[
1,
1
], // "CONV2D"
[], // "RELU",
[
2,
2
], // "MAX POOLING",
[
1,
1
], // "CONV2D"
[], // "RELU",
[
2,
2
], // "MAX POOLING",
[
1,
1
], // "CONV2D"
[], // "RELU",
[
1,
1
] // "CONV2D"
],
"forward_neuron_numbers": [
[
1,
32
], // "CONV2D",
[], // "RELU",
[], // "MAX POOLING",
[
32,
64
], // "CONV2D",
[], // "RELU",
[], // "MAX POOLING",
[
64,
96
], // "CONV2D",
[], // "RELU",
[
96,
10
] // "CONV2D",
],
"forward_kernel_size": [
[
5,
5
], // "CONV2D",
[], // "RELU",
[
2,
2
], // "MAX POOLING",
[
5,
5
], // "CONV2D",
[], // "RELU",
[
2,
2
], // "MAX POOLING",
[
3,
3
], // "CONV2D",
[], // "RELU",
[
1,
1
] // "CONV2D",
]
}
}

View file

@ -0,0 +1,186 @@
import torch
import math
from network.CPP.PyMultiApp import MultiApp
class LinearApproximation(torch.nn.Module):
in_features: int | None = None
out_features: int | None = None
use_bias: bool = False
approximation_enable: bool = False
number_of_trunc_bits: int = -1
number_of_frac: int = -1
number_of_processes: int = 1
weights: torch.nn.parameter.Parameter
bias: torch.nn.parameter.Parameter | None
device: torch.device
dtype: torch.dtype
def __init__(
self,
in_features: int,
out_features: int,
bias: bool = True,
approximation_enable: bool = False,
number_of_trunc_bits: int = -1,
number_of_frac: int = -1,
number_of_processes: int = 1,
device: torch.device | None = None,
dtype: torch.dtype | None = None,
) -> None:
super().__init__()
assert device is not None
self.device = device
assert dtype is not None
self.dtype = dtype
self.in_features = in_features
self.out_channels = out_features
self.use_bias = bias
self.approximation_enable = approximation_enable
self.number_of_trunc_bits = number_of_trunc_bits
self.number_of_frac = number_of_frac
self.number_of_processes = number_of_processes
if self.use_bias is True:
self.bias: torch.nn.parameter.Parameter | None = (
torch.nn.parameter.Parameter(
torch.empty(
(out_features),
dtype=self.dtype,
device=self.device,
)
)
)
else:
self.bias = None
self.weights: torch.nn.parameter.Parameter = torch.nn.parameter.Parameter(
torch.empty(
(out_features, in_features),
dtype=self.dtype,
device=self.device,
)
)
self.functional_multi = FunctionalMultiLinear.apply
self.reset_parameters()
def reset_parameters(self) -> None:
# Stolen from original torch conv2 code
torch.nn.init.kaiming_uniform_(self.weights, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weights)
if fan_in != 0:
bound = 1 / math.sqrt(fan_in)
torch.nn.init.uniform_(self.bias, -bound, bound)
def forward(self, input: torch.Tensor) -> torch.Tensor:
assert input.dim() == 2
parameter_list = torch.tensor(
[
int(self.approximation_enable), # 0
int(self.number_of_trunc_bits), # 1
int(self.number_of_frac), # 2
int(self.number_of_processes), # 3
],
dtype=torch.int64,
)
output = self.functional_multi(
input.unsqueeze(-1).unsqueeze(-1), self.weights, parameter_list
)
output = output.squeeze(-1).squeeze(-1)
if self.bias is not None:
output += self.bias.unsqueeze(0)
return output
class FunctionalMultiLinear(torch.autograd.Function):
@staticmethod
def forward( # type: ignore
ctx,
input: torch.Tensor,
weights: torch.Tensor,
parameter_list: torch.Tensor,
) -> torch.Tensor:
assert input.ndim == 4
assert input.dtype is torch.float32
assert input.is_contiguous() is True
assert weights.ndim == 2
assert weights.dtype is torch.float32
assert weights.is_contiguous() is True
assert input.shape[1] == weights.shape[1]
approximation_enable = bool(parameter_list[0])
number_of_trunc_bits = int(parameter_list[1])
number_of_frac = int(parameter_list[2])
number_of_processes = int(parameter_list[3])
assert input.device == weights.device
output = torch.zeros(
(input.shape[0], weights.shape[0], input.shape[2], input.shape[3]),
dtype=weights.dtype,
device=weights.device,
requires_grad=True,
)
assert output.is_contiguous() is True
multiplier: MultiApp = MultiApp()
multiplier.update_with_init_vector_multi_pattern(
input.data_ptr(),
weights.data_ptr(),
output.data_ptr(),
int(output.shape[0]), # pattern
int(output.shape[1]), # feature channel
int(output.shape[2]), # x
int(output.shape[3]), # y
int(input.shape[1]), # input channel
int(number_of_processes),
bool(approximation_enable),
int(number_of_trunc_bits),
int(number_of_frac),
)
ctx.save_for_backward(
input.detach(),
weights.detach(),
)
return output
@staticmethod
def backward(ctx, grad_output):
(input, weights) = ctx.saved_tensors
grad_input = (
grad_output.unsqueeze(2) * weights.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
).sum(1)
grad_weights = (
(grad_output.unsqueeze(2) * input.unsqueeze(1)).sum(0).sum(-1).sum(-1)
)
grad_parameter_list = None
return (grad_input, grad_weights, grad_parameter_list)

39
network_sbs_L0.json Normal file
View file

@ -0,0 +1,39 @@
{
"network_structure": {
"number_of_output_neurons": 10,
"layer_type": [
"SbS",
"SbS"
],
"strides": [
[
1,
1
],
[
1,
1
]
],
"forward_neuron_numbers": [
[
1,
32
],
[
32,
10
]
],
"forward_kernel_size": [
[
5,
5
],
[
20,
20
]
]
}
}

91
network_sbs_mp.json Normal file
View file

@ -0,0 +1,91 @@
{
"network_structure": {
"number_of_output_neurons": 10,
"layer_type": [
"SbS",
"MAX POOLING",
"SbS",
"MAX POOLING",
"SbS",
"SbS"
],
"strides": [
[
1,
1
], // "SbS"
[
2,
2
], // POOLING
[
1,
1
], // "SbS"
[
2,
2
], // POOLING
[
1,
1
], // "SbS"
[
1,
1
] // "SbS"
],
"forward_neuron_numbers": [
[
1,
32
], // "SbS"
[
32,
32
], // POOLING
[
32,
64
], // "SbS"
[
64,
64
], // POOLING
[
64,
96
], // "SbS"
[
96,
10
] // "SbS"
],
"forward_kernel_size": [
[
5,
5
], // "SbS"
[
2,
2
], // POOLING
[
5,
5
], // "SbS"
[
2,
2
], // POOLING
[
3,
3
], // "SbS"
[
1,
1
] // "SbS"
]
}
}

19
run_spikes.sh Normal file
View file

@ -0,0 +1,19 @@
python=~/P3.10GPU/bin/python3
if [ -f "./def.json" ]
then
echo "json file found"
else
echo "No json file exists"
exit
fi
$python learn_it.py def.json
status=$?
if [ $status != "0" ]
then
echo "Problem detected (learn_it.py)!!!"
exit 1
fi