389 lines
12 KiB
Python
389 lines
12 KiB
Python
import torch
|
|
import math
|
|
|
|
from network.PyMultiplicationApproximationCPU import MultiplicationApproximationCPU
|
|
from network.PyMultiplicationApproximationGPU import MultiplicationApproximationGPU
|
|
|
|
global_multiapp_gpu_setting: list[torch.Tensor] = []
|
|
global_multiapp_size: list[torch.Tensor] = []
|
|
global_multiapp_cpp: list[
|
|
MultiplicationApproximationCPU | MultiplicationApproximationGPU
|
|
] = []
|
|
|
|
|
|
class Conv2dApproximation(torch.nn.Module):
|
|
|
|
in_channels: int | None = None
|
|
out_channels: int | None = None
|
|
kernel_size: list[int] | None = None
|
|
stride: list[int] = [1, 1]
|
|
padding: list[int] = [0, 0]
|
|
dilation: list[int] = [1, 1]
|
|
use_bias: bool = False
|
|
|
|
approximation_enable: bool = False
|
|
number_of_trunc_bits: int = -1
|
|
number_of_frac: int = -1
|
|
|
|
number_of_processes: int = 1
|
|
|
|
weights: torch.nn.parameter.Parameter
|
|
bias: torch.nn.parameter.Parameter | None
|
|
|
|
device: torch.device
|
|
dtype: torch.dtype
|
|
|
|
multiapp_gpu_setting_position: int = -1
|
|
multiapp_cpp_position: int = -1
|
|
|
|
def __init__(
|
|
self,
|
|
in_channels: int,
|
|
out_channels: int,
|
|
kernel_size: list[int],
|
|
stride: list[int] = [1, 1],
|
|
padding: list[int] = [0, 0],
|
|
dilation: list[int] = [1, 1],
|
|
bias: bool = True,
|
|
approximation_enable: bool = False,
|
|
number_of_trunc_bits: int = -1,
|
|
number_of_frac: int = -1,
|
|
number_of_processes: int = 1,
|
|
device: torch.device | None = None,
|
|
dtype: torch.dtype | None = None,
|
|
) -> None:
|
|
super().__init__()
|
|
|
|
assert device is not None
|
|
self.device = device
|
|
|
|
assert dtype is not None
|
|
self.dtype = dtype
|
|
|
|
assert len(kernel_size) == 2
|
|
assert len(stride) == 2
|
|
assert len(padding) == 2
|
|
assert len(dilation) == 2
|
|
|
|
self.in_channels = in_channels
|
|
self.out_channels = out_channels
|
|
self.kernel_size = kernel_size
|
|
self.stride = stride
|
|
self.padding = padding
|
|
self.dilation = dilation
|
|
self.use_bias = bias
|
|
self.number_of_processes = number_of_processes
|
|
|
|
self.approximation_enable = approximation_enable
|
|
self.number_of_trunc_bits = number_of_trunc_bits
|
|
self.number_of_frac = number_of_frac
|
|
|
|
global_multiapp_gpu_setting.append(torch.tensor([0]))
|
|
global_multiapp_size.append(torch.tensor([0, 0, 0, 0]))
|
|
if device == torch.device("cpu"):
|
|
global_multiapp_cpp.append(MultiplicationApproximationCPU())
|
|
else:
|
|
global_multiapp_cpp.append(MultiplicationApproximationGPU())
|
|
|
|
self.multiapp_gpu_setting_position = len(global_multiapp_gpu_setting) - 1
|
|
self.multiapp_cpp_position = len(global_multiapp_cpp) - 1
|
|
|
|
if self.use_bias is True:
|
|
self.bias: torch.nn.parameter.Parameter | None = (
|
|
torch.nn.parameter.Parameter(
|
|
torch.empty(
|
|
(out_channels),
|
|
dtype=self.dtype,
|
|
device=self.device,
|
|
)
|
|
)
|
|
)
|
|
else:
|
|
self.bias = None
|
|
|
|
self.weights: torch.nn.parameter.Parameter = torch.nn.parameter.Parameter(
|
|
torch.empty(
|
|
(out_channels, in_channels, *kernel_size),
|
|
dtype=self.dtype,
|
|
device=self.device,
|
|
)
|
|
)
|
|
|
|
self.functional_multi = FunctionalMultiConv2d.apply
|
|
|
|
self.reset_parameters()
|
|
|
|
def reset_parameters(self) -> None:
|
|
# Stolen from original torch conv2 code
|
|
torch.nn.init.kaiming_uniform_(self.weights, a=math.sqrt(5))
|
|
if self.bias is not None:
|
|
fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weights)
|
|
if fan_in != 0:
|
|
bound = 1 / math.sqrt(fan_in)
|
|
torch.nn.init.uniform_(self.bias, -bound, bound)
|
|
|
|
def calculate_output_size(self, value: torch.Tensor) -> None:
|
|
|
|
coordinates_0, coordinates_1 = self._get_coordinates(value)
|
|
|
|
self.output_size: torch.Tensor = torch.tensor(
|
|
[
|
|
coordinates_0.shape[1],
|
|
coordinates_1.shape[1],
|
|
],
|
|
dtype=torch.int64,
|
|
)
|
|
self.output_size.requires_grad_(False)
|
|
|
|
def _get_coordinates(
|
|
self, value: torch.Tensor
|
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
|
"""Function converts parameter in coordinates
|
|
for the convolution window"""
|
|
|
|
assert value is not None
|
|
assert torch.is_tensor(value) is True
|
|
assert value.dim() == 1
|
|
assert torch.numel(value) == 2
|
|
assert value.dtype == torch.int64
|
|
assert value[0] > 0
|
|
assert value[1] > 0
|
|
|
|
assert self.kernel_size is not None
|
|
assert len(self.kernel_size) == 2
|
|
assert len(self.stride) == 2
|
|
assert len(self.dilation) == 2
|
|
assert len(self.padding) == 2
|
|
|
|
unfold_0: torch.nn.Unfold = torch.nn.Unfold(
|
|
kernel_size=(int(self.kernel_size[0]), 1),
|
|
dilation=int(self.dilation[0]),
|
|
padding=int(self.padding[0]),
|
|
stride=int(self.stride[0]),
|
|
)
|
|
|
|
unfold_1: torch.nn.Unfold = torch.nn.Unfold(
|
|
kernel_size=(1, int(self.kernel_size[1])),
|
|
dilation=int(self.dilation[1]),
|
|
padding=int(self.padding[1]),
|
|
stride=int(self.stride[1]),
|
|
)
|
|
|
|
coordinates_0: torch.Tensor = (
|
|
unfold_0(
|
|
torch.unsqueeze(
|
|
torch.unsqueeze(
|
|
torch.unsqueeze(
|
|
torch.arange(0, int(value[0]), dtype=torch.float32),
|
|
1,
|
|
),
|
|
0,
|
|
),
|
|
0,
|
|
)
|
|
)
|
|
.squeeze(0)
|
|
.type(torch.int64)
|
|
)
|
|
|
|
coordinates_1: torch.Tensor = (
|
|
unfold_1(
|
|
torch.unsqueeze(
|
|
torch.unsqueeze(
|
|
torch.unsqueeze(
|
|
torch.arange(0, int(value[1]), dtype=torch.float32),
|
|
0,
|
|
),
|
|
0,
|
|
),
|
|
0,
|
|
)
|
|
)
|
|
.squeeze(0)
|
|
.type(torch.int64)
|
|
)
|
|
|
|
return coordinates_0, coordinates_1
|
|
|
|
def forward(self, input: torch.Tensor) -> torch.Tensor:
|
|
|
|
assert input.dim() == 4
|
|
|
|
assert self.kernel_size is not None
|
|
assert self.multiapp_gpu_setting_position != -1
|
|
assert self.multiapp_cpp_position != -1
|
|
|
|
input_size = torch.Tensor([int(input.shape[-2]), int(input.shape[-1])]).type(
|
|
dtype=torch.int64
|
|
)
|
|
|
|
self.calculate_output_size(input_size)
|
|
|
|
input_fold = torch.nn.functional.fold(
|
|
torch.nn.functional.unfold(
|
|
input.requires_grad_(True),
|
|
tuple(self.kernel_size),
|
|
tuple(self.dilation),
|
|
tuple(self.padding),
|
|
tuple(self.stride),
|
|
),
|
|
output_size=(int(self.output_size[0]), int(self.output_size[1])),
|
|
kernel_size=(1, 1),
|
|
dilation=(1, 1),
|
|
padding=(0, 0),
|
|
stride=(1, 1),
|
|
)
|
|
|
|
weights_fold = torch.nn.functional.unfold(
|
|
self.weights.requires_grad_(True),
|
|
tuple(self.kernel_size),
|
|
tuple(self.dilation),
|
|
tuple(self.padding),
|
|
tuple(self.stride),
|
|
).squeeze(-1)
|
|
|
|
if input.device == torch.device("cpu"):
|
|
number_of_cpu_processes: int = int(self.number_of_processes)
|
|
else:
|
|
number_of_cpu_processes = -1
|
|
|
|
# Here...
|
|
parameter_list = torch.tensor(
|
|
[
|
|
int(self.approximation_enable), # 0
|
|
int(self.number_of_trunc_bits), # 1
|
|
int(self.number_of_frac), # 2
|
|
int(number_of_cpu_processes), # 3
|
|
int(self.multiapp_gpu_setting_position), # 4
|
|
int(self.multiapp_cpp_position), # 5
|
|
],
|
|
dtype=torch.int64,
|
|
)
|
|
|
|
output = self.functional_multi(input_fold, weights_fold, parameter_list)
|
|
|
|
if self.bias is not None:
|
|
output += self.bias.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
|
|
|
|
return output
|
|
|
|
|
|
class FunctionalMultiConv2d(torch.autograd.Function):
|
|
@staticmethod
|
|
def forward( # type: ignore
|
|
ctx,
|
|
input: torch.Tensor,
|
|
weights: torch.Tensor,
|
|
parameter_list: torch.Tensor,
|
|
) -> torch.Tensor:
|
|
|
|
assert input.ndim == 4
|
|
assert input.dtype is torch.float32
|
|
assert input.is_contiguous() is True
|
|
|
|
assert weights.ndim == 2
|
|
assert weights.dtype is torch.float32
|
|
assert weights.is_contiguous() is True
|
|
|
|
assert input.shape[1] == weights.shape[1]
|
|
|
|
approximation_enable = bool(parameter_list[0])
|
|
number_of_trunc_bits = int(parameter_list[1])
|
|
number_of_frac = int(parameter_list[2])
|
|
number_of_processes = int(parameter_list[3])
|
|
multiapp_gpu_setting_position = int(parameter_list[4])
|
|
multiapp_cpp_position = int(parameter_list[5])
|
|
|
|
assert input.device == weights.device
|
|
|
|
output = torch.empty(
|
|
(input.shape[0], weights.shape[0], input.shape[2], input.shape[3]),
|
|
dtype=weights.dtype,
|
|
device=weights.device,
|
|
requires_grad=True,
|
|
)
|
|
assert output.is_contiguous() is True
|
|
|
|
multiapp_profile = global_multiapp_gpu_setting[
|
|
multiapp_gpu_setting_position
|
|
].clone()
|
|
|
|
multiapp_size = global_multiapp_size[multiapp_gpu_setting_position].clone()
|
|
|
|
if input.device != torch.device("cpu"):
|
|
if (
|
|
(multiapp_profile.numel() == 1)
|
|
or (multiapp_size[0] != int(output.shape[0]))
|
|
or (multiapp_size[1] != int(output.shape[1]))
|
|
or (multiapp_size[2] != int(output.shape[2]))
|
|
or (multiapp_size[3] != int(output.shape[3]))
|
|
):
|
|
multiapp_profile = torch.zeros(
|
|
(1, 7), dtype=torch.int64, device=torch.device("cpu")
|
|
)
|
|
|
|
global_multiapp_cpp[multiapp_cpp_position].gpu_occupancy_export(
|
|
int(output.shape[2]),
|
|
int(output.shape[3]),
|
|
int(output.shape[0]),
|
|
int(output.shape[1]),
|
|
multiapp_profile.data_ptr(),
|
|
int(multiapp_profile.shape[0]),
|
|
int(multiapp_profile.shape[1]),
|
|
)
|
|
global_multiapp_gpu_setting[
|
|
multiapp_gpu_setting_position
|
|
] = multiapp_profile.clone()
|
|
|
|
multiapp_size[0] = int(output.shape[0])
|
|
multiapp_size[1] = int(output.shape[1])
|
|
multiapp_size[2] = int(output.shape[2])
|
|
multiapp_size[3] = int(output.shape[3])
|
|
|
|
global_multiapp_size[
|
|
multiapp_gpu_setting_position
|
|
] = multiapp_size.clone()
|
|
|
|
else:
|
|
global_multiapp_cpp[multiapp_cpp_position].gpu_occupancy_import(
|
|
multiapp_profile.data_ptr(),
|
|
int(multiapp_profile.shape[0]),
|
|
int(multiapp_profile.shape[1]),
|
|
)
|
|
|
|
global_multiapp_cpp[multiapp_cpp_position].update_entrypoint(
|
|
input.data_ptr(),
|
|
weights.data_ptr(),
|
|
output.data_ptr(),
|
|
int(output.shape[0]), # pattern
|
|
int(output.shape[1]), # feature channel
|
|
int(output.shape[2]), # x
|
|
int(output.shape[3]), # y
|
|
int(input.shape[1]), # input channel
|
|
int(number_of_processes),
|
|
bool(approximation_enable),
|
|
int(number_of_trunc_bits),
|
|
int(number_of_frac),
|
|
)
|
|
|
|
ctx.save_for_backward(
|
|
input.detach(),
|
|
weights.detach(),
|
|
)
|
|
|
|
return output
|
|
|
|
@staticmethod
|
|
def backward(ctx, grad_output):
|
|
|
|
(input, weights) = ctx.saved_tensors
|
|
|
|
grad_input = (
|
|
grad_output.unsqueeze(2) * weights.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
|
|
).sum(1)
|
|
grad_weights = (
|
|
(grad_output.unsqueeze(2) * input.unsqueeze(1)).sum(0).sum(-1).sum(-1)
|
|
)
|
|
grad_parameter_list = None
|
|
|
|
return (grad_input, grad_weights, grad_parameter_list)
|