import torch
from non_linear_weigth_function import non_linear_weigth_function


class NNMF2d(torch.nn.Module):

    in_channels: int
    out_channels: int
    weight: torch.Tensor
    bias: None | torch.Tensor
    iterations: int
    epsilon: float | None
    init_min: float
    init_max: float
    beta: torch.Tensor | None
    positive_function_type: int
    local_learning: bool
    local_learning_kl: bool
    use_reconstruction: bool
    skip_connection: bool

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        device=None,
        dtype=None,
        iterations: int = 20,
        epsilon: float | None = None,
        init_min: float = 0.0,
        init_max: float = 1.0,
        beta: float | None = None,
        positive_function_type: int = 0,
        local_learning: bool = False,
        local_learning_kl: bool = False,
        use_reconstruction: bool = False,
        skip_connection: bool = False,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}

        super().__init__()

        self.positive_function_type = positive_function_type
        self.init_min = init_min
        self.init_max = init_max

        self.in_channels = in_channels
        self.out_channels = out_channels

        self.iterations = iterations
        self.local_learning = local_learning
        self.local_learning_kl = local_learning_kl

        self.use_reconstruction = use_reconstruction

        self.weight = torch.nn.parameter.Parameter(
            torch.empty((out_channels, in_channels), **factory_kwargs)
        )

        if beta is not None:
            self.beta = torch.nn.parameter.Parameter(torch.empty((1), **factory_kwargs))
            self.beta.data[0] = beta
        else:
            self.beta = None

        self.reset_parameters()
        self.functional_nnmf2d = FunctionalNNMF2d.apply

        self.epsilon = epsilon

        self.skip_connection = skip_connection

    def extra_repr(self) -> str:
        s: str = f"{self.in_channels}, {self.out_channels}"

        if self.epsilon is not None:
            s += f", epsilon={self.epsilon}"
        s += f", pfunctype={self.positive_function_type}"
        s += f", local_learning={self.local_learning}"

        if self.local_learning:
            s += f", local_learning_kl={self.local_learning_kl}"

        return s

    def reset_parameters(self) -> None:
        torch.nn.init.uniform_(self.weight, a=self.init_min, b=self.init_max)

    def forward(self, input: torch.Tensor) -> torch.Tensor:

        positive_weights = non_linear_weigth_function(
            self.weight, self.beta, self.positive_function_type
        )
        positive_weights = positive_weights / (
            positive_weights.sum(dim=1, keepdim=True) + 10e-20
        )

        h_dyn = self.functional_nnmf2d(
            input,
            positive_weights,
            self.out_channels,
            self.iterations,
            self.epsilon,
            self.local_learning,
            self.local_learning_kl,
        )
        # if self.skip_connection:
        #     if self.use_reconstruction:
        #         reconstruction = torch.nn.functional.linear(
        #             h_dyn.movedim(1, -1), positive_weights.T
        #         ).movedim(-1, 1)
        #         output = torch.cat((h_dyn, input - reconstruction), dim=1)
        #     else:
        #         output = torch.cat((h_dyn, input), dim=1)
        #     return output
        # else:
        #     return h_dyn
        return h_dyn


class FunctionalNNMF2d(torch.autograd.Function):
    @staticmethod
    def forward(  # type: ignore
        ctx,
        input: torch.Tensor,
        weight: torch.Tensor,
        out_channels: int,
        iterations: int,
        epsilon: float | None,
        local_learning: bool,
        local_learning_kl: bool,
    ) -> torch.Tensor:

        # Prepare h
        h = torch.full(
            (input.shape[0], out_channels, input.shape[-2], input.shape[-1]),
            1.0 / float(out_channels),
            device=input.device,
            dtype=input.dtype,
        )

        h = h.movedim(1, -1)
        input = input.movedim(1, -1)
        for _ in range(0, iterations):
            reconstruction = torch.nn.functional.linear(h, weight.T)
            reconstruction += 1e-20
            if epsilon is None:
                h *= torch.nn.functional.linear((input / reconstruction), weight)
            else:
                h *= 1 + epsilon * torch.nn.functional.linear(
                    (input / reconstruction), weight
                )
            h /= h.sum(-1, keepdim=True) + 10e-20
        h = h.movedim(-1, 1)
        input = input.movedim(-1, 1)

        # ###########################################################
        # Save the necessary data for the backward pass
        # ###########################################################
        ctx.save_for_backward(input, weight, h)
        ctx.local_learning = local_learning
        ctx.local_learning_kl = local_learning_kl

        assert torch.isfinite(h).all()
        return h

    @staticmethod
    @torch.autograd.function.once_differentiable
    def backward(ctx, grad_output: torch.Tensor) -> tuple[  # type: ignore
        torch.Tensor | None,
        torch.Tensor | None,
        None,
        None,
        None,
        None,
        None,
    ]:

        # ##############################################
        # Default values
        # ##############################################
        grad_input: torch.Tensor | None = None
        grad_weight: torch.Tensor | None = None

        # ##############################################
        # Get the variables back
        # ##############################################
        (input, weight, h) = ctx.saved_tensors

        # The back prop gradient
        h = h.movedim(1, -1)
        grad_output = grad_output.movedim(1, -1)
        input = input.movedim(1, -1)
        big_r = torch.nn.functional.linear(h, weight.T)
        big_r_div = 1.0 / (big_r + 1e-20)

        factor_x_div_r = input * big_r_div

        grad_input = torch.nn.functional.linear(h * grad_output, weight.T) * big_r_div

        del big_r_div

        # The weight gradient
        if ctx.local_learning is False:
            del big_r

            grad_weight = -torch.nn.functional.linear(
                h.reshape(
                    grad_input.shape[0] * grad_input.shape[1] * grad_input.shape[2],
                    h.shape[3],
                ).T,
                (factor_x_div_r * grad_input)
                .reshape(
                    grad_input.shape[0] * grad_input.shape[1] * grad_input.shape[2],
                    grad_input.shape[3],
                )
                .T,
            )

            grad_weight += torch.nn.functional.linear(
                (h * grad_output)
                .reshape(
                    grad_input.shape[0] * grad_input.shape[1] * grad_input.shape[2],
                    h.shape[3],
                )
                .T,
                factor_x_div_r.reshape(
                    grad_input.shape[0] * grad_input.shape[1] * grad_input.shape[2],
                    grad_input.shape[3],
                ).T,
            )

        else:
            if ctx.local_learning_kl:
                grad_weight = -torch.nn.functional.linear(
                    h.reshape(
                        grad_input.shape[0] * grad_input.shape[1] * grad_input.shape[2],
                        h.shape[3],
                    ).T,
                    factor_x_div_r.reshape(
                        grad_input.shape[0] * grad_input.shape[1] * grad_input.shape[2],
                        grad_input.shape[3],
                    ).T,
                )
            else:
                grad_weight = -torch.nn.functional.linear(
                    h.reshape(
                        grad_input.shape[0] * grad_input.shape[1] * grad_input.shape[2],
                        h.shape[3],
                    ).T,
                    (2 * (input - big_r))
                    .reshape(
                        grad_input.shape[0] * grad_input.shape[1] * grad_input.shape[2],
                        grad_input.shape[3],
                    )
                    .T,
                )
        grad_input = grad_input.movedim(-1, 1)
        assert torch.isfinite(grad_input).all()
        assert torch.isfinite(grad_weight).all()

        return (
            grad_input,
            grad_weight,
            None,
            None,
            None,
            None,
            None,
        )