From 35dd8fafaf12a95c92c4234c24dce3a788112574 Mon Sep 17 00:00:00 2001
From: David Rotermund <54365609+davrot@users.noreply.github.com>
Date: Tue, 2 Jan 2024 21:53:57 +0100
Subject: [PATCH] Update README.md

Signed-off-by: David Rotermund <54365609+davrot@users.noreply.github.com>
---
 pytorch/train/README.md | 287 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 280 insertions(+), 7 deletions(-)

diff --git a/pytorch/train/README.md b/pytorch/train/README.md
index 82e8131..7a3a9e2 100644
--- a/pytorch/train/README.md
+++ b/pytorch/train/README.md
@@ -284,13 +284,13 @@ Operations you will see that are not explained yet:
 
 |||
 |---|---|
-|[network.train()](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.train)| : "Sets the module in training mode."|
-|[optimizer.zero_grad()](https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html)| : "Sets the gradients of all optimized [torch.Tensor](https://pytorch.org/docs/stable/tensors.html#torch.Tensor)s to zero." For every mini batch we (need to) clean the gradient which is used for training the parameters. |
-|[optimizer.step()](https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.step.html#torch.optim.Optimizer.step)| : "Performs a single optimization step (parameter update)."|
-|[loss.backward()](https://pytorch.org/docs/stable/generated/torch.Tensor.backward.html)| : "Computes the gradient of current tensor w.r.t. graph leaves."|
-|[lr_scheduler.step(train_loss)](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ReduceLROnPlateau.html#torch.optim.lr_scheduler.ReduceLROnPlateau)| : After an epoch the learning rate (might be) changed. For other [Learning rate scheduler](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) .step() might have no parameter.|
-|[network.eval()](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.eval)| : "Sets the module in evaluation mode."|
-|[with torch.no_grad():](https://pytorch.org/docs/stable/generated/torch.no_grad.html)| : "Context-manager that disabled gradient calculation."|
+|[network.train()](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.train)| "Sets the module in training mode."|
+|[optimizer.zero_grad()](https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html)| "Sets the gradients of all optimized [torch.Tensor](https://pytorch.org/docs/stable/tensors.html#torch.Tensor)s to zero." For every mini-batch we need to clear the gradients that were accumulated for the previous parameter update.|
+|[optimizer.step()](https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.step.html#torch.optim.Optimizer.step)| "Performs a single optimization step (parameter update)."|
+|[loss.backward()](https://pytorch.org/docs/stable/generated/torch.Tensor.backward.html)| "Computes the gradient of current tensor w.r.t. graph leaves."|
+|[lr_scheduler.step(train_loss)](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ReduceLROnPlateau.html#torch.optim.lr_scheduler.ReduceLROnPlateau)| After an epoch, the learning rate may be changed. For other [learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate), .step() may take no argument.|
+|[network.eval()](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.eval)| "Sets the module in evaluation mode."|
+|[with torch.no_grad():](https://pytorch.org/docs/stable/generated/torch.no_grad.html)| "Context-manager that disables gradient calculation."|
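+
+The table above lists these calls individually. The following toy example (dummy random data and a small linear stand-in network, neither of which is part of this page's MNIST code) only sketches the order in which the calls are typically used; the complete program follows below:
+
+```python
+import torch
+
+# Dummy stand-ins, just to show the order of the calls
+network = torch.nn.Linear(8, 2)
+optimizer = torch.optim.Adam(network.parameters(), lr=0.001)
+lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
+loss_function = torch.nn.CrossEntropyLoss()
+data = [(torch.randn(10, 8), torch.randint(0, 2, (10,))) for _ in range(5)]
+
+for epoch_id in range(2):
+    network.train()  # switch into training mode
+    train_loss = 0.0
+    for image, target in data:
+        optimizer.zero_grad()  # clear the gradients of the previous mini-batch
+        output = network(image)
+        loss = loss_function(output, target)
+        loss.backward()  # compute the gradients
+        optimizer.step()  # update the parameters
+        train_loss += loss.item()
+    lr_scheduler.step(train_loss)  # once per epoch; ReduceLROnPlateau needs a metric
+
+    network.eval()  # switch into evaluation mode
+    with torch.no_grad():  # no gradient bookkeeping during testing
+        for image, target in data:
+            output = network(image)
+```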
 
 
 ```python
@@ -498,6 +498,279 @@ for epoch_id in range(0, number_of_epoch):
 tb.close()
 ```
 
+Output:
+
+```python
+Epoch: 0
+Training: Loss=1029.10439 Correct=75.78%
+Testing: Correct=88.61%
+Time: Training=8.6sec, Testing=0.6sec
+
+Epoch: 1
+Training: Loss=959.81828 Correct=86.48%
+Testing: Correct=89.26%
+Time: Training=8.1sec, Testing=0.5sec
+
+[...]
+
+Epoch: 48
+Training: Loss=881.60049 Correct=99.20%
+Testing: Correct=99.04%
+Time: Training=9.2sec, Testing=0.5sec
+
+Epoch: 49
+Training: Loss=881.40331 Correct=99.23%
+Testing: Correct=99.26%
+Time: Training=9.4sec, Testing=0.4sec
+```
+
+## MNIST with Adam, ReduceLROnPlateau, cross-entropy on GPU
+
+Here is a list of the changes:
+
+Added to the beginning:
+
+```python
+assert torch.cuda.is_available() is True
+device_gpu = torch.device("cuda:0")
+```
+
+The network is moved to the GPU right after its creation:
+
+```python
+network = torch.nn.Sequential([...]).to(device=device_gpu)
+```
+
+During training:
+
+```python
+    output = network(train_processing_chain(image).to(device=device_gpu))
+
+    loss = loss_function(output, target.to(device_gpu))
+
+    train_loss += loss.item()
+    train_correct += (output.argmax(dim=1).cpu() == target).sum().numpy()
+```
+
+During testing:
+
+```python
+    output = network(test_processing_chain(image).to(device=device_gpu))
+
+    test_correct += (output.argmax(dim=1).cpu() == target).sum().numpy()
+```
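+
+If the hard `assert` above is not wanted, a common alternative (not used in the code on this page) is to fall back to the CPU when no GPU is available:
+
+```python
+import torch
+
+# Uses the first GPU if one is present, otherwise the CPU
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+network = torch.nn.Linear(2, 2).to(device=device)  # dummy module, only for illustration
+```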
+
+Full source code:
+
+```python
+import os
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+import torch
+import torchvision  # type:ignore
+import numpy as np
+import time
+from torch.utils.tensorboard import SummaryWriter
+
+assert torch.cuda.is_available() is True
+device_gpu = torch.device("cuda:0")
+
+
+class MyDataset(torch.utils.data.Dataset):
+    # Initialize
+    def __init__(self, train: bool = False) -> None:
+        super(MyDataset, self).__init__()
+
+        if train is True:
+            self.pattern_storage: np.ndarray = np.load("train_pattern_storage.npy")
+            self.label_storage: np.ndarray = np.load("train_label_storage.npy")
+        else:
+            self.pattern_storage = np.load("test_pattern_storage.npy")
+            self.label_storage = np.load("test_label_storage.npy")
+
+        self.pattern_storage = self.pattern_storage.astype(np.float32)
+        self.pattern_storage /= np.max(self.pattern_storage)
+
+        # How many pattern are there?
+        self.number_of_pattern: int = self.label_storage.shape[0]
+
+    def __len__(self) -> int:
+        return self.number_of_pattern
+
+    # Get one pattern at position index
+    def __getitem__(self, index: int) -> tuple[torch.Tensor, int]:
+        image = torch.tensor(self.pattern_storage[index, np.newaxis, :, :])
+        target = int(self.label_storage[index])
+
+        return image, target
+
+
+# Some parameters
+input_number_of_channel: int = 1
+input_dim_x: int = 24
+input_dim_y: int = 24
+
+number_of_output_channels_conv1: int = 32
+number_of_output_channels_conv2: int = 64
+number_of_output_channels_flatten1: int = 576
+number_of_output_channels_full1: int = 10
+
+kernel_size_conv1: tuple[int, int] = (5, 5)
+kernel_size_pool1: tuple[int, int] = (2, 2)
+kernel_size_conv2: tuple[int, int] = (5, 5)
+kernel_size_pool2: tuple[int, int] = (2, 2)
+
+stride_conv1: tuple[int, int] = (1, 1)
+stride_pool1: tuple[int, int] = (2, 2)
+stride_conv2: tuple[int, int] = (1, 1)
+stride_pool2: tuple[int, int] = (2, 2)
+
+padding_conv1: int = 0
+padding_pool1: int = 0
+padding_conv2: int = 0
+padding_pool2: int = 0
+
+network = torch.nn.Sequential(
+    torch.nn.Conv2d(
+        in_channels=input_number_of_channel,
+        out_channels=number_of_output_channels_conv1,
+        kernel_size=kernel_size_conv1,
+        stride=stride_conv1,
+        padding=padding_conv1,
+    ),
+    torch.nn.ReLU(),
+    torch.nn.MaxPool2d(
+        kernel_size=kernel_size_pool1, stride=stride_pool1, padding=padding_pool1
+    ),
+    torch.nn.Conv2d(
+        in_channels=number_of_output_channels_conv1,
+        out_channels=number_of_output_channels_conv2,
+        kernel_size=kernel_size_conv2,
+        stride=stride_conv2,
+        padding=padding_conv2,
+    ),
+    torch.nn.ReLU(),
+    torch.nn.MaxPool2d(
+        kernel_size=kernel_size_pool2, stride=stride_pool2, padding=padding_pool2
+    ),
+    torch.nn.Flatten(
+        start_dim=1,
+    ),
+    torch.nn.Linear(
+        in_features=number_of_output_channels_flatten1,
+        out_features=number_of_output_channels_full1,
+        bias=True,
+    ),
+    torch.nn.Softmax(dim=1),
+).to(device=device_gpu)
+
+test_processing_chain = torchvision.transforms.Compose(
+    transforms=[torchvision.transforms.CenterCrop((24, 24))],
+)
+
+train_processing_chain = torchvision.transforms.Compose(
+    transforms=[torchvision.transforms.RandomCrop((24, 24))],
+)
+
+dataset_train = MyDataset(train=True)
+dataset_test = MyDataset(train=False)
+batch_size_train = 100
+batch_size_test = 100
+
+train_data_load = torch.utils.data.DataLoader(
+    dataset_train, batch_size=batch_size_train, shuffle=True
+)
+
+test_data_load = torch.utils.data.DataLoader(
+    dataset_test, batch_size=batch_size_test, shuffle=False
+)
+
+# -------------------------------------------
+
+# The optimizer
+optimizer = torch.optim.Adam(network.parameters(), lr=0.001)
+
+# The LR Scheduler
+lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
+
+number_of_test_pattern: int = dataset_test.__len__()
+number_of_train_pattern: int = dataset_train.__len__()
+
+number_of_epoch: int = 50
+
+tb = SummaryWriter(log_dir="run")
+
+loss_function = torch.nn.CrossEntropyLoss()
+
+for epoch_id in range(0, number_of_epoch):
+    print(f"Epoch: {epoch_id}")
+    t_start: float = time.perf_counter()
+
+    train_loss: float = 0.0
+    train_correct: int = 0
+    train_number: int = 0
+    test_correct: int = 0
+    test_number: int = 0
+
+    # Switch the network into training mode
+    network.train()
+
+    # This runs in total for one epoch split up into mini-batches
+    for image, target in train_data_load:
+        # Clean the gradient
+        optimizer.zero_grad()
+
+        output = network(train_processing_chain(image).to(device=device_gpu))
+
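+        # The labels in target still live on the CPU; they are moved onto the GPU
+        # before the loss is computed (the correctness count below is done on the CPU again)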
+        loss = loss_function(output, target.to(device=device_gpu))
+
+        train_loss += loss.item()
+        train_correct += (output.argmax(dim=1).cpu() == target).sum().numpy()
+        train_number += target.shape[0]
+        # Calculate backprop
+        loss.backward()
+
+        # Update the parameter
+        optimizer.step()
+
+    # Update the learning rate
+    lr_scheduler.step(train_loss)
+
+    t_training: float = time.perf_counter()
+
+    # Switch the network into evaluation mode
+    network.eval()
+    with torch.no_grad():
+        for image, target in test_data_load:
+            output = network(test_processing_chain(image).to(device=device_gpu))
+
+            test_correct += (output.argmax(dim=1).cpu() == target).sum().numpy()
+            test_number += target.shape[0]
+
+    t_testing = time.perf_counter()
+
+    performance_test_correct: float = 100.0 * test_correct / test_number
+    performance_train_correct: float = 100.0 * train_correct / train_number
+
+    tb.add_scalar("Train Loss", train_loss, epoch_id)
+    tb.add_scalar("Train Number Correct", train_correct, epoch_id)
+    tb.add_scalar("Test Number Correct", test_correct, epoch_id)
+
+    print(f"Training: Loss={train_loss:.5f} Correct={performance_train_correct:.2f}%")
+    print(f"Testing: Correct={performance_test_correct:.2f}%")
+    print(
+        f"Time: Training={(t_training-t_start):.1f}sec, Testing={(t_testing-t_training):.1f}sec"
+    )
+    torch.save(network, "Model_MNIST_A_" + str(epoch_id) + ".pt")
+    print()
+
+    tb.flush()
+
+tb.close()
+```
+
+
 ## Mean square error
 
 You might be inclined to use the MSE instead of the cross entropy.