pytorch-sbs/DATA_CIFAR10/convert.py

# MIT License
# Copyright 2022 University of Bremen
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#
# David Rotermund ( davrot@uni-bremen.de )
#
#
# Release history:
# ================
# 1.0.0 -- 01.05.2022: first release
#
#

import numpy as np
import pickle


def give_filenames(id: int) -> tuple[str, str, int]:
    """Look up the source file that belongs to a CIFAR-10 batch id.

    Args:
        id: Batch selector. 0 selects the test batch, 1 to 5 select the
            training batches ``data_batch_1`` to ``data_batch_5``.
            (The name shadows the builtin ``id``; it is kept so that
            existing keyword callers continue to work.)

    Returns:
        A tuple ``(filename, prefix, start_id)``: the relative path of the
        batch file, the string "Test" or "Train", and the offset that this
        batch's first picture gets in the combined output array.

    Raises:
        ValueError: If ``id`` is not one of 0 to 5. Previously this case fell
            through every branch and failed with an ``UnboundLocalError``.
    """
    # One row per known batch: id -> (filename, prefix, start_id).
    batches: dict[int, tuple[str, str, int]] = {
        0: ("cifar-10-batches-py/test_batch", "Test", 0),
        1: ("cifar-10-batches-py/data_batch_1", "Train", 0),
        2: ("cifar-10-batches-py/data_batch_2", "Train", 10000),
        3: ("cifar-10-batches-py/data_batch_3", "Train", 20000),
        4: ("cifar-10-batches-py/data_batch_4", "Train", 30000),
        5: ("cifar-10-batches-py/data_batch_5", "Train", 40000),
    }
    try:
        return batches[id]
    except KeyError:
        raise ValueError(
            f"Unknown batch id {id!r}; expected an integer from 0 to 5."
        ) from None


def load_data(filename: str) -> tuple[np.ndarray, np.ndarray]:
    """Read the pictures and labels from one pickled batch file.

    The file is unpickled with ``encoding="bytes"``, so the dictionary keys
    are looked up as bytes (``b"data"`` and ``b"labels"``).

    Args:
        filename: Path of the pickled batch file.

    Returns:
        A tuple ``(data, labels)`` with both entries converted to numpy
        arrays via ``np.array``.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the unpickled dictionary lacks ``b"data"`` or
            ``b"labels"``.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only use this
    # on batch files obtained from a trusted source.
    with open(filename, "rb") as fo:  # closed again even if unpickling fails
        dict_data = pickle.load(fo, encoding="bytes")

    # Access by key instead of by position, so the result does not depend on
    # how many entries the dictionary has or in which order they were stored.
    data: np.ndarray = np.array(dict_data[b"data"])
    labels: np.ndarray = np.array(dict_data[b"labels"])
    return data, labels


def split_into_three_color_channels(
    image: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Cut one flat picture vector into three 32x32 float32 planes.

    The vector is read as three consecutive runs of 1024 values each
    (elements 0-1023, 1024-2047 and 2048-3071). Each run is converted to
    float32 and reshaped to 32 rows by 32 columns.

    Args:
        image: One-dimensional array holding at least 3072 values.

    Returns:
        The three planes in the order they appear in ``image``; the names
        used by the caller are red, green and blue.
    """
    plane_size = 32 * 32

    def cut_plane(index: int) -> np.ndarray:
        # Take the index-th run of 1024 values and turn it into a 2D plane.
        first = index * plane_size
        flat = image[first : first + plane_size]
        return flat.astype(np.float32).reshape(32, 32)

    return cut_plane(0), cut_plane(1), cut_plane(2)


def process_data_set(test_data_mode: bool) -> None:
    """Convert CIFAR-10 batch files into one pattern and one label .npy file.

    In test mode the single test batch (id 0) is converted into
    "TestPatternStorage.npy" / "TestLabelStorage.npy"; otherwise the five
    training batches (ids 1 to 5) are combined into
    "TrainPatternStorage.npy" / "TrainLabelStorage.npy".

    The pattern file holds a float32 array of shape
    (number_of_pictures, 32, 32, 3) that is divided by its global maximum.
    The label file holds a uint64 array with one entry per picture.

    Args:
        test_data_mode: Truthy selects the test batch, falsy the training
            batches.

    Returns:
        None. The results are written to the current working directory.
    """
    if test_data_mode:
        filename_out_pattern: str = "TestPatternStorage.npy"
        filename_out_label: str = "TestLabelStorage.npy"
        number_of_pictures: int = 10000
        start_id: int = 0
        end_id: int = 0
    else:
        filename_out_pattern = "TrainPatternStorage.npy"
        filename_out_label = "TrainLabelStorage.npy"
        number_of_pictures = 50000
        start_id = 1
        end_id = 5

    np_data: np.ndarray = np.zeros((number_of_pictures, 32, 32, 3), dtype=np.float32)
    np_label: np.ndarray = np.zeros((number_of_pictures), dtype=np.uint64)

    for batch_id in range(start_id, end_id + 1):
        filename, _, start_id_pattern = give_filenames(batch_id)
        pictures, labels = load_data(filename)

        stop_id_pattern = start_id_pattern + pictures.shape[0]

        # Each row stores three consecutive 1024-value planes. Viewing a row
        # as (3, 32, 32) and moving the plane axis to the end produces the
        # (32, 32, 3) layout for the whole batch at once, instead of copying
        # the planes picture by picture in a Python loop.
        # Assumes every row holds exactly 3072 values -- TODO confirm for
        # non-standard batch files.
        np_data[start_id_pattern:stop_id_pattern] = pictures.reshape(
            -1, 3, 32, 32
        ).transpose(0, 2, 3, 1)
        np_label[start_id_pattern:stop_id_pattern] = labels

    # Scale by the global maximum; skip this for all-zero data so that the
    # output does not turn into NaN through a division by zero.
    max_value = np.max(np_data)
    if max_value > 0:
        np_data /= max_value

    # Both arrays were allocated with their final dtype (float32 / uint64),
    # so they are saved directly without another full-size astype() copy.
    np.save(filename_out_pattern, np_data)
    np.save(filename_out_label, np_label)


# Convert the test batch first, then the five training batches.
# Both calls run whenever this file is executed or imported.
process_data_set(test_data_mode=True)
process_data_set(test_data_mode=False)
Example data converter 2022-04-30 02:06:41 +02:00			`# MIT License`
			`# Copyright 2022 University of Bremen`
			`#`
			`# Permission is hereby granted, free of charge, to any person obtaining`
			`# a copy of this software and associated documentation files (the "Software"),`
			`# to deal in the Software without restriction, including without limitation`
			`# the rights to use, copy, modify, merge, publish, distribute, sublicense,`
			`# and/or sell copies of the Software, and to permit persons to whom the`
			`# Software is furnished to do so, subject to the following conditions:`
			`#`
			`# The above copyright notice and this permission notice shall be included`
			`# in all copies or substantial portions of the Software.`
			`#`
			`# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,`
			`# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF`
			`# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.`
			`# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,`
			`# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR`
			`# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR`
			`# THE USE OR OTHER DEALINGS IN THE SOFTWARE.`
			`#`
			`#`
			`# David Rotermund ( davrot@uni-bremen.de )`
			`#`
			`#`
			`# Release history:`
			`# ================`
			`# 1.0.0 -- 01.05.2022: first release`
			`#`
			`#`

			`import numpy as np`
			`import pickle`


			`def give_filenames(id: int) -> tuple[str, str, int]:`
			`if id == 0:`
			`start_id: int = 0`
			`prefix: str = "Test"`
			`filename: str = "cifar-10-batches-py/test_batch"`
			`if id == 1:`
			`start_id = 0`
			`prefix = "Train"`
			`filename = "cifar-10-batches-py/data_batch_1"`
			`if id == 2:`
			`start_id = 10000`
			`prefix = "Train"`
			`filename = "cifar-10-batches-py/data_batch_2"`
			`if id == 3:`
			`start_id = 20000`
			`prefix = "Train"`
			`filename = "cifar-10-batches-py/data_batch_3"`
			`if id == 4:`
			`start_id = 30000`
			`prefix = "Train"`
			`filename = "cifar-10-batches-py/data_batch_4"`
			`if id == 5:`
			`start_id = 40000`
			`prefix = "Train"`
			`filename = "cifar-10-batches-py/data_batch_5"`
			`return filename, prefix, start_id`


			`def load_data(filename: str) -> tuple[np.ndarray, np.ndarray]:`
			`fo = open(filename, "rb")`
			`dict_data = pickle.load(fo, encoding="bytes")`
			`_, labels_temp, data_temp, _ = dict_data.items()`
			`data: np.ndarray = np.array(data_temp[1])`
			`labels: np.ndarray = np.array(labels_temp[1])`
			`return data, labels`


			`def split_into_three_color_channels(`
			`image: np.ndarray,`
			`) -> tuple[np.ndarray, np.ndarray, np.ndarray]:`
			`channel_r = image[0:1024].astype(np.float32)`
			`channel_r = channel_r.reshape(32, 32)`
			`channel_g = image[1024:2048].astype(np.float32)`
			`channel_g = channel_g.reshape(32, 32)`
			`channel_b = image[2048:3072].astype(np.float32)`
			`channel_b = channel_b.reshape(32, 32)`
			`return channel_r, channel_g, channel_b`


			`def process_data_set(test_data_mode: bool) -> None:`

			`if test_data_mode is True:`
			`filename_out_pattern: str = "TestPatternStorage.npy"`
			`filename_out_label: str = "TestLabelStorage.npy"`
			`number_of_pictures: int = 10000`
			`start_id: int = 0`
			`end_id: int = 0`
			`else:`
			`filename_out_pattern = "TrainPatternStorage.npy"`
			`filename_out_label = "TrainLabelStorage.npy"`
			`number_of_pictures = 50000`
			`start_id = 1`
			`end_id = 5`

			`np_data: np.ndarray = np.zeros((number_of_pictures, 32, 32, 3), dtype=np.float32)`
			`np_label: np.ndarray = np.zeros((number_of_pictures), dtype=np.uint64)`

			`for id in range(start_id, end_id + 1):`
			`filename, _, start_id_pattern = give_filenames(id)`
			`pictures, labels = load_data(filename)`

			`for i in range(0, pictures.shape[0]):`
			`channel_r, channel_g, channel_b = split_into_three_color_channels(`
			`pictures[i, :]`
			`)`
			`np_data[i + start_id_pattern, :, :, 0] = channel_r`
			`np_data[i + start_id_pattern, :, :, 1] = channel_g`
			`np_data[i + start_id_pattern, :, :, 2] = channel_b`
			`np_label[i + start_id_pattern] = labels[i]`

			`np_data /= np.max(np_data)`

			`label_storage: np.ndarray = np_label.astype(dtype=np.uint64)`
			`pattern_storage: np.ndarray = np_data.astype(dtype=np.float32)`

			`np.save(filename_out_pattern, pattern_storage)`
			`np.save(filename_out_label, label_storage)`


			`process_data_set(True)`
			`process_data_set(False)`