From 6975d84087553d2f4ef0a2dc166a460ff4616aeb Mon Sep 17 00:00:00 2001 From: David Rotermund <54365609+davrot@users.noreply.github.com> Date: Sun, 15 Jan 2023 00:53:58 +0100 Subject: [PATCH] Add files via upload --- dataset_collection/DATA_CIFAR10/convert.py | 126 ++++++++++++++ dataset_collection/DATA_CIFAR10/data_url.txt | 8 + dataset_collection/DATA_CIFAR10/dataset.json | 4 + .../DATA_FASHION_MNIST/convert.py | 161 ++++++++++++++++++ .../DATA_FASHION_MNIST/data_url.txt | 8 + .../DATA_FASHION_MNIST/dataset.json | 4 + dataset_collection/DATA_MNIST/convert.py | 161 ++++++++++++++++++ dataset_collection/DATA_MNIST/data_url.txt | 8 + dataset_collection/DATA_MNIST/dataset.json | 4 + 9 files changed, 484 insertions(+) create mode 100644 dataset_collection/DATA_CIFAR10/convert.py create mode 100644 dataset_collection/DATA_CIFAR10/data_url.txt create mode 100644 dataset_collection/DATA_CIFAR10/dataset.json create mode 100644 dataset_collection/DATA_FASHION_MNIST/convert.py create mode 100644 dataset_collection/DATA_FASHION_MNIST/data_url.txt create mode 100644 dataset_collection/DATA_FASHION_MNIST/dataset.json create mode 100644 dataset_collection/DATA_MNIST/convert.py create mode 100644 dataset_collection/DATA_MNIST/data_url.txt create mode 100644 dataset_collection/DATA_MNIST/dataset.json diff --git a/dataset_collection/DATA_CIFAR10/convert.py b/dataset_collection/DATA_CIFAR10/convert.py new file mode 100644 index 0000000..badbfa0 --- /dev/null +++ b/dataset_collection/DATA_CIFAR10/convert.py @@ -0,0 +1,126 @@ +# MIT License +# Copyright 2022 University of Bremen +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do 
import pickle

import numpy as np

# Batch id -> (pickle file inside the extracted archive, "Test"/"Train"
# prefix, index of the batch's first picture in the output array).
_BATCHES: dict[int, tuple[str, str, int]] = {
    0: ("cifar-10-batches-py/test_batch", "Test", 0),
    1: ("cifar-10-batches-py/data_batch_1", "Train", 0),
    2: ("cifar-10-batches-py/data_batch_2", "Train", 10000),
    3: ("cifar-10-batches-py/data_batch_3", "Train", 20000),
    4: ("cifar-10-batches-py/data_batch_4", "Train", 30000),
    5: ("cifar-10-batches-py/data_batch_5", "Train", 40000),
}


def give_filenames(id: int) -> tuple[str, str, int]:
    """Return ``(filename, prefix, start_id)`` for a CIFAR-10 batch id.

    Id 0 is the test batch, ids 1 to 5 are the five training batches.
    ``start_id`` is the position of the batch's first picture in the
    combined output array.

    Raises:
        ValueError: if ``id`` is not in 0..5 (the original if-chain failed
            with an UnboundLocalError in that case).
    """
    # The parameter keeps its original name (it is part of the signature)
    # even though it shadows the builtin ``id``.
    try:
        return _BATCHES[id]
    except KeyError:
        raise ValueError(f"unknown CIFAR-10 batch id: {id!r}") from None


def load_data(filename: str) -> tuple[np.ndarray, np.ndarray]:
    """Load one CIFAR-10 python batch and return ``(data, labels)``.

    The entries are looked up by their keys (``b"data"`` / ``b"labels"``)
    instead of relying on the order and number of items in the pickled
    dict, and the file is closed even when unpickling fails.
    """
    # NOTE(security): pickle.load can execute arbitrary code. Only feed this
    # function batch files from the official CIFAR-10 archive.
    with open(filename, "rb") as fo:
        dict_data = pickle.load(fo, encoding="bytes")

    data: np.ndarray = np.array(dict_data[b"data"])
    labels: np.ndarray = np.array(dict_data[b"labels"])
    return data, labels


def split_into_three_color_channels(
    image: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Split one flat picture into its red, green and blue 32x32 planes.

    The first 1024 values are the red plane, the next 1024 the green plane
    and the last 1024 the blue plane. All planes are returned as float32.
    """
    planes = image[0:3072].reshape(3, 32, 32).astype(np.float32)
    return planes[0], planes[1], planes[2]


def process_data_set(test_data_mode: bool) -> None:
    """Convert the CIFAR-10 batches into two ``.npy`` files.

    With ``test_data_mode`` set, the test batch is written to
    ``TestPatternStorage.npy`` / ``TestLabelStorage.npy``; otherwise the
    five training batches are written to ``TrainPatternStorage.npy`` /
    ``TrainLabelStorage.npy``. Patterns are stored as float32 with shape
    (pictures, 32, 32, 3), scaled by the largest value found; labels are
    stored as uint64.
    """
    if test_data_mode:
        filename_out_pattern: str = "TestPatternStorage.npy"
        filename_out_label: str = "TestLabelStorage.npy"
        number_of_pictures: int = 10000
        first_batch: int = 0
        last_batch: int = 0
    else:
        filename_out_pattern = "TrainPatternStorage.npy"
        filename_out_label = "TrainLabelStorage.npy"
        number_of_pictures = 50000
        first_batch = 1
        last_batch = 5

    np_data: np.ndarray = np.zeros((number_of_pictures, 32, 32, 3), dtype=np.float32)
    np_label: np.ndarray = np.zeros(number_of_pictures, dtype=np.uint64)

    for batch_id in range(first_batch, last_batch + 1):
        filename, _, offset = give_filenames(batch_id)
        pictures, labels = load_data(filename)
        stop = offset + pictures.shape[0]

        # (n, 3072) -> (n, channel, row, column) -> (n, row, column, channel);
        # one vectorised copy per batch instead of a Python loop per picture.
        np_data[offset:stop] = pictures.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
        np_label[offset:stop] = labels

    # Scale to 0 ... 1.0; skip the division for all-zero data so the result
    # does not turn into NaN.
    peak = np_data.max()
    if peak > 0:
        np_data /= peak

    np.save(filename_out_pattern, np_data)
    np.save(filename_out_label, np_label)


if __name__ == "__main__":
    process_data_set(True)
    process_data_set(False)
+https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz + +Then +tar -xvzf cifar-10-python.tar.gz +python convert.py diff --git a/dataset_collection/DATA_CIFAR10/dataset.json b/dataset_collection/DATA_CIFAR10/dataset.json new file mode 100644 index 0000000..01eb008 --- /dev/null +++ b/dataset_collection/DATA_CIFAR10/dataset.json @@ -0,0 +1,4 @@ +{ + "data_path": "./DATA_CIFAR10/", + "data_mode": "CIFAR10" +} diff --git a/dataset_collection/DATA_FASHION_MNIST/convert.py b/dataset_collection/DATA_FASHION_MNIST/convert.py new file mode 100644 index 0000000..dc2e15b --- /dev/null +++ b/dataset_collection/DATA_FASHION_MNIST/convert.py @@ -0,0 +1,161 @@ +# MIT License +# Copyright 2022 University of Bremen +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +# THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
import numpy as np


class ReadLabel:
    """Read the labels from an MNIST label file.

    File layout (32 bit integers are stored MSB first):

        [offset] [type]          [value]          [description]
        0000     32 bit integer  0x00000801(2049) magic number
        0004     32 bit integer  60000            number of items
        0008     unsigned byte   ??               label
        0009     unsigned byte   ??               label
        ........
        xxxx     unsigned byte   ??               label

    The label values are 0 to 9.
    """

    _MAGIC_NUMBER = 2049

    def __init__(self, filename):
        # Path of the label file that was read.
        self.filename: str = filename
        # uint8 array with one label per item; empty if the file is invalid.
        self.data = self.read_from_file(filename)

    def read_from_file(self, filename):
        """Return the labels stored in ``filename`` as a uint8 array.

        An empty array (``np.zeros(0)``) is returned when the header is
        truncated, the magic number is wrong or the item count is below 1.
        If the payload is shorter than announced, only the labels actually
        present are returned.
        """
        big_endian_uint32 = np.dtype(">u4")

        # ``with`` closes the file even if reading raises.
        with open(filename, "rb") as file:
            header = file.read(8)
            if len(header) < 8:
                # Too short to hold magic number and item count.
                return np.zeros(0)

            magic_flag, number_of_elements = (
                int(value) for value in np.frombuffer(header, big_endian_uint32)
            )
            if magic_flag != self._MAGIC_NUMBER or number_of_elements < 1:
                return np.zeros(0)

            payload = file.read(number_of_elements)

        return np.frombuffer(payload, dtype=np.uint8)
class ReadPicture:
    """Read the images from an MNIST image file.

    File layout (32 bit integers are read MSB first):

        [offset] [type]          [value]          [description]
        0000     32 bit integer  0x00000803(2051) magic number
        0004     32 bit integer  60000            number of images
        0008     32 bit integer  28               number of rows
        0012     32 bit integer  28               number of columns
        0016     unsigned byte   ??               pixel
        0017     unsigned byte   ??               pixel
        ........
        xxxx     unsigned byte   ??               pixel

    Pixels are organized row-wise. Pixel values are 0 to 255. 0 means
    background (white), 255 means foreground (black).
    """

    _MAGIC_NUMBER = 2051
    _EDGE_LENGTH = 28

    def __init__(self, filename):
        # Path of the image file that was read.
        self.filename: str = filename
        # uint8 array of shape (images, rows, columns); empty if invalid.
        self.data = self.read_from_file(filename)

    def read_from_file(self, filename):
        """Return the images stored in ``filename`` as a uint8 array.

        The result has shape (images, rows, columns). An empty array
        (``np.zeros(0)``) is returned when the header is truncated, the
        magic number is wrong, the image count is below 1 or the pictures
        are not 28 x 28.

        Raises:
            ValueError: if the pixel payload is shorter than the header
                announces (raised by ``reshape``).
        """
        big_endian_uint32 = np.dtype(">u4")

        # ``with`` closes the file even if reading raises.
        with open(filename, "rb") as file:
            header = file.read(16)
            if len(header) < 16:
                # Too short to hold the four header fields.
                return np.zeros(0)

            (
                magic_flag,
                number_of_elements,
                number_of_rows,
                number_of_columns,
            ) = (int(value) for value in np.frombuffer(header, big_endian_uint32))

            if (
                magic_flag != self._MAGIC_NUMBER
                or number_of_elements < 1
                or number_of_rows != self._EDGE_LENGTH
                or number_of_columns != self._EDGE_LENGTH
            ):
                return np.zeros(0)

            payload = file.read(number_of_elements * number_of_rows * number_of_columns)

        data = np.frombuffer(payload, dtype=np.uint8)
        # Pixels are stored row-wise, so rows come before columns.
        return data.reshape(number_of_elements, number_of_rows, number_of_columns)


def proprocess_data_set(test_mode):
    """Convert one MNIST-format image/label file pair into ``.npy`` files.

    With ``test_mode`` set, ``t10k-images-idx3-ubyte`` and
    ``t10k-labels-idx1-ubyte`` are written to ``TestPatternStorage.npy`` /
    ``TestLabelStorage.npy``; otherwise the ``train-*`` files are written to
    ``TrainPatternStorage.npy`` / ``TrainLabelStorage.npy``. Patterns are
    stored as float32 scaled by the largest pixel value, labels as uint64.

    Raises:
        ValueError: if the image file is invalid or the number of labels
            differs from the number of images.
    """
    if test_mode:
        filename_out_pattern: str = "TestPatternStorage.npy"
        filename_out_label: str = "TestLabelStorage.npy"
        filename_in_image: str = "t10k-images-idx3-ubyte"
        filename_in_label: str = "t10k-labels-idx1-ubyte"
    else:
        filename_out_pattern = "TrainPatternStorage.npy"
        filename_out_label = "TrainLabelStorage.npy"
        filename_in_image = "train-images-idx3-ubyte"
        filename_in_label = "train-labels-idx1-ubyte"

    pictures = ReadPicture(filename_in_image)
    labels = ReadLabel(filename_in_label)

    # Fail with a clear message instead of the "zero-size array" error that
    # np.max would raise on the empty array of an invalid image file.
    if pictures.data.size == 0:
        raise ValueError(f"{filename_in_image!r} is not a valid 28x28 image file")
    if labels.data.shape[0] != pictures.data.shape[0]:
        raise ValueError(
            f"{filename_in_label!r} holds {labels.data.shape[0]} labels but "
            f"{filename_in_image!r} holds {pictures.data.shape[0]} images"
        )

    # Down to 0 ... 1.0 (a single float32 copy; all-zero data is left as is
    # so the division cannot produce NaN).
    pattern_storage = pictures.data.astype(np.float32)
    max_value = pattern_storage.max()
    if max_value > 0:
        pattern_storage /= max_value

    label_storage = labels.data.astype(np.uint64)

    np.save(filename_out_pattern, pattern_storage)
    np.save(filename_out_label, label_storage)


if __name__ == "__main__":
    proprocess_data_set(True)
    proprocess_data_set(False)
import numpy as np


class ReadLabel:
    """Read the labels from an MNIST label file.

    File layout (32 bit integers are stored MSB first):

        [offset] [type]          [value]          [description]
        0000     32 bit integer  0x00000801(2049) magic number
        0004     32 bit integer  60000            number of items
        0008     unsigned byte   ??               label
        0009     unsigned byte   ??               label
        ........
        xxxx     unsigned byte   ??               label

    The label values are 0 to 9.
    """

    _MAGIC_NUMBER = 2049

    def __init__(self, filename):
        # Path of the label file that was read.
        self.filename: str = filename
        # uint8 array with one label per item; empty if the file is invalid.
        self.data = self.read_from_file(filename)

    def read_from_file(self, filename):
        """Return the labels stored in ``filename`` as a uint8 array.

        An empty array (``np.zeros(0)``) is returned when the header is
        truncated, the magic number is wrong or the item count is below 1.
        If the payload is shorter than announced, only the labels actually
        present are returned.
        """
        big_endian_uint32 = np.dtype(">u4")

        # ``with`` closes the file even if reading raises.
        with open(filename, "rb") as file:
            header = file.read(8)
            if len(header) < 8:
                # Too short to hold magic number and item count.
                return np.zeros(0)

            magic_flag, number_of_elements = (
                int(value) for value in np.frombuffer(header, big_endian_uint32)
            )
            if magic_flag != self._MAGIC_NUMBER or number_of_elements < 1:
                return np.zeros(0)

            payload = file.read(number_of_elements)

        return np.frombuffer(payload, dtype=np.uint8)
class ReadPicture:
    """Read the images from an MNIST image file.

    File layout (32 bit integers are read MSB first):

        [offset] [type]          [value]          [description]
        0000     32 bit integer  0x00000803(2051) magic number
        0004     32 bit integer  60000            number of images
        0008     32 bit integer  28               number of rows
        0012     32 bit integer  28               number of columns
        0016     unsigned byte   ??               pixel
        0017     unsigned byte   ??               pixel
        ........
        xxxx     unsigned byte   ??               pixel

    Pixels are organized row-wise. Pixel values are 0 to 255. 0 means
    background (white), 255 means foreground (black).
    """

    _MAGIC_NUMBER = 2051
    _EDGE_LENGTH = 28

    def __init__(self, filename):
        # Path of the image file that was read.
        self.filename: str = filename
        # uint8 array of shape (images, rows, columns); empty if invalid.
        self.data = self.read_from_file(filename)

    def read_from_file(self, filename):
        """Return the images stored in ``filename`` as a uint8 array.

        The result has shape (images, rows, columns). An empty array
        (``np.zeros(0)``) is returned when the header is truncated, the
        magic number is wrong, the image count is below 1 or the pictures
        are not 28 x 28.

        Raises:
            ValueError: if the pixel payload is shorter than the header
                announces (raised by ``reshape``).
        """
        big_endian_uint32 = np.dtype(">u4")

        # ``with`` closes the file even if reading raises.
        with open(filename, "rb") as file:
            header = file.read(16)
            if len(header) < 16:
                # Too short to hold the four header fields.
                return np.zeros(0)

            (
                magic_flag,
                number_of_elements,
                number_of_rows,
                number_of_columns,
            ) = (int(value) for value in np.frombuffer(header, big_endian_uint32))

            if (
                magic_flag != self._MAGIC_NUMBER
                or number_of_elements < 1
                or number_of_rows != self._EDGE_LENGTH
                or number_of_columns != self._EDGE_LENGTH
            ):
                return np.zeros(0)

            payload = file.read(number_of_elements * number_of_rows * number_of_columns)

        data = np.frombuffer(payload, dtype=np.uint8)
        # Pixels are stored row-wise, so rows come before columns.
        return data.reshape(number_of_elements, number_of_rows, number_of_columns)


def proprocess_data_set(test_mode):
    """Convert one MNIST-format image/label file pair into ``.npy`` files.

    With ``test_mode`` set, ``t10k-images-idx3-ubyte`` and
    ``t10k-labels-idx1-ubyte`` are written to ``TestPatternStorage.npy`` /
    ``TestLabelStorage.npy``; otherwise the ``train-*`` files are written to
    ``TrainPatternStorage.npy`` / ``TrainLabelStorage.npy``. Patterns are
    stored as float32 scaled by the largest pixel value, labels as uint64.

    Raises:
        ValueError: if the image file is invalid or the number of labels
            differs from the number of images.
    """
    if test_mode:
        filename_out_pattern: str = "TestPatternStorage.npy"
        filename_out_label: str = "TestLabelStorage.npy"
        filename_in_image: str = "t10k-images-idx3-ubyte"
        filename_in_label: str = "t10k-labels-idx1-ubyte"
    else:
        filename_out_pattern = "TrainPatternStorage.npy"
        filename_out_label = "TrainLabelStorage.npy"
        filename_in_image = "train-images-idx3-ubyte"
        filename_in_label = "train-labels-idx1-ubyte"

    pictures = ReadPicture(filename_in_image)
    labels = ReadLabel(filename_in_label)

    # Fail with a clear message instead of the "zero-size array" error that
    # np.max would raise on the empty array of an invalid image file.
    if pictures.data.size == 0:
        raise ValueError(f"{filename_in_image!r} is not a valid 28x28 image file")
    if labels.data.shape[0] != pictures.data.shape[0]:
        raise ValueError(
            f"{filename_in_label!r} holds {labels.data.shape[0]} labels but "
            f"{filename_in_image!r} holds {pictures.data.shape[0]} images"
        )

    # Down to 0 ... 1.0 (a single float32 copy; all-zero data is left as is
    # so the division cannot produce NaN).
    pattern_storage = pictures.data.astype(np.float32)
    max_value = pattern_storage.max()
    if max_value > 0:
        pattern_storage /= max_value

    label_storage = labels.data.astype(np.uint64)

    np.save(filename_out_pattern, pattern_storage)
    np.save(filename_out_label, label_storage)


if __name__ == "__main__":
    proprocess_data_set(True)
    proprocess_data_set(False)