diff options
Diffstat (limited to 'tools/generate_datafile')
5 files changed, 306 insertions, 0 deletions
diff --git a/tools/generate_datafile/tf_dataset_converter/README.md b/tools/generate_datafile/tf_dataset_converter/README.md new file mode 100644 index 000000000..3d4612520 --- /dev/null +++ b/tools/generate_datafile/tf_dataset_converter/README.md @@ -0,0 +1,66 @@ +# tf dataset converter + +## What is tf dataset converter? + +_tf dataset converter_ is a tool which converts tensorflow datasets to datasets for `onert_train`. + +## Possible datasets +- Tensorflow datasets with [ClassLabel feature](https://www.tensorflow.org/datasets/api_docs/python/tfds/features/ClassLabel) + +## Prerequisite +- Python 3.8 (python3.8, python3.8-dev packages) +- Python packages required + +## Usage +usage: main.py [-h] [-s] [-d Dataset] [-o Dir] [-p Prefix] [-l N] [-t N] + +Convert a dataset of tensorflow to onert format + +options: + -h, --help show this help message and exit + -s, --show-datasets show dataset list + -d Dataset, --dataset-name Dataset + name of dataset to be converted (default: "fashion_mnist") + -o Dir, --out-dir Dir + relative path of the files to be created (default: "out") + -p Prefix, --prefix-name Prefix + prefix name of the file to be created (default: "") + -l N, --train-length N + Number of data for training (default: 1000) + -t N, --test-length N + Number of data for test (default: 100) + +## Example +### Install required packages +``` +$ python3 -m pip install -r requirements.txt +``` + +### Show dataset list +``` +$ python3 main.py --show-datasets +Dataset list : +[abstract_reasoning, +accentdb, +... +fashion_mnist, +... 
+robotics:mt_opt_sd] +``` + +### Convert dataset to onert format +``` +$ python3 main.py \ + --dataset-name fashion_mnist \ + --prefix-name fashion-mnist \ + --train-length 2000 \ + --test-length 200 +``` +``` +$ tree out +out +├── fashion-mnist.test.input.200.bin +├── fashion-mnist.test.output.200.bin +├── fashion-mnist.train.input.2000.bin +└── fashion-mnist.train.output.2000.bin +``` diff --git a/tools/generate_datafile/tf_dataset_converter/argparser.py b/tools/generate_datafile/tf_dataset_converter/argparser.py new file mode 100644 index 000000000..daa7b5f07 --- /dev/null +++ b/tools/generate_datafile/tf_dataset_converter/argparser.py @@ -0,0 +1,54 @@ +'''Parse arguments''' + +import argparse + + +def _create_parser(): + parser = argparse.ArgumentParser( + description='Convert a dataset of tensorflow to onert format') + parser.add_argument( + '-s', '--show-datasets', action='store_true', help='show dataset list') + parser.add_argument( + '-d', + '--dataset-name', + type=str, + default='fashion_mnist', + metavar='Dataset', + help='name of dataset to be converted (default: "fashion_mnist")') + parser.add_argument( + '-o', + '--out-dir', + type=str, + default='out', + metavar='Dir', + help='relative path of the files to be created (default: "out")') + parser.add_argument( + '-p', + '--prefix-name', + type=str, + default='', + metavar='Prefix', + help='prefix name of the file to be created (default: "")') + parser.add_argument( + '-l', + '--train-length', + type=int, + default=1000, + metavar='N', + help='Number of data for training (default: 1000)') + parser.add_argument( + '-t', + '--test-length', + type=int, + default=100, + metavar='N', + help='Number of data for test (default: 100)') + + return parser + + +def parse_args(): + parser = _create_parser() + args = parser.parse_args() + + return args diff --git a/tools/generate_datafile/tf_dataset_converter/datasets.py b/tools/generate_datafile/tf_dataset_converter/datasets.py new file mode 100644 index 
000000000..d63320055 --- /dev/null +++ b/tools/generate_datafile/tf_dataset_converter/datasets.py @@ -0,0 +1,80 @@ +'''Deal with the tensorflow dataset.''' + +import tensorflow as tf +import tensorflow_datasets as tfds +from pathlib import Path + +dataset_root_dir = Path(__file__).parent.absolute() / 'data' + + +class DatasetLoader(): + ''' + Loader of tensorflow datasets + ''' + + def load(self, dataset_name): + (ds_train, ds_test), ds_info = tfds.load( + dataset_name, + split=['train', 'test'], + data_dir=dataset_root_dir, + shuffle_files=True, + as_supervised=True, + with_info=True, + ) + + self.ds_info = ds_info + + def _normalize_img(image, label): + """Normalizes images: `uint8` -> `float32`.""" + return tf.cast(image, tf.float32) / 255., label + + self.ds_train = ds_train.map(_normalize_img) + self.ds_test = ds_test.map(_normalize_img) + + for images, labels in self.ds_train: + print(f'Shape of images : {images.shape}') + print(f'Shape of labels: {labels.shape} {labels.dtype}') + break + + def get_dataset_names(self): + return tfds.list_builders() + + def class_names(self): + ''' + Get class names + ''' + return self.ds_info.features['label'].names + + def num_classes(self): + ''' + Get the number of classes + ''' + return self.ds_info.features['label'].num_classes + + def get_num_train_examples(self): + ''' + Get examples for training + ''' + return self.ds_info.splits['train'].num_examples + + def get_num_test_examples(self): + ''' + Get examples for testing + ''' + return self.ds_info.splits['test'].num_examples + + def prefetched_datasets(self): + ''' + get prefetched datasets for training. + + Return: + Datasets for training and testing. 
+ ''' + + train_dataset = self.ds_train.cache() + train_dataset = train_dataset.shuffle(self.ds_info.splits['train'].num_examples) + + test_dataset = self.ds_test.cache() + + # return train_dataset, test_dataset + return self.ds_train.cache(), self.ds_test.cache() diff --git a/tools/generate_datafile/tf_dataset_converter/main.py b/tools/generate_datafile/tf_dataset_converter/main.py new file mode 100644 index 000000000..77e339965 --- /dev/null +++ b/tools/generate_datafile/tf_dataset_converter/main.py @@ -0,0 +1,98 @@ +################################################################################ +# Parse arguments +################################################################################ + +from argparser import parse_args + +# You can see arguments' information in argparser.py +args = parse_args() + +################################################################################ +# Load a dataset of tensorflow +################################################################################ + +# Disable tensorflow cpp warning log +import os + +FILTERING_WARNING = '2' +os.environ['TF_CPP_MIN_LOG_LEVEL'] = FILTERING_WARNING + +from datasets import DatasetLoader +from pathlib import Path +import tensorflow as tf +import numpy as np + +ds_loader = DatasetLoader() + +if args.show_datasets: + print('Dataset list :') + names = ',\n'.join(ds_loader.get_dataset_names()) + print(f'[{names}]') + exit(0) + +ds_loader.load(args.dataset_name) +ds_train, ds_test = ds_loader.prefetched_datasets() +nums_train_ds = ds_loader.get_num_train_examples() +nums_test_ds = ds_loader.get_num_test_examples() +print(f'class names : {ds_loader.class_names()}') +print(f'train dataset len : {nums_train_ds}') +print(f'test dataset len : {nums_test_ds}') + +################################################################################ +# Convert tensorflow dataset to onert format +################################################################################ 
+Path(f'{args.out_dir}').mkdir(parents=True, exist_ok=True) +prefix_name = f'{args.out_dir}/{args.prefix_name}' +if args.prefix_name != '': + prefix_name += '.' + +nums_train = args.train_length +if (nums_train > nums_train_ds): + print( + f'Oops! The number of data for training in the dataset is less than {nums_train}') + exit(1) + +nums_test = args.test_length +if (nums_test > nums_test_ds): + print(f'Oops! The number of data for test in the dataset is less than {nums_test}') + exit(1) + + +def _only_image(image, _): + return image + + +def _only_label(_, label): + return label + + +def _label_to_array(label): + arr = np.zeros(ds_loader.num_classes(), dtype=float) + arr[label] = 1. + tensor = tf.convert_to_tensor(arr, tf.float32) + return tensor + + +file_path_list = [ + f'{prefix_name}train.input.{nums_train}.bin', + f'{prefix_name}test.input.{nums_test}.bin', + f'{prefix_name}train.output.{nums_train}.bin', + f'{prefix_name}test.output.{nums_test}.bin' +] + +ds_list = [ + ds_train.take(nums_train).map(_only_image), + ds_test.take(nums_test).map(_only_image), + [_label_to_array(label) for label in ds_train.take(nums_train).map(_only_label)], + [_label_to_array(label) for label in ds_test.take(nums_test).map(_only_label)] +] + +for i in range(4): + file_path = file_path_list[i] + with open(file_path, 'wb') as f: + ds = ds_list[i] + for tensor in ds: + f.write(tensor.numpy().tobytes()) + f.close() + +print('The data files are created!') diff --git a/tools/generate_datafile/tf_dataset_converter/requirements.txt b/tools/generate_datafile/tf_dataset_converter/requirements.txt new file mode 100644 index 000000000..c34025fe6 --- /dev/null +++ b/tools/generate_datafile/tf_dataset_converter/requirements.txt @@ -0,0 +1,8 @@ +argparse +numpy +pathlib +# Please upgrade pip version before installing these requirements. +# pip 20.2 and earlier doesn't have true dependency resolution. 
+# Refer to https://pip.pypa.io/en/latest/user_guide/#requirements-files +tensorflow==2.8.2 # This version specifies the upper bound for protobuf +tensorflow_datasets==4.7.0 |