diff options
author | Sergey Karayev <sergeykarayev@gmail.com> | 2014-03-16 19:55:59 -0700 |
---|---|---|
committer | Sergey Karayev <sergeykarayev@gmail.com> | 2014-03-16 19:55:59 -0700 |
commit | 1fce1c424de0ef9dde54432e7053c9648e0bfa83 (patch) | |
tree | c8c15c9c4eb44e4fff32f4509ea20bc560eb8616 /src | |
parent | 6ec66f33fc45bbddc2cc04be5248384a25bf9f60 (diff) | |
download | caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.tar.gz caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.tar.bz2 caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.zip |
HDF5DataLayer source is now a list of filenames
Diffstat (limited to 'src')
-rw-r--r-- | src/caffe/layers/hdf5_data_layer.cpp | 38 | ||||
-rw-r--r-- | src/caffe/layers/hdf5_data_layer.cu | 10 | ||||
-rw-r--r-- | src/caffe/test/test_data/generate_sample_data.py | 30 | ||||
-rw-r--r-- | src/caffe/test/test_data/sample_data.h5 | bin | 10184 -> 10184 bytes | |||
-rw-r--r-- | src/caffe/test/test_data/sample_data_2_gzip.h5 | bin | 0 -> 9992 bytes | |||
-rw-r--r-- | src/caffe/test/test_data/sample_data_list.txt | 2 | ||||
-rw-r--r-- | src/caffe/test/test_hdf5data_layer.cpp | 95 |
7 files changed, 105 insertions, 70 deletions
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 5ac594b1..7f993a6c 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -4,11 +4,14 @@ Contributors: - Tobias Domhan, 2014. TODO: -- only load parts of the file, in accordance with a prototxt param "max_mem" +- load file in a separate thread ("prefetch") +- can be smarter about the memcpy call instead of doing it row-by-row */ #include <stdint.h> #include <string> #include <vector> +#include <iostream> +#include <fstream> #include "hdf5.h" #include "hdf5_hl.h" @@ -61,9 +64,23 @@ void HDF5DataLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom, CHECK_EQ(bottom.size(), 0) << "HDF5DataLayer takes no input blobs."; CHECK_EQ(top->size(), 2) << "HDF5DataLayer takes two blobs as output."; - // Load the HDF5 file and initialize the counter. - const char* hdf_filename = this->layer_param_.source().c_str(); - load_hdf5_file(hdf_filename); + // Read the source to parse the filenames. + LOG(INFO) << "Loading filename from " << this->layer_param_.source(); + hdf_filenames_.clear(); + std::ifstream myfile(this->layer_param_.source().c_str()); + if (myfile.is_open()) { + string line = ""; + while (myfile >> line) { + hdf_filenames_.push_back(line); + } + } + myfile.close(); + num_files_ = hdf_filenames_.size(); + current_file_ = 0; + LOG(INFO) << "Number of files: " << num_files_; + + // Load the first HDF5 file and initialize the line counter. + load_hdf5_file(hdf_filenames_[current_file_].c_str()); current_row_ = 0; // Reshape blobs. 
@@ -83,10 +100,18 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const int data_count = (*top)[0]->count() / (*top)[0]->num(); const int label_data_count = (*top)[1]->count() / (*top)[1]->num(); - //TODO: consolidate into a single memcpy call - for (int i = 0; i < batchsize; ++i, ++current_row_) { if (current_row_ == data_dims_[0]) { + if (num_files_ > 1) { + current_file_ += 1; + + if (current_file_ == num_files_) { + current_file_ = 0; + LOG(INFO) << "looping around to first file"; + } + + load_hdf5_file(hdf_filenames_[current_file_].c_str()); + } current_row_ = 0; } @@ -100,7 +125,6 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, } } - // The backward operations are dummy - they do not carry any computation. template <typename Dtype> Dtype HDF5DataLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top, diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu index 773ebc39..7a31a603 100644 --- a/src/caffe/layers/hdf5_data_layer.cu +++ b/src/caffe/layers/hdf5_data_layer.cu @@ -28,6 +28,16 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom, for (int i = 0; i < batchsize; ++i, ++current_row_) { if (current_row_ == data_dims_[0]) { + if (num_files_ > 1) { + current_file_ += 1; + + if (current_file_ == num_files_) { + current_file_ = 0; + LOG(INFO) << "looping around to first file"; + } + + load_hdf5_file(hdf_filenames_[current_file_].c_str()); + } current_row_ = 0; } diff --git a/src/caffe/test/test_data/generate_sample_data.py b/src/caffe/test/test_data/generate_sample_data.py index ab55305b..0d8f5aa9 100644 --- a/src/caffe/test/test_data/generate_sample_data.py +++ b/src/caffe/test/test_data/generate_sample_data.py @@ -1,7 +1,7 @@ """ Generate data used in the HDF5DataLayer test. 
""" - +import os import numpy as np import h5py @@ -9,11 +9,31 @@ num_cols = 8 num_rows = 10 height = 5 width = 5 -data = np.arange(num_cols * num_rows * height * width).reshape(num_rows, num_cols, height, width) +total_size = num_cols * num_rows * height * width + +data = np.arange(total_size) +data = data.reshape(num_rows, num_cols, height, width) +data = data.astype('float32') label = np.arange(num_rows)[:, np.newaxis] +label = label.astype('float32') + print data print label -with h5py.File('./sample_data.h5', 'w') as f: - f['data'] = data.astype('float32') - f['label'] = label.astype('float32') +with h5py.File(os.path.dirname(__file__) + '/sample_data.h5', 'w') as f: + f['data'] = data + f['label'] = label + +with h5py.File(os.path.dirname(__file__) + '/sample_data_2_gzip.h5', 'w') as f: + f.create_dataset( + 'data', data=data + total_size, + compression='gzip', compression_opts=1 + ) + f.create_dataset( + 'label', data=label, + compression='gzip', compression_opts=1 + ) + +with open(os.path.dirname(__file__) + '/sample_data_list.txt', 'w') as f: + f.write(os.path.dirname(__file__) + '/sample_data.h5\n') + f.write(os.path.dirname(__file__) + '/sample_data_2_gzip.h5\n') diff --git a/src/caffe/test/test_data/sample_data.h5 b/src/caffe/test/test_data/sample_data.h5 Binary files differ index db245bac..a1f923a7 100644 --- a/src/caffe/test/test_data/sample_data.h5 +++ b/src/caffe/test/test_data/sample_data.h5 diff --git a/src/caffe/test/test_data/sample_data_2_gzip.h5 b/src/caffe/test/test_data/sample_data_2_gzip.h5 Binary files differ new file mode 100644 index 00000000..56c0a740 --- /dev/null +++ b/src/caffe/test/test_data/sample_data_2_gzip.h5 diff --git a/src/caffe/test/test_data/sample_data_list.txt b/src/caffe/test/test_data/sample_data_list.txt new file mode 100644 index 00000000..cdf343fc --- /dev/null +++ b/src/caffe/test/test_data/sample_data_list.txt @@ -0,0 +1,2 @@ +src/caffe/test/test_data/sample_data.h5 +src/caffe/test/test_data/sample_data_2_gzip.h5 
diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp index d4f3c4a7..59aee0c5 100644 --- a/src/caffe/test/test_hdf5data_layer.cpp +++ b/src/caffe/test/test_hdf5data_layer.cpp @@ -31,10 +31,8 @@ class HDF5DataLayerTest : public ::testing::Test { blob_top_vec_.push_back(blob_top_data_); blob_top_vec_.push_back(blob_top_label_); - // TODO: generate sample HDF5 file on the fly. - // For now, use example HDF5 file. - // TODO: how to best deal with the relativeness of the path? - filename = new string("src/caffe/test/test_data/sample_data.h5"); + // Check out generate_sample_data.py in the same directory. + filename = new string("src/caffe/test/test_data/sample_data_list.txt"); LOG(INFO) << "Using sample HDF5 data file " << filename; } @@ -80,62 +78,43 @@ TYPED_TEST(HDF5DataLayerTest, TestRead) { EXPECT_EQ(this->blob_top_label_->height(), 1); EXPECT_EQ(this->blob_top_label_->width(), 1); - const int data_size = num_cols * height * width; - - // Go through the data 100 times. - for (int iter = 0; iter < 100; ++iter) { - layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_); - - // On even iterations, we're reading the first half of the data. - // On odd iterations, we're reading the second half of the data. - int label_offset = (iter % 2 == 0) ? 0 : batchsize; - int data_offset = (iter % 2 == 0) ? 
0 : batchsize * data_size; - - for (int i = 0; i < batchsize; ++i) { - EXPECT_EQ( - label_offset + i, - this->blob_top_label_->cpu_data()[i]); - } - for (int i = 0; i < batchsize; ++i) { - for (int j = 0; j < num_cols; ++j) { - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - int idx = i * num_cols * height * width + j * height * width + h * width + w; - EXPECT_EQ( - data_offset + idx, - this->blob_top_data_->cpu_data()[idx]) - << "debug: i " << i << " j " << j; - } - } - } + for (int t=0; t<2; ++t) { + if (t == 0) { + Caffe::set_mode(Caffe::CPU); + } else { + Caffe::set_mode(Caffe::GPU); } - } - // Exact same test in GPU mode. - Caffe::set_mode(Caffe::GPU); - // Go through the data 100 times. - for (int iter = 0; iter < 100; ++iter) { - layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_); - - // On even iterations, we're reading the first half of the data. - // On odd iterations, we're reading the second half of the data. - int label_offset = (iter % 2 == 0) ? 0 : batchsize; - int data_offset = (iter % 2 == 0) ? 0 : batchsize * data_size; - - for (int i = 0; i < batchsize; ++i) { - EXPECT_EQ( - label_offset + i, - this->blob_top_label_->cpu_data()[i]); - } - for (int i = 0; i < batchsize; ++i) { - for (int j = 0; j < num_cols; ++j) { - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - int idx = i * num_cols * height * width + j * height * width + h * width + w; - EXPECT_EQ( - data_offset + idx, - this->blob_top_data_->cpu_data()[idx]) - << "debug: i " << i << " j " << j; + // Go through the data 100 times (50 batches). + const int data_size = num_cols * height * width; + for (int iter = 0; iter < 100; ++iter) { + layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_); + + // On even iterations, we're reading the first half of the data. + // On odd iterations, we're reading the second half of the data. + int label_offset = (iter % 2 == 0) ? 0 : batchsize; + int data_offset = (iter % 2 == 0) ? 
0 : batchsize * data_size; + + // Every two iterations we are reading the second file, + // which has the same labels, but data is offset by total data size, + // which is 2000 (see generate_sample_data). + int file_offset = (iter % 4 < 2) ? 0 : 2000; + + for (int i = 0; i < batchsize; ++i) { + EXPECT_EQ( + label_offset + i, + this->blob_top_label_->cpu_data()[i]); + } + for (int i = 0; i < batchsize; ++i) { + for (int j = 0; j < num_cols; ++j) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + int idx = i * num_cols * height * width + j * height * width + h * width + w; + EXPECT_EQ( + file_offset + data_offset + idx, + this->blob_top_data_->cpu_data()[idx]) + << "debug: i " << i << " j " << j << " iter " << iter; + } } } } |