summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Sergey Karayev <sergeykarayev@gmail.com> 2014-03-16 19:55:59 -0700
committer Sergey Karayev <sergeykarayev@gmail.com> 2014-03-16 19:55:59 -0700
commit 1fce1c424de0ef9dde54432e7053c9648e0bfa83 (patch)
tree c8c15c9c4eb44e4fff32f4509ea20bc560eb8616 /src
parent 6ec66f33fc45bbddc2cc04be5248384a25bf9f60 (diff)
download caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.tar.gz
caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.tar.bz2
caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.zip
HDF5DataLayer source is now a list of filenames
Diffstat (limited to 'src')
-rw-r--r-- src/caffe/layers/hdf5_data_layer.cpp 38
-rw-r--r-- src/caffe/layers/hdf5_data_layer.cu 10
-rw-r--r-- src/caffe/test/test_data/generate_sample_data.py 30
-rw-r--r-- src/caffe/test/test_data/sample_data.h5 bin 10184 -> 10184 bytes
-rw-r--r-- src/caffe/test/test_data/sample_data_2_gzip.h5 bin 0 -> 9992 bytes
-rw-r--r-- src/caffe/test/test_data/sample_data_list.txt 2
-rw-r--r-- src/caffe/test/test_hdf5data_layer.cpp 95
7 files changed, 105 insertions, 70 deletions
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 5ac594b1..7f993a6c 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -4,11 +4,14 @@ Contributors:
- Tobias Domhan, 2014.
TODO:
-- only load parts of the file, in accordance with a prototxt param "max_mem"
+- load file in a separate thread ("prefetch")
+- can be smarter about the memcpy call instead of doing it row-by-row
*/
#include <stdint.h>
#include <string>
#include <vector>
+#include <iostream>
+#include <fstream>
#include "hdf5.h"
#include "hdf5_hl.h"
@@ -61,9 +64,23 @@ void HDF5DataLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
CHECK_EQ(bottom.size(), 0) << "HDF5DataLayer takes no input blobs.";
CHECK_EQ(top->size(), 2) << "HDF5DataLayer takes two blobs as output.";
- // Load the HDF5 file and initialize the counter.
- const char* hdf_filename = this->layer_param_.source().c_str();
- load_hdf5_file(hdf_filename);
+ // Read the source to parse the filenames.
+ LOG(INFO) << "Loading filename from " << this->layer_param_.source();
+ hdf_filenames_.clear();
+ std::ifstream myfile(this->layer_param_.source().c_str());
+ if (myfile.is_open()) {
+ string line = "";
+ while (myfile >> line) {
+ hdf_filenames_.push_back(line);
+ }
+ }
+ myfile.close();
+ num_files_ = hdf_filenames_.size();
+ current_file_ = 0;
+ LOG(INFO) << "Number of files: " << num_files_;
+
+ // Load the first HDF5 file and initialize the line counter.
+ load_hdf5_file(hdf_filenames_[current_file_].c_str());
current_row_ = 0;
// Reshape blobs.
@@ -83,10 +100,18 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const int data_count = (*top)[0]->count() / (*top)[0]->num();
const int label_data_count = (*top)[1]->count() / (*top)[1]->num();
- //TODO: consolidate into a single memcpy call
-
for (int i = 0; i < batchsize; ++i, ++current_row_) {
if (current_row_ == data_dims_[0]) {
+ if (num_files_ > 1) {
+ current_file_ += 1;
+
+ if (current_file_ == num_files_) {
+ current_file_ = 0;
+ LOG(INFO) << "looping around to first file";
+ }
+
+ load_hdf5_file(hdf_filenames_[current_file_].c_str());
+ }
current_row_ = 0;
}
@@ -100,7 +125,6 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
}
}
-
// The backward operations are dummy - they do not carry any computation.
template <typename Dtype>
Dtype HDF5DataLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu
index 773ebc39..7a31a603 100644
--- a/src/caffe/layers/hdf5_data_layer.cu
+++ b/src/caffe/layers/hdf5_data_layer.cu
@@ -28,6 +28,16 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
for (int i = 0; i < batchsize; ++i, ++current_row_) {
if (current_row_ == data_dims_[0]) {
+ if (num_files_ > 1) {
+ current_file_ += 1;
+
+ if (current_file_ == num_files_) {
+ current_file_ = 0;
+ LOG(INFO) << "looping around to first file";
+ }
+
+ load_hdf5_file(hdf_filenames_[current_file_].c_str());
+ }
current_row_ = 0;
}
diff --git a/src/caffe/test/test_data/generate_sample_data.py b/src/caffe/test/test_data/generate_sample_data.py
index ab55305b..0d8f5aa9 100644
--- a/src/caffe/test/test_data/generate_sample_data.py
+++ b/src/caffe/test/test_data/generate_sample_data.py
@@ -1,7 +1,7 @@
"""
Generate data used in the HDF5DataLayer test.
"""
-
+import os
import numpy as np
import h5py
@@ -9,11 +9,31 @@ num_cols = 8
num_rows = 10
height = 5
width = 5
-data = np.arange(num_cols * num_rows * height * width).reshape(num_rows, num_cols, height, width)
+total_size = num_cols * num_rows * height * width
+
+data = np.arange(total_size)
+data = data.reshape(num_rows, num_cols, height, width)
+data = data.astype('float32')
label = np.arange(num_rows)[:, np.newaxis]
+label = label.astype('float32')
+
print data
print label
-with h5py.File('./sample_data.h5', 'w') as f:
- f['data'] = data.astype('float32')
- f['label'] = label.astype('float32')
+with h5py.File(os.path.dirname(__file__) + '/sample_data.h5', 'w') as f:
+ f['data'] = data
+ f['label'] = label
+
+with h5py.File(os.path.dirname(__file__) + '/sample_data_2_gzip.h5', 'w') as f:
+ f.create_dataset(
+ 'data', data=data + total_size,
+ compression='gzip', compression_opts=1
+ )
+ f.create_dataset(
+ 'label', data=label,
+ compression='gzip', compression_opts=1
+ )
+
+with open(os.path.dirname(__file__) + '/sample_data_list.txt', 'w') as f:
+ f.write(os.path.dirname(__file__) + '/sample_data.h5\n')
+ f.write(os.path.dirname(__file__) + '/sample_data_2_gzip.h5\n')
diff --git a/src/caffe/test/test_data/sample_data.h5 b/src/caffe/test/test_data/sample_data.h5
index db245bac..a1f923a7 100644
--- a/src/caffe/test/test_data/sample_data.h5
+++ b/src/caffe/test/test_data/sample_data.h5
Binary files differ
diff --git a/src/caffe/test/test_data/sample_data_2_gzip.h5 b/src/caffe/test/test_data/sample_data_2_gzip.h5
new file mode 100644
index 00000000..56c0a740
--- /dev/null
+++ b/src/caffe/test/test_data/sample_data_2_gzip.h5
Binary files differ
diff --git a/src/caffe/test/test_data/sample_data_list.txt b/src/caffe/test/test_data/sample_data_list.txt
new file mode 100644
index 00000000..cdf343fc
--- /dev/null
+++ b/src/caffe/test/test_data/sample_data_list.txt
@@ -0,0 +1,2 @@
+src/caffe/test/test_data/sample_data.h5
+src/caffe/test/test_data/sample_data_2_gzip.h5
diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp
index d4f3c4a7..59aee0c5 100644
--- a/src/caffe/test/test_hdf5data_layer.cpp
+++ b/src/caffe/test/test_hdf5data_layer.cpp
@@ -31,10 +31,8 @@ class HDF5DataLayerTest : public ::testing::Test {
blob_top_vec_.push_back(blob_top_data_);
blob_top_vec_.push_back(blob_top_label_);
- // TODO: generate sample HDF5 file on the fly.
- // For now, use example HDF5 file.
- // TODO: how to best deal with the relativeness of the path?
- filename = new string("src/caffe/test/test_data/sample_data.h5");
+ // Check out generate_sample_data.py in the same directory.
+ filename = new string("src/caffe/test/test_data/sample_data_list.txt");
LOG(INFO) << "Using sample HDF5 data file " << filename;
}
@@ -80,62 +78,43 @@ TYPED_TEST(HDF5DataLayerTest, TestRead) {
EXPECT_EQ(this->blob_top_label_->height(), 1);
EXPECT_EQ(this->blob_top_label_->width(), 1);
- const int data_size = num_cols * height * width;
-
- // Go through the data 100 times.
- for (int iter = 0; iter < 100; ++iter) {
- layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_);
-
- // On even iterations, we're reading the first half of the data.
- // On odd iterations, we're reading the second half of the data.
- int label_offset = (iter % 2 == 0) ? 0 : batchsize;
- int data_offset = (iter % 2 == 0) ? 0 : batchsize * data_size;
-
- for (int i = 0; i < batchsize; ++i) {
- EXPECT_EQ(
- label_offset + i,
- this->blob_top_label_->cpu_data()[i]);
- }
- for (int i = 0; i < batchsize; ++i) {
- for (int j = 0; j < num_cols; ++j) {
- for (int h = 0; h < height; ++h) {
- for (int w = 0; w < width; ++w) {
- int idx = i * num_cols * height * width + j * height * width + h * width + w;
- EXPECT_EQ(
- data_offset + idx,
- this->blob_top_data_->cpu_data()[idx])
- << "debug: i " << i << " j " << j;
- }
- }
- }
+ for (int t=0; t<2; ++t) {
+ if (t == 0) {
+ Caffe::set_mode(Caffe::CPU);
+ } else {
+ Caffe::set_mode(Caffe::GPU);
}
- }
- // Exact same test in GPU mode.
- Caffe::set_mode(Caffe::GPU);
- // Go through the data 100 times.
- for (int iter = 0; iter < 100; ++iter) {
- layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_);
-
- // On even iterations, we're reading the first half of the data.
- // On odd iterations, we're reading the second half of the data.
- int label_offset = (iter % 2 == 0) ? 0 : batchsize;
- int data_offset = (iter % 2 == 0) ? 0 : batchsize * data_size;
-
- for (int i = 0; i < batchsize; ++i) {
- EXPECT_EQ(
- label_offset + i,
- this->blob_top_label_->cpu_data()[i]);
- }
- for (int i = 0; i < batchsize; ++i) {
- for (int j = 0; j < num_cols; ++j) {
- for (int h = 0; h < height; ++h) {
- for (int w = 0; w < width; ++w) {
- int idx = i * num_cols * height * width + j * height * width + h * width + w;
- EXPECT_EQ(
- data_offset + idx,
- this->blob_top_data_->cpu_data()[idx])
- << "debug: i " << i << " j " << j;
+ // Go through the data 100 times (50 batches).
+ const int data_size = num_cols * height * width;
+ for (int iter = 0; iter < 100; ++iter) {
+ layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_);
+
+ // On even iterations, we're reading the first half of the data.
+ // On odd iterations, we're reading the second half of the data.
+ int label_offset = (iter % 2 == 0) ? 0 : batchsize;
+ int data_offset = (iter % 2 == 0) ? 0 : batchsize * data_size;
+
+ // Every two iterations we are reading the second file,
+ // which has the same labels, but data is offset by total data size,
+ // which is 2000 (see generate_sample_data).
+ int file_offset = (iter % 4 < 2) ? 0 : 2000;
+
+ for (int i = 0; i < batchsize; ++i) {
+ EXPECT_EQ(
+ label_offset + i,
+ this->blob_top_label_->cpu_data()[i]);
+ }
+ for (int i = 0; i < batchsize; ++i) {
+ for (int j = 0; j < num_cols; ++j) {
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ int idx = i * num_cols * height * width + j * height * width + h * width + w;
+ EXPECT_EQ(
+ file_offset + data_offset + idx,
+ this->blob_top_data_->cpu_data()[idx])
+ << "debug: i " << i << " j " << j << " iter " << iter;
+ }
}
}
}