HDF5DataLayer source is now a list of filenames

author: Sergey Karayev <sergeykarayev@gmail.com> 2014-03-16 19:55:59 -0700
committer: Sergey Karayev <sergeykarayev@gmail.com> 2014-03-16 19:55:59 -0700
commit: 1fce1c424de0ef9dde54432e7053c9648e0bfa83 (patch)
tree: c8c15c9c4eb44e4fff32f4509ea20bc560eb8616 /src
parent: 6ec66f33fc45bbddc2cc04be5248384a25bf9f60 (diff)
download: caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.tar.gz
caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.tar.bz2
caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.zip
7 files changed, 105 insertions, 70 deletions
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 5ac594b1..7f993a6c 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -4,11 +4,14 @@ Contributors:
 - Tobias Domhan, 2014.
 
 TODO:
-- only load parts of the file, in accordance with a prototxt param "max_mem"
+- load file in a separate thread ("prefetch")
+- can be smarter about the memcpy call instead of doing it row-by-row
 */
 #include <stdint.h>
 #include <string>
 #include <vector>
+#include <iostream>
+#include <fstream>
 
 #include "hdf5.h"
 #include "hdf5_hl.h"
@@ -61,9 +64,23 @@ void HDF5DataLayer<Dtype>::SetUp(const vector<Blob<Dtype>*>& bottom,
   CHECK_EQ(bottom.size(), 0) << "HDF5DataLayer takes no input blobs.";
   CHECK_EQ(top->size(), 2) << "HDF5DataLayer takes two blobs as output.";
 
-  // Load the HDF5 file and initialize the counter.
-  const char* hdf_filename = this->layer_param_.source().c_str();
-  load_hdf5_file(hdf_filename);
+  // Read the source to parse the filenames.
+  LOG(INFO) << "Loading filename from " << this->layer_param_.source();
+  hdf_filenames_.clear();
+  std::ifstream myfile(this->layer_param_.source().c_str());
+  if (myfile.is_open()) {
+    string line = "";
+    while (myfile >> line) {
+      hdf_filenames_.push_back(line);
+    }
+  }
+  myfile.close();
+  num_files_ = hdf_filenames_.size();
+  current_file_ = 0;
+  LOG(INFO) << "Number of files: " << num_files_;
+
+  // Load the first HDF5 file and initialize the row counter.
+  load_hdf5_file(hdf_filenames_[current_file_].c_str());
   current_row_ = 0;
 
   // Reshape blobs.
@@ -83,10 +100,18 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   const int data_count = (*top)[0]->count() / (*top)[0]->num();
   const int label_data_count = (*top)[1]->count() / (*top)[1]->num();
 
-  //TODO: consolidate into a single memcpy call
-
   for (int i = 0; i < batchsize; ++i, ++current_row_) {
     if (current_row_ == data_dims_[0]) {
+      if (num_files_ > 1) {
+        current_file_ += 1;
+
+        if (current_file_ == num_files_) {
+          current_file_ = 0;
+          LOG(INFO) << "looping around to first file";
+        }
+
+        load_hdf5_file(hdf_filenames_[current_file_].c_str());
+      }
       current_row_ = 0;
     }
 
@@ -100,7 +125,6 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   }
 }
 
-
 // The backward operations are dummy - they do not carry any computation.
 template <typename Dtype>
 Dtype HDF5DataLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu
index 773ebc39..7a31a603 100644
--- a/src/caffe/layers/hdf5_data_layer.cu
+++ b/src/caffe/layers/hdf5_data_layer.cu
@@ -28,6 +28,16 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 
   for (int i = 0; i < batchsize; ++i, ++current_row_) {
     if (current_row_ == data_dims_[0]) {
+      if (num_files_ > 1) {
+        current_file_ += 1;
+
+        if (current_file_ == num_files_) {
+          current_file_ = 0;
+          LOG(INFO) << "looping around to first file";
+        }
+
+        load_hdf5_file(hdf_filenames_[current_file_].c_str());
+      }
       current_row_ = 0;
     }
 
diff --git a/src/caffe/test/test_data/generate_sample_data.py b/src/caffe/test/test_data/generate_sample_data.py
index ab55305b..0d8f5aa9 100644
--- a/src/caffe/test/test_data/generate_sample_data.py
+++ b/src/caffe/test/test_data/generate_sample_data.py
@@ -1,7 +1,7 @@
 """
 Generate data used in the HDF5DataLayer test.
 """
-
+import os
 import numpy as np
 import h5py
 
@@ -9,11 +9,31 @@ num_cols = 8
 num_rows = 10
 height = 5
 width = 5
-data = np.arange(num_cols * num_rows * height * width).reshape(num_rows, num_cols, height, width)
+total_size = num_cols * num_rows * height * width
+
+data = np.arange(total_size)
+data = data.reshape(num_rows, num_cols, height, width)
+data = data.astype('float32')
 label = np.arange(num_rows)[:, np.newaxis]
+label = label.astype('float32')
+
 print data
 print label
 
-with h5py.File('./sample_data.h5', 'w') as f:
-    f['data'] = data.astype('float32')
-    f['label'] = label.astype('float32')
+with h5py.File(os.path.dirname(__file__) + '/sample_data.h5', 'w') as f:
+    f['data'] = data
+    f['label'] = label
+
+with h5py.File(os.path.dirname(__file__) + '/sample_data_2_gzip.h5', 'w') as f:
+    f.create_dataset(
+        'data', data=data + total_size,
+        compression='gzip', compression_opts=1
+    )
+    f.create_dataset(
+        'label', data=label,
+        compression='gzip', compression_opts=1
+    )
+
+with open(os.path.dirname(__file__) + '/sample_data_list.txt', 'w') as f:
+    f.write(os.path.dirname(__file__) + '/sample_data.h5\n')
+    f.write(os.path.dirname(__file__) + '/sample_data_2_gzip.h5\n')
diff --git a/src/caffe/test/test_data/sample_data.h5 b/src/caffe/test/test_data/sample_data.h5
index db245bac..a1f923a7 100644
--- a/src/caffe/test/test_data/sample_data.h5
+++ b/src/caffe/test/test_data/sample_data.h5
diff --git a/src/caffe/test/test_data/sample_data_2_gzip.h5 b/src/caffe/test/test_data/sample_data_2_gzip.h5
new file mode 100644
index 00000000..56c0a740
--- /dev/null
+++ b/src/caffe/test/test_data/sample_data_2_gzip.h5
diff --git a/src/caffe/test/test_data/sample_data_list.txt b/src/caffe/test/test_data/sample_data_list.txt
new file mode 100644
index 00000000..cdf343fc
--- /dev/null
+++ b/src/caffe/test/test_data/sample_data_list.txt
@@ -0,0 +1,2 @@
+src/caffe/test/test_data/sample_data.h5
+src/caffe/test/test_data/sample_data_2_gzip.h5
diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp
index d4f3c4a7..59aee0c5 100644
--- a/src/caffe/test/test_hdf5data_layer.cpp
+++ b/src/caffe/test/test_hdf5data_layer.cpp
@@ -31,10 +31,8 @@ class HDF5DataLayerTest : public ::testing::Test {
     blob_top_vec_.push_back(blob_top_data_);
     blob_top_vec_.push_back(blob_top_label_);
 
-    // TODO: generate sample HDF5 file on the fly.
-    // For now, use example HDF5 file.
-    // TODO: how to best deal with the relativeness of the path?
-    filename = new string("src/caffe/test/test_data/sample_data.h5");
+    // Check out generate_sample_data.py in the same directory.
+    filename = new string("src/caffe/test/test_data/sample_data_list.txt");
     LOG(INFO) << "Using sample HDF5 data file " << filename;
   }
 
@@ -80,62 +78,43 @@ TYPED_TEST(HDF5DataLayerTest, TestRead) {
   EXPECT_EQ(this->blob_top_label_->height(), 1);
   EXPECT_EQ(this->blob_top_label_->width(), 1);
 
-  const int data_size = num_cols * height * width;
-
-  // Go through the data 100 times.
-  for (int iter = 0; iter < 100; ++iter) {
-    layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_);
-
-    // On even iterations, we're reading the first half of the data.
-    // On odd iterations, we're reading the second half of the data.
-    int label_offset = (iter % 2 == 0) ? 0 : batchsize;
-    int data_offset = (iter % 2 == 0) ? 0 : batchsize * data_size;
-
-    for (int i = 0; i < batchsize; ++i) {
-      EXPECT_EQ(
-        label_offset + i,
-        this->blob_top_label_->cpu_data()[i]);
-    }
-    for (int i = 0; i < batchsize; ++i) {
-      for (int j = 0; j < num_cols; ++j) {
-        for (int h = 0; h < height; ++h) {
-          for (int w = 0; w < width; ++w) {
-            int idx = i * num_cols * height * width + j * height * width + h * width + w;
-            EXPECT_EQ(
-              data_offset + idx,
-              this->blob_top_data_->cpu_data()[idx])
-              << "debug: i " << i << " j " << j;
-          }
-        }
-      }
+  for (int t=0; t<2; ++t) {
+    if (t == 0) {
+      Caffe::set_mode(Caffe::CPU);
+    } else {
+      Caffe::set_mode(Caffe::GPU);
     }
-  }
 
-  // Exact same test in GPU mode.
-  Caffe::set_mode(Caffe::GPU);
-  // Go through the data 100 times.
-  for (int iter = 0; iter < 100; ++iter) {
-    layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_);
-
-    // On even iterations, we're reading the first half of the data.
-    // On odd iterations, we're reading the second half of the data.
-    int label_offset = (iter % 2 == 0) ? 0 : batchsize;
-    int data_offset = (iter % 2 == 0) ? 0 : batchsize * data_size;
-
-    for (int i = 0; i < batchsize; ++i) {
-      EXPECT_EQ(
-        label_offset + i,
-        this->blob_top_label_->cpu_data()[i]);
-    }
-    for (int i = 0; i < batchsize; ++i) {
-      for (int j = 0; j < num_cols; ++j) {
-        for (int h = 0; h < height; ++h) {
-          for (int w = 0; w < width; ++w) {
-            int idx = i * num_cols * height * width + j * height * width + h * width + w;
-            EXPECT_EQ(
-              data_offset + idx,
-              this->blob_top_data_->cpu_data()[idx])
-              << "debug: i " << i << " j " << j;
+    // Run 100 batches: 2 per file, so 25 passes over the two-file list.
+    const int data_size = num_cols * height * width;
+    for (int iter = 0; iter < 100; ++iter) {
+      layer.Forward(this->blob_bottom_vec_, &this->blob_top_vec_);
+
+      // On even iterations, we're reading the first half of the data.
+      // On odd iterations, we're reading the second half of the data.
+      int label_offset = (iter % 2 == 0) ? 0 : batchsize;
+      int data_offset = (iter % 2 == 0) ? 0 : batchsize * data_size;
+
+      // Files alternate every two iterations: iter % 4 >= 2 reads the second
+      // file, which has the same labels but data offset by the total data
+      // size of 2000 (see generate_sample_data.py).
+      int file_offset = (iter % 4 < 2) ? 0 : 2000;
+
+      for (int i = 0; i < batchsize; ++i) {
+        EXPECT_EQ(
+          label_offset + i,
+          this->blob_top_label_->cpu_data()[i]);
+      }
+      for (int i = 0; i < batchsize; ++i) {
+        for (int j = 0; j < num_cols; ++j) {
+          for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; ++w) {
+              int idx = i * num_cols * height * width + j * height * width + h * width + w;
+              EXPECT_EQ(
+                file_offset + data_offset + idx,
+                this->blob_top_data_->cpu_data()[idx])
+                << "debug: i " << i << " j " << j << " iter " << iter;
+            }
           }
         }
       }
author	Sergey Karayev <sergeykarayev@gmail.com>	2014-03-16 19:55:59 -0700
committer	Sergey Karayev <sergeykarayev@gmail.com>	2014-03-16 19:55:59 -0700
commit	1fce1c424de0ef9dde54432e7053c9648e0bfa83 (patch)
tree	c8c15c9c4eb44e4fff32f4509ea20bc560eb8616 /src
parent	6ec66f33fc45bbddc2cc04be5248384a25bf9f60 (diff)
download	caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.tar.gz caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.tar.bz2 caffeonacl-1fce1c424de0ef9dde54432e7053c9648e0bfa83.zip