summaryrefslogtreecommitdiff
path: root/matlab
diff options
context:
space:
mode:
author: Dinesh Jayaraman <dineshj@cs.utexas.edu> 2015-01-17 18:23:48 -0600
committer: Dinesh Jayaraman <dineshj@cs.utexas.edu> 2015-01-17 18:23:48 -0600
commit: c795095fd26767152e3ee6a3183cda5edd75c39b (patch)
tree: e8a93a57ba5d54a54197d08dbd657cb0d74d13f7 /matlab
parent: f476e9014830a502b0b01e18b3936021f7b2fb33 (diff)
download: caffeonacl-c795095fd26767152e3ee6a3183cda5edd75c39b.tar.gz
caffeonacl-c795095fd26767152e3ee6a3183cda5edd75c39b.tar.bz2
caffeonacl-c795095fd26767152e3ee6a3183cda5edd75c39b.zip
Matlab demo for Caffe-compatible HDF5 read/write
Diffstat (limited to 'matlab')
-rw-r--r-- matlab/caffe/hdf5creation/.gitignore | 2
-rw-r--r-- matlab/caffe/hdf5creation/demo.m | 64
-rw-r--r-- matlab/caffe/hdf5creation/store2hdf5.m | 59
3 files changed, 125 insertions, 0 deletions
diff --git a/matlab/caffe/hdf5creation/.gitignore b/matlab/caffe/hdf5creation/.gitignore
new file mode 100644
index 00000000..e2333dd1
--- /dev/null
+++ b/matlab/caffe/hdf5creation/.gitignore
@@ -0,0 +1,2 @@
+*.h5
+list.txt
diff --git a/matlab/caffe/hdf5creation/demo.m b/matlab/caffe/hdf5creation/demo.m
new file mode 100644
index 00000000..f554b87e
--- /dev/null
+++ b/matlab/caffe/hdf5creation/demo.m
@@ -0,0 +1,64 @@
+%% WRITING TO HDF5
+% Demo: write random data/labels to an HDF5 file in batches, read a slice back to verify it, then list the file in list.txt.
+filename='trial.h5';
+
+num_total_samples=10000;
+% to simulate data being read from disk / generated etc.
+data_disk=rand(5,5,1,num_total_samples);
+label_disk=rand(10,num_total_samples);
+
+chunksz=100;
+created_flag=false;
+totalct=0;
+% only whole batches are written; a remainder smaller than chunksz is skipped
+for batchno=1:floor(num_total_samples/chunksz)
+    fprintf('batch no. %d\n', batchno);
+    last_read=(batchno-1)*chunksz;
+    % to simulate maximum data to be held in memory before dumping to hdf5 file
+    batchdata=data_disk(:,:,1,last_read+1:last_read+chunksz);
+    batchlabs=label_disk(:,last_read+1:last_read+chunksz);
+    % store to hdf5 (the file is created on the first batch and appended to afterwards)
+    startloc=struct('dat',[1,1,1,totalct+1], 'lab', [1,totalct+1]);
+    curr_dat_sz=store2hdf5(filename, batchdata, batchlabs, ~created_flag, startloc, chunksz);
+    created_flag=true;% flag set so that file is created only once
+    totalct=curr_dat_sz(end);% updated dataset size (#samples)
+end
+
+% display structure of the stored HDF5 file
+h5disp(filename);
+
+%% READING FROM HDF5
+% Read data and labels for samples #1000 to 1999 (h5read arguments are start, count)
+data_rd=h5read(filename, '/data', [1 1 1 1000], [5, 5, 1, 1000]);
+label_rd=h5read(filename, '/label', [1 1000], [10, 1000]);
+fprintf('Testing ...\n');
+try
+    % store2hdf5 writes both datasets as single, so compare against single-cast originals
+    assert(isequal(data_rd, single(data_disk(:,:,:,1000:1999))), 'Data do not match');
+    assert(isequal(label_rd, single(label_disk(:,1000:1999))), 'Labels do not match');
+    fprintf('Success!\n');
+catch err
+    fprintf('Test failed ...\n');
+    getReport(err)
+end
+
+%delete(filename);
+
+% CREATE list.txt containing filename, to be used as source for HDF5_DATA_LAYER
+[FILE, errmsg]=fopen('list.txt', 'w');
+% fopen returns -1 on failure; stop here with the system message instead of failing later inside fprintf
+assert(FILE~=-1, 'Could not open list.txt for writing: %s', errmsg);
+fprintf(FILE, '%s', filename);
+fclose(FILE);
+fprintf('HDF5 filename listed in %s \n', 'list.txt');
+
+% NOTE: In net definition prototxt, use list.txt as input to HDF5_DATA as:
+% layers {
+% name: "data"
+% type: HDF5_DATA
+% top: "data"
+% top: "labelvec"
+% hdf5_data_param {
+% source: "/path/to/list.txt"
+% batch_size: 64
+% }
+% }
diff --git a/matlab/caffe/hdf5creation/store2hdf5.m b/matlab/caffe/hdf5creation/store2hdf5.m
new file mode 100644
index 00000000..0a0016dc
--- /dev/null
+++ b/matlab/caffe/hdf5creation/store2hdf5.m
@@ -0,0 +1,59 @@
+function [curr_dat_sz, curr_lab_sz] = store2hdf5(filename, data, labels, create, startloc, chunksz)
+    % Write (create mode) or append a batch of samples to the '/data' and '/label' datasets (stored as single) of an HDF5 file; returns the resulting sizes of both datasets when outputs are requested.
+    % *data* is a W*H*C*N matrix of images; it should be normalized (e.g. to lie between 0 and 1) beforehand. NOTE: N is taken from size(data), which drops trailing singleton dimensions, so a single-sample 4-D batch is not seen as N=1
+    % *labels* is a D*N matrix of labels (D labels per sample)
+    % *create* [0/1] specifies whether to create file newly (an existing file is deleted) or to append to previously created file, useful to store information in batches when a dataset is too big to be held in memory (default: 1)
+    % *startloc* (point at which to start writing data). By default,
+    % if create=1 (create mode), startloc.dat=[1 1 1 1], and startloc.lab=[1 1];
+    % if create=0 (append mode), startloc.dat=[1 1 1 K+1], and startloc.lab = [1 K+1]; where K is the current number of samples stored in the HDF5 file
+    % *chunksz* (used only in create mode, default: 1000), specifies number of samples to be stored per chunk (see HDF5 documentation on chunking) for creating HDF5 files with unbounded maximum size - TLDR; higher chunk sizes allow faster read-write operations
+
+    % verify that format is right
+    dat_dims=size(data);
+    lab_dims=size(labels);
+    num_samples=dat_dims(end);
+    assert(lab_dims(end)==num_samples, 'Number of samples should be matched between data and labels');
+
+    if ~exist('create','var')
+        create=true;
+    end
+
+    if create
+        if ~exist('chunksz', 'var')
+            chunksz=1000;
+        end
+        if exist(filename, 'file')
+            fprintf('Warning: replacing existing file %s \n', filename);
+            delete(filename);
+        end
+        h5create(filename, '/data', [dat_dims(1:end-1) Inf], 'Datatype', 'single', 'ChunkSize', [dat_dims(1:end-1) chunksz]); % width, height, channels, number
+        h5create(filename, '/label', [lab_dims(1:end-1) Inf], 'Datatype', 'single', 'ChunkSize', [lab_dims(1:end-1) chunksz]); % labels per sample, number
+        if ~exist('startloc','var')
+            startloc.dat=[ones(1,length(dat_dims)-1), 1];
+            startloc.lab=[ones(1,length(lab_dims)-1), 1];
+        end
+    else % append mode
+        if ~exist('startloc','var')
+            dat_info=h5info(filename, '/data'); % datasets are looked up by name, not by their position in the file
+            lab_info=h5info(filename, '/label');
+            prev_dat_sz=dat_info.Dataspace.Size;
+            prev_lab_sz=lab_info.Dataspace.Size;
+            assert(isequal(prev_dat_sz(1:end-1), dat_dims(1:end-1)), 'Data dimensions must match existing dimensions in dataset'); % isequal gives the scalar logical assert needs, even when the vectors differ in length
+            assert(isequal(prev_lab_sz(1:end-1), lab_dims(1:end-1)), 'Label dimensions must match existing dimensions in dataset');
+            startloc.dat=[ones(1,length(dat_dims)-1), prev_dat_sz(end)+1];
+            startloc.lab=[ones(1,length(lab_dims)-1), prev_lab_sz(end)+1];
+        end
+    end
+
+    if ~isempty(data)
+        h5write(filename, '/data', single(data), startloc.dat, size(data));
+        h5write(filename, '/label', single(labels), startloc.lab, size(labels));
+    end
+
+    if nargout
+        dat_info=h5info(filename, '/data');
+        lab_info=h5info(filename, '/label');
+        curr_dat_sz=dat_info.Dataspace.Size;
+        curr_lab_sz=lab_info.Dataspace.Size;
+    end
+end