diff options
author | Evan Shelhamer <shelhamer@imaginarynumber.net> | 2015-01-25 23:06:57 -0800 |
---|---|---|
committer | Evan Shelhamer <shelhamer@imaginarynumber.net> | 2015-01-25 23:06:57 -0800 |
commit | 85a7626af342ece3720fe735ffa281f99c19d4da (patch) | |
tree | b4d358107d6760cd65bd6364ceaf04f7a1afa399 /matlab | |
parent | eb107449c46e92be2045d4b84ac06a77bab0ce23 (diff) | |
parent | c795095fd26767152e3ee6a3183cda5edd75c39b (diff) | |
download | caffeonacl-85a7626af342ece3720fe735ffa281f99c19d4da.tar.gz caffeonacl-85a7626af342ece3720fe735ffa281f99c19d4da.tar.bz2 caffeonacl-85a7626af342ece3720fe735ffa281f99c19d4da.zip |
Merge pull request #1746 from dj1989/mat_hdf5_demo
Matlab demo for Caffe-compatible HDF5 read/write
Diffstat (limited to 'matlab')
-rw-r--r-- | matlab/caffe/hdf5creation/.gitignore | 2 | ||||
-rw-r--r-- | matlab/caffe/hdf5creation/demo.m | 64 | ||||
-rw-r--r-- | matlab/caffe/hdf5creation/store2hdf5.m | 59 |
3 files changed, 125 insertions, 0 deletions
diff --git a/matlab/caffe/hdf5creation/.gitignore b/matlab/caffe/hdf5creation/.gitignore
new file mode 100644
index 00000000..e2333dd1
--- /dev/null
+++ b/matlab/caffe/hdf5creation/.gitignore
@@ -0,0 +1,2 @@
+*.h5
+list.txt
diff --git a/matlab/caffe/hdf5creation/demo.m b/matlab/caffe/hdf5creation/demo.m
new file mode 100644
index 00000000..f554b87e
--- /dev/null
+++ b/matlab/caffe/hdf5creation/demo.m
@@ -0,0 +1,64 @@
+%% WRITING TO HDF5
+filename='trial.h5';
+
+num_total_samples=10000;
+% to simulate data being read from disk / generated etc.
+data_disk=rand(5,5,1,num_total_samples);
+label_disk=rand(10,num_total_samples);
+
+chunksz=100;
+created_flag=false;
+totalct=0;
+for batchno=1:num_total_samples/chunksz % assumes chunksz divides num_total_samples; a partial last batch would be skipped
+  fprintf('batch no. %d\n', batchno);
+  last_read=(batchno-1)*chunksz;
+
+  % to simulate maximum data to be held in memory before dumping to hdf5 file
+  batchdata=data_disk(:,:,:,last_read+1:last_read+chunksz);
+  batchlabs=label_disk(:,last_read+1:last_read+chunksz);
+
+  % store to hdf5
+  startloc=struct('dat',[1,1,1,totalct+1], 'lab', [1,totalct+1]);
+  curr_dat_sz=store2hdf5(filename, batchdata, batchlabs, ~created_flag, startloc, chunksz);
+  created_flag=true;% flag set so that file is created only once
+  totalct=curr_dat_sz(end);% updated dataset size (#samples)
+end
+
+% display structure of the stored HDF5 file
+h5disp(filename);
+
+%% READING FROM HDF5
+
+% Read data and labels for samples #1000 to 1999
+data_rd=h5read(filename, '/data', [1 1 1 1000], [5, 5, 1, 1000]);
+label_rd=h5read(filename, '/label', [1 1000], [10, 1000]);
+fprintf('Testing ...\n');
+try
+  assert(isequal(data_rd, single(data_disk(:,:,:,1000:1999))), 'Data do not match');
+  assert(isequal(label_rd, single(label_disk(:,1000:1999))), 'Labels do not match');
+  fprintf('Success!\n');
+catch err
+  fprintf('Test failed ...\n');
+  getReport(err)
+end
+
+%delete(filename);
+
+% CREATE list.txt containing filename, to be used as source for HDF5_DATA_LAYER
+FILE=fopen('list.txt', 'w');
+assert(FILE~=-1, 'Could not open list.txt for writing');
+fprintf(FILE, '%s', filename);
+fclose(FILE);
+fprintf('HDF5 filename listed in %s \n', 'list.txt');
+
+% NOTE: In net definition prototxt, use list.txt as input to HDF5_DATA as:
+% layers {
+%   name: "data"
+%   type: HDF5_DATA
+%   top: "data"
+%   top: "labelvec"
+%   hdf5_data_param {
+%     source: "/path/to/list.txt"
+%     batch_size: 64
+%   }
+% }
diff --git a/matlab/caffe/hdf5creation/store2hdf5.m b/matlab/caffe/hdf5creation/store2hdf5.m
new file mode 100644
index 00000000..0a0016dc
--- /dev/null
+++ b/matlab/caffe/hdf5creation/store2hdf5.m
@@ -0,0 +1,59 @@
+function [curr_dat_sz, curr_lab_sz] = store2hdf5(filename, data, labels, create, startloc, chunksz)
+  % *data* is W*H*C*N matrix of images; it should be normalized (e.g. to lie between 0 and 1) beforehand
+  % *labels* is D*N matrix of labels (D labels per sample)
+  % *create* [0/1] specifies whether to create file newly or to append to previously created file, useful to store information in batches when a dataset is too big to be held in memory (default: 1)
+  % *startloc* (point at which to start writing data). By default,
+  % if create=1 (create mode), startloc.dat=[1 1 1 1], and startloc.lab=[1 1];
+  % if create=0 (append mode), startloc.dat=[1 1 1 K+1], and startloc.lab = [1 K+1]; where K is the current number of samples stored in the HDF5 file
+  % *chunksz* (used only in create mode), specifies number of samples to be stored per chunk (see HDF5 documentation on chunking) for creating HDF5 files with unbounded maximum size - TLDR; higher chunk sizes allow faster read-write operations
+
+  % verify that format is right (NOTE: size() drops trailing singleton dimensions, so a 4-D batch holding a single sample, N=1, is misread here)
+  dat_dims=size(data);
+  lab_dims=size(labels);
+  num_samples=dat_dims(end);
+  assert(lab_dims(end)==num_samples, 'Number of samples should be matched between data and labels');
+
+  if ~exist('create','var')
+    create=true;
+  end
+
+  if create
+    %fprintf('Creating dataset with %d samples\n', num_samples);
+    if ~exist('chunksz', 'var')
+      chunksz=1000;
+    end
+    if exist(filename, 'file')
+      fprintf('Warning: replacing existing file %s \n', filename);
+      delete(filename);
+    end
+    h5create(filename, '/data', [dat_dims(1:end-1) Inf], 'Datatype', 'single', 'ChunkSize', [dat_dims(1:end-1) chunksz]); % width, height, channels, number
+    h5create(filename, '/label', [lab_dims(1:end-1) Inf], 'Datatype', 'single', 'ChunkSize', [lab_dims(1:end-1) chunksz]); % labels per sample, number
+    if ~exist('startloc','var')
+      startloc.dat=[ones(1,length(dat_dims)-1), 1];
+      startloc.lab=[ones(1,length(lab_dims)-1), 1];
+    end
+  else  % append mode
+    if ~exist('startloc','var')
+      dat_info=h5info(filename, '/data'); % query datasets by name rather than by their position in the file listing
+      lab_info=h5info(filename, '/label');
+      prev_dat_sz=dat_info.Dataspace.Size;
+      prev_lab_sz=lab_info.Dataspace.Size;
+      assert(isequal(prev_dat_sz(1:end-1), dat_dims(1:end-1)), 'Data dimensions must match existing dimensions in dataset'); % isequal yields a scalar even when ranks differ
+      assert(isequal(prev_lab_sz(1:end-1), lab_dims(1:end-1)), 'Label dimensions must match existing dimensions in dataset');
+      startloc.dat=[ones(1,length(dat_dims)-1), prev_dat_sz(end)+1];
+      startloc.lab=[ones(1,length(lab_dims)-1), prev_lab_sz(end)+1];
+    end
+  end
+
+  if ~isempty(data)
+    h5write(filename, '/data', single(data), startloc.dat, size(data));
+    h5write(filename, '/label', single(labels), startloc.lab, size(labels));
+  end
+
+  if nargout
+    dat_info=h5info(filename, '/data'); % re-read sizes after the write so callers get the updated sample count
+    lab_info=h5info(filename, '/label');
+    curr_dat_sz=dat_info.Dataspace.Size;
+    curr_lab_sz=lab_info.Dataspace.Size;
+  end
+end