author    Sergey Karayev <sergeykarayev@gmail.com>  2014-09-04 02:50:53 +0100
committer Sergey Karayev <sergeykarayev@gmail.com>  2014-09-04 03:59:14 +0100
commit    c6827bf3fcd8bf6cafb2aeb7a1ce4d3a958b7f19 (patch)
tree      8dedeae1ad6e6af3be84a4bab3bb6d1682cad251 /examples
parent    bc601e9060cc71b95f0250b9efacaffcbb5d8c0e (diff)
flickr style fine-tuning model (separated from example read me)
Diffstat (limited to 'examples')
-rw-r--r--  examples/finetune_flickr_style/flickr_style_solver.prototxt    |  17
-rw-r--r--  examples/finetune_flickr_style/flickr_style_train_val.prototxt | 349
-rw-r--r--  examples/finetune_flickr_style/readme.md                       |  16
3 files changed, 11 insertions, 371 deletions
diff --git a/examples/finetune_flickr_style/flickr_style_solver.prototxt b/examples/finetune_flickr_style/flickr_style_solver.prototxt
deleted file mode 100644
index 756e162b..00000000
--- a/examples/finetune_flickr_style/flickr_style_solver.prototxt
+++ /dev/null
@@ -1,17 +0,0 @@
-net: "examples/finetune_flickr_style/flickr_style_train_val.prototxt"
-test_iter: 100
-test_interval: 1000
-# lr for fine-tuning should be lower than when starting from scratch
-base_lr: 0.001
-lr_policy: "step"
-gamma: 0.1
-# stepsize should also be lower, as we're closer to being done
-stepsize: 20000
-display: 20
-max_iter: 100000
-momentum: 0.9
-weight_decay: 0.0005
-snapshot: 10000
-snapshot_prefix: "examples/finetune_flickr_style/flickr_style"
-# uncomment the following to default to CPU mode solving
-# solver_mode: CPU
diff --git a/examples/finetune_flickr_style/flickr_style_train_val.prototxt b/examples/finetune_flickr_style/flickr_style_train_val.prototxt
deleted file mode 100644
index 46a198a8..00000000
--- a/examples/finetune_flickr_style/flickr_style_train_val.prototxt
+++ /dev/null
@@ -1,349 +0,0 @@
-name: "FlickrStyleCaffeNet"
-layers {
- name: "data"
- type: IMAGE_DATA
- top: "data"
- top: "label"
- image_data_param {
- source: "data/flickr_style/train.txt"
- batch_size: 50
- new_height: 256
- new_width: 256
- }
- transform_param {
- crop_size: 227
- mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
- mirror: true
- }
- include: { phase: TRAIN }
-}
-layers {
- name: "data"
- type: IMAGE_DATA
- top: "data"
- top: "label"
- image_data_param {
-    source: "data/flickr_style/test.txt"
- batch_size: 50
- new_height: 256
- new_width: 256
- }
- transform_param {
- crop_size: 227
- mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
- mirror: false
- }
- include: { phase: TEST }
-}
-layers {
- name: "conv1"
- type: CONVOLUTION
- bottom: "data"
- top: "conv1"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- convolution_param {
- num_output: 96
- kernel_size: 11
- stride: 4
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 0
- }
- }
-}
-layers {
- name: "relu1"
- type: RELU
- bottom: "conv1"
- top: "conv1"
-}
-layers {
- name: "pool1"
- type: POOLING
- bottom: "conv1"
- top: "pool1"
- pooling_param {
- pool: MAX
- kernel_size: 3
- stride: 2
- }
-}
-layers {
- name: "norm1"
- type: LRN
- bottom: "pool1"
- top: "norm1"
- lrn_param {
- local_size: 5
- alpha: 0.0001
- beta: 0.75
- }
-}
-layers {
- name: "conv2"
- type: CONVOLUTION
- bottom: "norm1"
- top: "conv2"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- convolution_param {
- num_output: 256
- pad: 2
- kernel_size: 5
- group: 2
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 1
- }
- }
-}
-layers {
- name: "relu2"
- type: RELU
- bottom: "conv2"
- top: "conv2"
-}
-layers {
- name: "pool2"
- type: POOLING
- bottom: "conv2"
- top: "pool2"
- pooling_param {
- pool: MAX
- kernel_size: 3
- stride: 2
- }
-}
-layers {
- name: "norm2"
- type: LRN
- bottom: "pool2"
- top: "norm2"
- lrn_param {
- local_size: 5
- alpha: 0.0001
- beta: 0.75
- }
-}
-layers {
- name: "conv3"
- type: CONVOLUTION
- bottom: "norm2"
- top: "conv3"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- convolution_param {
- num_output: 384
- pad: 1
- kernel_size: 3
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 0
- }
- }
-}
-layers {
- name: "relu3"
- type: RELU
- bottom: "conv3"
- top: "conv3"
-}
-layers {
- name: "conv4"
- type: CONVOLUTION
- bottom: "conv3"
- top: "conv4"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- convolution_param {
- num_output: 384
- pad: 1
- kernel_size: 3
- group: 2
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 1
- }
- }
-}
-layers {
- name: "relu4"
- type: RELU
- bottom: "conv4"
- top: "conv4"
-}
-layers {
- name: "conv5"
- type: CONVOLUTION
- bottom: "conv4"
- top: "conv5"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- convolution_param {
- num_output: 256
- pad: 1
- kernel_size: 3
- group: 2
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 1
- }
- }
-}
-layers {
- name: "relu5"
- type: RELU
- bottom: "conv5"
- top: "conv5"
-}
-layers {
- name: "pool5"
- type: POOLING
- bottom: "conv5"
- top: "pool5"
- pooling_param {
- pool: MAX
- kernel_size: 3
- stride: 2
- }
-}
-layers {
- name: "fc6"
- type: INNER_PRODUCT
- bottom: "pool5"
- top: "fc6"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- inner_product_param {
- num_output: 4096
- weight_filler {
- type: "gaussian"
- std: 0.005
- }
- bias_filler {
- type: "constant"
- value: 1
- }
- }
-}
-layers {
- name: "relu6"
- type: RELU
- bottom: "fc6"
- top: "fc6"
-}
-layers {
- name: "drop6"
- type: DROPOUT
- bottom: "fc6"
- top: "fc6"
- dropout_param {
- dropout_ratio: 0.5
- }
-}
-layers {
- name: "fc7"
- type: INNER_PRODUCT
- bottom: "fc6"
- top: "fc7"
- # Note that blobs_lr can be set to 0 to disable any fine-tuning of this, and any other, layer
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- inner_product_param {
- num_output: 4096
- weight_filler {
- type: "gaussian"
- std: 0.005
- }
- bias_filler {
- type: "constant"
- value: 1
- }
- }
-}
-layers {
- name: "relu7"
- type: RELU
- bottom: "fc7"
- top: "fc7"
-}
-layers {
- name: "drop7"
- type: DROPOUT
- bottom: "fc7"
- top: "fc7"
- dropout_param {
- dropout_ratio: 0.5
- }
-}
-layers {
- name: "fc8_flickr"
- type: INNER_PRODUCT
- bottom: "fc7"
- top: "fc8_flickr"
- # blobs_lr is set to higher than for other layers, because this layer is starting from random while the others are already trained
- blobs_lr: 10
- blobs_lr: 20
- weight_decay: 1
- weight_decay: 0
- inner_product_param {
- num_output: 20
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 0
- }
- }
-}
-layers {
- name: "loss"
- type: SOFTMAX_LOSS
- bottom: "fc8_flickr"
- bottom: "label"
-}
-layers {
- name: "accuracy"
- type: ACCURACY
- bottom: "fc8_flickr"
- bottom: "label"
- top: "accuracy"
- include: { phase: TEST }
-}
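As a sanity check on the network definition removed above, the spatial dimensions can be traced from the 227×227 training crop down to pool5. A small standalone sketch, assuming the standard Caffe output-size arithmetic (convolution rounds down, pooling rounds up):

    import math

    # out = (in + 2*pad - kernel) / stride + 1
    def conv_out(size, kernel, stride=1, pad=0):
        return (size + 2 * pad - kernel) // stride + 1

    # Pooling uses the same formula but rounds up.
    def pool_out(size, kernel, stride):
        return int(math.ceil((size - kernel) / stride)) + 1

    s = 227
    s = conv_out(s, 11, stride=4)   # conv1 -> 55
    s = pool_out(s, 3, 2)           # pool1 -> 27
    s = conv_out(s, 5, pad=2)       # conv2 -> 27
    s = pool_out(s, 3, 2)           # pool2 -> 13
    s = conv_out(s, 3, pad=1)       # conv3, conv4, conv5 all keep 13
    s = pool_out(s, 3, 2)           # pool5 -> 6
    print(256 * s * s)              # 9216 inputs to fc6

This is why only `fc8_flickr` has to differ from the pretrained CaffeNet for the 20 style classes: every layer up through fc7 is shape-compatible with the reference model.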
diff --git a/examples/finetune_flickr_style/readme.md b/examples/finetune_flickr_style/readme.md
index 68b2778c..2c9ee0a8 100644
--- a/examples/finetune_flickr_style/readme.md
+++ b/examples/finetune_flickr_style/readme.md
@@ -34,7 +34,7 @@ All steps are to be done from the caffe root directory.
The dataset is distributed as a list of URLs with corresponding labels.
Using a script, we will download a small subset of the data and split it into train and val sets.
- caffe % ./examples/finetune_flickr_style/assemble_data.py -h
+ caffe % ./models/finetune_flickr_style/assemble_data.py -h
usage: assemble_data.py [-h] [-s SEED] [-i IMAGES] [-w WORKERS]
Download a subset of Flickr Style to a directory
@@ -48,7 +48,7 @@ Using a script, we will download a small subset of the data and split it into tr
num workers used to download images. -x uses (all - x)
cores.
- caffe % python examples/finetune_flickr_style/assemble_data.py --workers=-1 --images=2000 --seed 831486
+ caffe % python models/finetune_flickr_style/assemble_data.py --workers=-1 --images=2000 --seed 831486
Downloading 2000 images with 7 workers...
Writing train/val for 1939 successfully downloaded images.
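The exact split logic lives in `assemble_data.py`; as a rough illustration of a seeded, deterministic train/val split like the one it performs (the real script's details may differ), consider:

    import random

    def split_train_val(paths, seed=831486, val_frac=0.2):
        """Deterministically shuffle and split a list of image paths."""
        rng = random.Random(seed)
        paths = sorted(paths)       # fix the order before shuffling
        rng.shuffle(paths)
        n_val = int(len(paths) * val_frac)
        return paths[n_val:], paths[:n_val]   # train, val

With 1,939 downloaded images, a roughly 20% validation fraction gives counts in line with the 1,557/382 split reported just below.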
@@ -56,11 +56,11 @@ This script downloads images and writes train/val file lists into `data/flickr_s
With this random seed there are 1,557 train images and 382 test images.
The prototxts in this example assume this, and also assume the presence of the ImageNet mean file (run `get_ilsvrc_aux.sh` from `data/ilsvrc12` to obtain this if you haven't yet).
-We'll also need the ImageNet-trained model, which you can obtain by running `get_caffe_reference_imagenet_model.sh` from `examples/imagenet`.
+We'll also need the ImageNet-trained model, which you can obtain by running `get_caffe_reference_imagenet_model.sh` from `models/imagenet`.
Now we can train! (You can fine-tune in CPU mode by leaving out the `-gpu` flag.)
- caffe % ./build/tools/caffe train -solver examples/finetune_flickr_style/flickr_style_solver.prototxt -weights models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel -gpu 0
+ caffe % ./build/tools/caffe train -solver models/finetune_flickr_style/flickr_style_solver.prototxt -weights models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel -gpu 0
[...]
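The same fine-tuning run can be driven from Python; a sketch, assuming pycaffe is built and the solver/weights paths above exist:

    import caffe

    caffe.set_mode_gpu()   # or caffe.set_mode_cpu() to fine-tune on CPU
    solver = caffe.SGDSolver('models/finetune_flickr_style/flickr_style_solver.prototxt')
    # copy_from() initializes only layers whose names match the pretrained
    # net; fc8_flickr is a new name, so it keeps its random initialization
    # (and its higher blobs_lr lets it learn quickly).
    solver.net.copy_from('models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel')
    solver.solve()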
@@ -149,10 +149,16 @@ This model is only beginning to learn.
Fine-tuning can be feasible when training from scratch would not be, for lack of either time or data.
Even in CPU mode each pass through the training set takes ~100 s. GPU fine-tuning is of course faster still and can learn a useful model in minutes or hours instead of days or weeks.
Furthermore, note that the model has trained on fewer than 2,000 instances. Transferring the ImageNet pretraining to a new task like style recognition can require much less data than training from scratch.
+
Now try fine-tuning to your own tasks and data!
+## Trained model
+
+We provide a model trained on all 80K images, with final accuracy of 39.16%.
+Simply do `./scripts/download_model_binary.py models/finetune_flickr_style` to obtain it.
+
## License
The Flickr Style dataset as distributed here contains only URLs to images.
Some of the images may have copyright.
-Training a category-recognition model for research/non-commercial use may constitute fair use of this data.
+Training a category-recognition model for research/non-commercial use may constitute fair use of this data, but the result should not be used for commercial purposes.