author    Sergey Karayev <sergeykarayev@gmail.com>  2014-09-04 02:50:53 +0100
committer Sergey Karayev <sergeykarayev@gmail.com>  2014-09-04 03:59:14 +0100
commit    c6827bf3fcd8bf6cafb2aeb7a1ce4d3a958b7f19 (patch)
tree      8dedeae1ad6e6af3be84a4bab3bb6d1682cad251 /examples
parent    bc601e9060cc71b95f0250b9efacaffcbb5d8c0e (diff)
flickr style fine-tuning model (separated from example read me)
Diffstat (limited to 'examples')
-rw-r--r--  examples/finetune_flickr_style/flickr_style_solver.prototxt    |  17
-rw-r--r--  examples/finetune_flickr_style/flickr_style_train_val.prototxt | 349
-rw-r--r--  examples/finetune_flickr_style/readme.md                       |  16
3 files changed, 11 insertions, 371 deletions
diff --git a/examples/finetune_flickr_style/flickr_style_solver.prototxt b/examples/finetune_flickr_style/flickr_style_solver.prototxt
deleted file mode 100644
index 756e162b..00000000
--- a/examples/finetune_flickr_style/flickr_style_solver.prototxt
+++ /dev/null
@@ -1,17 +0,0 @@
-net: "examples/finetune_flickr_style/flickr_style_train_val.prototxt"
-test_iter: 100
-test_interval: 1000
-# lr for fine-tuning should be lower than when starting from scratch
-base_lr: 0.001
-lr_policy: "step"
-gamma: 0.1
-# stepsize should also be lower, as we're closer to being done
-stepsize: 20000
-display: 20
-max_iter: 100000
-momentum: 0.9
-weight_decay: 0.0005
-snapshot: 10000
-snapshot_prefix: "examples/finetune_flickr_style/flickr_style"
-# uncomment the following to default to CPU mode solving
-# solver_mode: CPU
diff --git a/examples/finetune_flickr_style/flickr_style_train_val.prototxt b/examples/finetune_flickr_style/flickr_style_train_val.prototxt
deleted file mode 100644
index 46a198a8..00000000
--- a/examples/finetune_flickr_style/flickr_style_train_val.prototxt
+++ /dev/null
@@ -1,349 +0,0 @@
-name: "FlickrStyleCaffeNet"
-layers {
- name: "data"
- type: IMAGE_DATA
- top: "data"
- top: "label"
- image_data_param {
- source: "data/flickr_style/train.txt"
- batch_size: 50
- new_height: 256
- new_width: 256
- }
- transform_param {
- crop_size: 227
- mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
- mirror: true
- }
- include: { phase: TRAIN }
-}
-layers {
- name: "data"
- type: IMAGE_DATA
- top: "data"
- top: "label"
- image_data_param {
-    source: "data/flickr_style/test.txt"
- batch_size: 50
- new_height: 256
- new_width: 256
- }
- transform_param {
- crop_size: 227
- mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
- mirror: false
- }
- include: { phase: TEST }
-}
-layers {
- name: "conv1"
- type: CONVOLUTION
- bottom: "data"
- top: "conv1"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- convolution_param {
- num_output: 96
- kernel_size: 11
- stride: 4
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 0
- }
- }
-}
-layers {
- name: "relu1"
- type: RELU
- bottom: "conv1"
- top: "conv1"
-}
-layers {
- name: "pool1"
- type: POOLING
- bottom: "conv1"
- top: "pool1"
- pooling_param {
- pool: MAX
- kernel_size: 3
- stride: 2
- }
-}
-layers {
- name: "norm1"
- type: LRN
- bottom: "pool1"
- top: "norm1"
- lrn_param {
- local_size: 5
- alpha: 0.0001
- beta: 0.75
- }
-}
-layers {
- name: "conv2"
- type: CONVOLUTION
- bottom: "norm1"
- top: "conv2"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- convolution_param {
- num_output: 256
- pad: 2
- kernel_size: 5
- group: 2
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 1
- }
- }
-}
-layers {
- name: "relu2"
- type: RELU
- bottom: "conv2"
- top: "conv2"
-}
-layers {
- name: "pool2"
- type: POOLING
- bottom: "conv2"
- top: "pool2"
- pooling_param {
- pool: MAX
- kernel_size: 3
- stride: 2
- }
-}
-layers {
- name: "norm2"
- type: LRN
- bottom: "pool2"
- top: "norm2"
- lrn_param {
- local_size: 5
- alpha: 0.0001
- beta: 0.75
- }
-}
-layers {
- name: "conv3"
- type: CONVOLUTION
- bottom: "norm2"
- top: "conv3"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- convolution_param {
- num_output: 384
- pad: 1
- kernel_size: 3
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 0
- }
- }
-}
-layers {
- name: "relu3"
- type: RELU
- bottom: "conv3"
- top: "conv3"
-}
-layers {
- name: "conv4"
- type: CONVOLUTION
- bottom: "conv3"
- top: "conv4"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- convolution_param {
- num_output: 384
- pad: 1
- kernel_size: 3
- group: 2
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 1
- }
- }
-}
-layers {
- name: "relu4"
- type: RELU
- bottom: "conv4"
- top: "conv4"
-}
-layers {
- name: "conv5"
- type: CONVOLUTION
- bottom: "conv4"
- top: "conv5"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- convolution_param {
- num_output: 256
- pad: 1
- kernel_size: 3
- group: 2
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 1
- }
- }
-}
-layers {
- name: "relu5"
- type: RELU
- bottom: "conv5"
- top: "conv5"
-}
-layers {
- name: "pool5"
- type: POOLING
- bottom: "conv5"
- top: "pool5"
- pooling_param {
- pool: MAX
- kernel_size: 3
- stride: 2
- }
-}
-layers {
- name: "fc6"
- type: INNER_PRODUCT
- bottom: "pool5"
- top: "fc6"
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- inner_product_param {
- num_output: 4096
- weight_filler {
- type: "gaussian"
- std: 0.005
- }
- bias_filler {
- type: "constant"
- value: 1
- }
- }
-}
-layers {
- name: "relu6"
- type: RELU
- bottom: "fc6"
- top: "fc6"
-}
-layers {
- name: "drop6"
- type: DROPOUT
- bottom: "fc6"
- top: "fc6"
- dropout_param {
- dropout_ratio: 0.5
- }
-}
-layers {
- name: "fc7"
- type: INNER_PRODUCT
- bottom: "fc6"
- top: "fc7"
- # Note that blobs_lr can be set to 0 to disable any fine-tuning of this, and any other, layer
- blobs_lr: 1
- blobs_lr: 2
- weight_decay: 1
- weight_decay: 0
- inner_product_param {
- num_output: 4096
- weight_filler {
- type: "gaussian"
- std: 0.005
- }
- bias_filler {
- type: "constant"
- value: 1
- }
- }
-}
-layers {
- name: "relu7"
- type: RELU
- bottom: "fc7"
- top: "fc7"
-}
-layers {
- name: "drop7"
- type: DROPOUT
- bottom: "fc7"
- top: "fc7"
- dropout_param {
- dropout_ratio: 0.5
- }
-}
-layers {
- name: "fc8_flickr"
- type: INNER_PRODUCT
- bottom: "fc7"
- top: "fc8_flickr"
- # blobs_lr is set to higher than for other layers, because this layer is starting from random while the others are already trained
- blobs_lr: 10
- blobs_lr: 20
- weight_decay: 1
- weight_decay: 0
- inner_product_param {
- num_output: 20
- weight_filler {
- type: "gaussian"
- std: 0.01
- }
- bias_filler {
- type: "constant"
- value: 0
- }
- }
-}
-layers {
- name: "loss"
- type: SOFTMAX_LOSS
- bottom: "fc8_flickr"
- bottom: "label"
-}
-layers {
- name: "accuracy"
- type: ACCURACY
- bottom: "fc8_flickr"
- bottom: "label"
- top: "accuracy"
- include: { phase: TEST }
-}
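As a sanity check on the network definition removed above, the spatial dimensions can be traced from the 227×227 training crop down to pool5. A small standalone sketch, assuming the standard Caffe output-size arithmetic (convolution rounds down, pooling rounds up):

    import math

    # out = (in + 2*pad - kernel) / stride + 1
    def conv_out(size, kernel, stride=1, pad=0):
        return (size + 2 * pad - kernel) // stride + 1

    # Pooling uses the same formula but rounds up.
    def pool_out(size, kernel, stride):
        return int(math.ceil((size - kernel) / stride)) + 1

    s = 227
    s = conv_out(s, 11, stride=4)   # conv1 -> 55
    s = pool_out(s, 3, 2)           # pool1 -> 27
    s = conv_out(s, 5, pad=2)       # conv2 -> 27
    s = pool_out(s, 3, 2)           # pool2 -> 13
    s = conv_out(s, 3, pad=1)       # conv3, conv4, conv5 all keep 13
    s = pool_out(s, 3, 2)           # pool5 -> 6
    print(256 * s * s)              # 9216 inputs to fc6

This is why only `fc8_flickr` has to differ from the pretrained CaffeNet for the 20 style classes: every layer up through fc7 is shape-compatible with the reference model.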
diff --git a/examples/finetune_flickr_style/readme.md b/examples/finetune_flickr_style/readme.md
index 68b2778c..2c9ee0a8 100644
--- a/examples/finetune_flickr_style/readme.md
+++ b/examples/finetune_flickr_style/readme.md
@@ -34,7 +34,7 @@ All steps are to be done from the caffe root directory.
The dataset is distributed as a list of URLs with corresponding labels.
Using a script, we will download a small subset of the data and split it into train and val sets.
- caffe % ./examples/finetune_flickr_style/assemble_data.py -h
+ caffe % ./models/finetune_flickr_style/assemble_data.py -h
usage: assemble_data.py [-h] [-s SEED] [-i IMAGES] [-w WORKERS]
Download a subset of Flickr Style to a directory
@@ -48,7 +48,7 @@ Using a script, we will download a small subset of the data and split it into tr
num workers used to download images. -x uses (all - x)
cores.
- caffe % python examples/finetune_flickr_style/assemble_data.py --workers=-1 --images=2000 --seed 831486
+ caffe % python models/finetune_flickr_style/assemble_data.py --workers=-1 --images=2000 --seed 831486
Downloading 2000 images with 7 workers...
Writing train/val for 1939 successfully downloaded images.
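The exact split logic lives in `assemble_data.py`; as a rough illustration of a seeded, deterministic train/val split like the one it performs (the real script's details may differ), consider:

    import random

    def split_train_val(paths, seed=831486, val_frac=0.2):
        """Deterministically shuffle and split a list of image paths."""
        rng = random.Random(seed)
        paths = sorted(paths)       # fix the order before shuffling
        rng.shuffle(paths)
        n_val = int(len(paths) * val_frac)
        return paths[n_val:], paths[:n_val]   # train, val

With 1,939 downloaded images, a roughly 20% validation fraction gives counts in line with the 1,557/382 split reported just below.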
@@ -56,11 +56,11 @@ This script downloads images and writes train/val file lists into `data/flickr_s
With this random seed there are 1,557 train images and 382 test images.
The prototxts in this example assume this, and also assume the presence of the ImageNet mean file (run `get_ilsvrc_aux.sh` from `data/ilsvrc12` to obtain this if you haven't yet).
-We'll also need the ImageNet-trained model, which you can obtain by running `get_caffe_reference_imagenet_model.sh` from `examples/imagenet`.
+We'll also need the ImageNet-trained model, which you can obtain by running `get_caffe_reference_imagenet_model.sh` from `models/imagenet`.
Now we can train! (You can fine-tune in CPU mode by leaving out the `-gpu` flag.)
- caffe % ./build/tools/caffe train -solver examples/finetune_flickr_style/flickr_style_solver.prototxt -weights models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel -gpu 0
+ caffe % ./build/tools/caffe train -solver models/finetune_flickr_style/flickr_style_solver.prototxt -weights models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel -gpu 0
[...]
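The same fine-tuning run can be driven from Python; a sketch, assuming pycaffe is built and the solver/weights paths above exist:

    import caffe

    caffe.set_mode_gpu()   # or caffe.set_mode_cpu() to fine-tune on CPU
    solver = caffe.SGDSolver('models/finetune_flickr_style/flickr_style_solver.prototxt')
    # copy_from() initializes only layers whose names match the pretrained
    # net; fc8_flickr is a new name, so it keeps its random initialization
    # (and its higher blobs_lr lets it learn quickly).
    solver.net.copy_from('models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel')
    solver.solve()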
@@ -149,10 +149,16 @@ This model is only beginning to learn.
Fine-tuning can be feasible when training from scratch would not be, for lack of either time or data.
Even in CPU mode each pass through the training set takes ~100 s. GPU fine-tuning is of course faster still and can learn a useful model in minutes or hours instead of days or weeks.
Furthermore, note that the model has trained on fewer than 2,000 instances. Transferring the ImageNet pretraining to a new task like style recognition can require much less data than training from scratch.
+
Now try fine-tuning to your own tasks and data!
+## Trained model
+
+We provide a model trained on all 80K images, with final accuracy of 39.16%.
+Simply do `./scripts/download_model_binary.py models/finetune_flickr_style` to obtain it.
+
## License
The Flickr Style dataset as distributed here contains only URLs to images.
Some of the images may have copyright.
-Training a category-recognition model for research/non-commercial use may constitute fair use of this data.
+Training a category-recognition model for research/non-commercial use may constitute fair use of this data, but the result should not be used for commercial purposes.