#ifndef CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
#define CAFFE2_IMAGE_IMAGE_INPUT_OP_H_

#include <opencv2/opencv.hpp>

#include <algorithm>
#include <chrono>
#include <random>

#include "caffe/proto/caffe.pb.h"
#include "caffe2/core/db.h"
#include "caffe2/utils/cast.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/thread_pool.h"
#include "caffe2/operators/prefetch_op.h"
#include "caffe2/image/transform_gpu.h"

namespace caffe2 {

class CUDAContext;

template <class Context>
class ImageInputOp final : public PrefetchOperator<Context> {
  // SINGLE_LABEL: single integer label for multi-class classification
  // MULTI_LABEL_SPARSE: sparse active label indices for multi-label
  // classification
  // MULTI_LABEL_DENSE: dense label embedding vector for label embedding
  // regression
  // MULTI_LABEL_WEIGHTED_SPARSE: sparse active label indices with per-label
  // weights for multi-label classification
  // SINGLE_LABEL_WEIGHTED: single integer label for multi-class
  // classification with weighted sampling
  enum LABEL_TYPE {
    SINGLE_LABEL = 0,
    MULTI_LABEL_SPARSE = 1,
    MULTI_LABEL_DENSE = 2,
    MULTI_LABEL_WEIGHTED_SPARSE = 3,
    SINGLE_LABEL_WEIGHTED = 4
  };

  // INCEPTION_STYLE: Random crop with size 8% - 100% image area and aspect
  // ratio in [3/4, 4/3]. Reference: GoogleNet paper
  enum SCALE_JITTER_TYPE {
    NO_SCALE_JITTER = 0,
    INCEPTION_STYLE = 1
    // TODO(zyan3): ResNet-style random scale jitter
  };

 public:
  using OperatorBase::OutputSize;
  using PrefetchOperator<Context>::context_;
  using PrefetchOperator<Context>::prefetch_thread_;
  explicit ImageInputOp(const OperatorDef& operator_def, Workspace* ws);
  ~ImageInputOp() {
    PrefetchOperator<Context>::Finalize();
  }

  bool Prefetch() override;
  bool CopyPrefetched() override;

 private:
  using BoundingBox = struct {
    bool valid;
    int ymin;
    int xmin;
    int height;
    int width;
  };

  // Structure to store per-image information.
  // This can be modified by the DecodeAnd* methods, so it needs
  // to be privatized per launch.
  using PerImageArg = struct { BoundingBox bounding_params; };

  bool GetImageAndLabelAndInfoFromDBValue(
      const string& value,
      cv::Mat* img,
      PerImageArg& info,
      int item_id,
      std::mt19937* randgen);
  void DecodeAndTransform(
      const std::string& value,
      float* image_data,
      int item_id,
      const int channels,
      std::size_t thread_index);
  void DecodeAndTransposeOnly(
      const std::string& value,
      uint8_t* image_data,
      int item_id,
      const int channels,
      std::size_t thread_index);

  unique_ptr<db::DBReader> owned_reader_;
  const db::DBReader* reader_;
  CPUContext cpu_context_;
  TensorCPU prefetched_image_;
  TensorCPU prefetched_label_;
  vector<TensorCPU> prefetched_additional_outputs_;
  Tensor<Context> prefetched_image_on_device_;
  Tensor<Context> prefetched_label_on_device_;
  vector<Tensor<Context>> prefetched_additional_outputs_on_device_;
  // Default parameters for images
  PerImageArg default_arg_;
  int batch_size_;
  LABEL_TYPE label_type_;
  int num_labels_;

  bool color_;
  bool color_jitter_;
  float img_saturation_;
  float img_brightness_;
  float img_contrast_;
  bool color_lighting_;
  float color_lighting_std_;
  std::vector<std::vector<float>> color_lighting_eigvecs_;
  std::vector<float> color_lighting_eigvals_;
  SCALE_JITTER_TYPE scale_jitter_type_;
  int scale_;
  // Minsize is similar to scale except that it will only
  // force the image to scale up if it is too small. In other words,
  // it ensures that both dimensions of the image are at least minsize_
  int minsize_;
  bool warp_;
  int crop_;
  std::vector<float> mean_;
  std::vector<float> std_;
  Tensor<Context> mean_gpu_;
  Tensor<Context> std_gpu_;
  bool mirror_;
  bool is_test_;
  bool use_caffe_datum_;
  bool gpu_transform_;
  bool mean_std_copied_ = false;

  // thread pool for parse + decode
  int num_decode_threads_;
  int additional_inputs_offset_;
  int additional_inputs_count_;
  std::shared_ptr<TaskThreadPool> thread_pool_;

  // Output type for GPU transform path
  TensorProto_DataType output_type_;

  // random minsize
  vector<int> random_scale_;
  bool random_scaling_;

  // Working variables
  std::vector<std::mt19937> randgen_per_thread_;
};
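// A minimal usage sketch (not part of this header): assuming the helpers
// CreateOperatorDef and MakeArgument from "caffe2/utils/proto_utils.h" and a
// DBReader blob named "db_reader" created elsewhere, the operator could be
// configured along these lines:
//
//   OperatorDef def = CreateOperatorDef(
//       "ImageInput",
//       "",
//       std::vector<string>{"db_reader"},
//       std::vector<string>{"data", "label"},
//       std::vector<Argument>{
//           MakeArgument<int>("batch_size", 32),
//           MakeArgument<int>("scale", 256),
//           MakeArgument<int>("crop", 224),
//           MakeArgument<int>("is_test", 0)});
//
// Outputs beyond "data" and "label" become the additional outputs handled by
// prefetched_additional_outputs_ above.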
template <class Context>
ImageInputOp<Context>::ImageInputOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : PrefetchOperator<Context>(operator_def, ws),
      reader_(nullptr),
      prefetched_additional_outputs_(OutputSize() - 2),
      prefetched_additional_outputs_on_device_(OutputSize() - 2),
      batch_size_(
          OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
      label_type_(static_cast<LABEL_TYPE>(
          OperatorBase::template GetSingleArgument<int>("label_type", 0))),
      num_labels_(
          OperatorBase::template GetSingleArgument<int>("num_labels", 0)),
      color_(OperatorBase::template GetSingleArgument<int>("color", 1)),
      color_jitter_(
          OperatorBase::template GetSingleArgument<int>("color_jitter", 0)),
      img_saturation_(OperatorBase::template GetSingleArgument<float>(
          "img_saturation", 0.4)),
      img_brightness_(OperatorBase::template GetSingleArgument<float>(
          "img_brightness", 0.4)),
      img_contrast_(OperatorBase::template GetSingleArgument<float>(
          "img_contrast", 0.4)),
      color_lighting_(
          OperatorBase::template GetSingleArgument<int>("color_lighting", 0)),
      color_lighting_std_(OperatorBase::template GetSingleArgument<float>(
          "color_lighting_std", 0.1)),
      scale_jitter_type_(static_cast<SCALE_JITTER_TYPE>(
          OperatorBase::template GetSingleArgument<int>(
              "scale_jitter_type", 0))),
      scale_(OperatorBase::template GetSingleArgument<int>("scale", -1)),
      minsize_(OperatorBase::template GetSingleArgument<int>("minsize", -1)),
      warp_(OperatorBase::template GetSingleArgument<int>("warp", 0)),
      crop_(OperatorBase::template GetSingleArgument<int>("crop", -1)),
      mirror_(OperatorBase::template GetSingleArgument<int>("mirror", 0)),
      is_test_(OperatorBase::template GetSingleArgument<int>(
          OpSchema::Arg_IsTest, 0)),
      use_caffe_datum_(
          OperatorBase::template GetSingleArgument<int>("use_caffe_datum", 0)),
      gpu_transform_(OperatorBase::template GetSingleArgument<int>(
          "use_gpu_transform", 0)),
      num_decode_threads_(
          OperatorBase::template GetSingleArgument<int>("decode_threads", 4)),
      thread_pool_(std::make_shared<TaskThreadPool>(num_decode_threads_)),
      // output type only supported with CUDA and use_gpu_transform for now
      output_type_(
          cast::GetCastDataType(ArgumentHelper(operator_def), "output_type")),
      random_scale_(OperatorBase::template GetRepeatedArgument<int>(
          "random_scale", {-1, -1})) {
  if ((random_scale_[0] == -1) || (random_scale_[1] == -1)) {
    random_scaling_ = false;
  } else {
    random_scaling_ = true;
    minsize_ = random_scale_[0];
  }

  mean_ = OperatorBase::template GetRepeatedArgument<float>(
      "mean_per_channel",
      {OperatorBase::template GetSingleArgument<float>("mean", 0.)});

  std_ = OperatorBase::template GetRepeatedArgument<float>(
      "std_per_channel",
      {OperatorBase::template GetSingleArgument<float>("std", 1.)});

  vector<int> additional_output_sizes =
      OperatorBase::template GetRepeatedArgument<int>(
          "output_sizes", vector<int>(OutputSize() - 2, 1));

  additional_inputs_count_ = OutputSize() - 2;

  default_arg_.bounding_params = {
      false,
      OperatorBase::template GetSingleArgument<int>("bounding_ymin", -1),
      OperatorBase::template GetSingleArgument<int>("bounding_xmin", -1),
      OperatorBase::template GetSingleArgument<int>("bounding_height", -1),
      OperatorBase::template GetSingleArgument<int>("bounding_width", -1),
  };

  if (operator_def.input_size() == 0) {
    LOG(ERROR) << "You are using an old ImageInputOp format that creates "
                  "a local db reader. Consider moving to the new style "
                  "that takes in a DBReader blob instead.";
    string db_name =
        OperatorBase::template GetSingleArgument<string>("db", "");
    CAFFE_ENFORCE_GT(db_name.size(), 0, "Must specify a db name.");
    owned_reader_.reset(new db::DBReader(
        OperatorBase::template GetSingleArgument<string>(
            "db_type", "leveldb"),
        db_name));
    reader_ = owned_reader_.get();
  }

  // hard-coded PCA eigenvectors and eigenvalues, based on RGB channel order
  color_lighting_eigvecs_.push_back(
      std::vector<float>{-144.7125, 183.396, 102.2295});
  color_lighting_eigvecs_.push_back(
      std::vector<float>{-148.104, -1.1475, -207.57});
  color_lighting_eigvecs_.push_back(
      std::vector<float>{-148.818, -177.174, 107.1765});

  color_lighting_eigvals_ = std::vector<float>{0.2175, 0.0188, 0.0045};

  CAFFE_ENFORCE_GT(batch_size_, 0, "Batch size should be positive.");
  if (use_caffe_datum_) {
    CAFFE_ENFORCE(
        label_type_ == SINGLE_LABEL || label_type_ == SINGLE_LABEL_WEIGHTED,
        "Caffe datum only supports single integer label");
  }
  if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) {
    CAFFE_ENFORCE_GT(
        num_labels_,
        0,
        "Number of labels must be set for using either sparse label indices "
        "or dense label embedding.");
  }
  if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE ||
      label_type_ == SINGLE_LABEL_WEIGHTED) {
    additional_inputs_offset_ = 3;
  } else {
    additional_inputs_offset_ = 2;
  }
  CAFFE_ENFORCE(
      (scale_ > 0) != (minsize_ > 0),
      "Must provide one and only one of scaling or minsize");
  CAFFE_ENFORCE_GT(crop_, 0, "Must provide the cropping value.");
  CAFFE_ENFORCE_GE(
      scale_ > 0 ? scale_ : minsize_,
      crop_,
      "The scale/minsize value must be no smaller than the crop value.");

  CAFFE_ENFORCE_EQ(
      mean_.size(),
      std_.size(),
      "The mean and std. dev vectors must be of the same size.");
  CAFFE_ENFORCE(
      mean_.size() == 1 || mean_.size() == 3,
      "The mean and std. dev vectors must be of size 1 or 3");
  CAFFE_ENFORCE(
      !use_caffe_datum_ || OutputSize() == 2,
      "There can only be 2 outputs if the Caffe datum format is used");
  CAFFE_ENFORCE(
      additional_output_sizes.size() == OutputSize() - 2,
      "If the output sizes are specified, they must be specified for all "
      "additional outputs");

  CAFFE_ENFORCE(
      random_scale_.size() == 2, "Must provide [scale_min, scale_max]");
  CAFFE_ENFORCE_GE(
      random_scale_[1],
      random_scale_[0],
      "random scale must provide a range [min, max]");

  if (default_arg_.bounding_params.ymin < 0 ||
      default_arg_.bounding_params.xmin < 0 ||
      default_arg_.bounding_params.height < 0 ||
      default_arg_.bounding_params.width < 0) {
    default_arg_.bounding_params.valid = false;
  } else {
    default_arg_.bounding_params.valid = true;
  }

  if (mean_.size() == 1) {
    // We are going to extend to 3 using the first value
    mean_.resize(3, mean_[0]);
    std_.resize(3, std_[0]);
  }

  LOG(INFO) << "Creating an image input op with the following setting: ";
  LOG(INFO) << "    Using " << num_decode_threads_ << " CPU threads;";
  if (gpu_transform_) {
    LOG(INFO) << "    Performing transformation on GPU";
  }
  LOG(INFO) << "    Outputting in batches of " << batch_size_ << " images;";
  LOG(INFO) << "    Treating input image as "
            << (color_ ? "color " : "grayscale ") << "image;";
  if (default_arg_.bounding_params.valid) {
    LOG(INFO) << "    Applying a default bounding box of Y ["
              << default_arg_.bounding_params.ymin << "; "
              << default_arg_.bounding_params.ymin +
                  default_arg_.bounding_params.height
              << ") x X ["
              << default_arg_.bounding_params.xmin << "; "
              << default_arg_.bounding_params.xmin +
                  default_arg_.bounding_params.width
              << ")";
  }
  if (scale_ > 0 && !random_scaling_) {
    LOG(INFO) << "    Scaling image to " << scale_
              << (warp_ ? " with " : " without ") << "warping;";
  } else {
    if (random_scaling_) {
      // randomly set min_size_ for each image
      LOG(INFO) << "    Randomly scaling shortest side between "
                << random_scale_[0] << " and " << random_scale_[1];
    } else {
      // Here, minsize_ > 0
      LOG(INFO) << "    Ensuring minimum image size of " << minsize_
                << (warp_ ? " with " : " without ") << "warping;";
    }
  }
  LOG(INFO) << "    " << (is_test_ ? "Central" : "Random")
            << " cropping image to " << crop_
            << (mirror_ ? " with " : " without ") << "random mirroring;";
  LOG(INFO) << "Label Type: " << label_type_;
  LOG(INFO) << "Num Labels: " << num_labels_;

  auto mit = mean_.begin();
  auto sit = std_.begin();
  for (int i = 0; mit != mean_.end() && sit != std_.end(); ++mit, ++sit, ++i) {
    LOG(INFO) << "    Default [Channel " << i << "] Subtract mean " << *mit
              << " and divide by std " << *sit << ".";
    // We actually will use the inverse of std, so inverse it here
    *sit = 1.f / *sit;
  }
  LOG(INFO) << "    Outputting images as "
            << OperatorBase::template GetSingleArgument<string>(
                   "output_type", "unknown")
            << ".";

  std::mt19937 meta_randgen(time(nullptr));
  for (int i = 0; i < num_decode_threads_; ++i) {
    randgen_per_thread_.emplace_back(meta_randgen());
  }
  prefetched_image_.Resize(
      TIndex(batch_size_),
      TIndex(crop_),
      TIndex(crop_),
      TIndex(color_ ? 3 : 1));
  if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) {
    prefetched_label_.Resize(TIndex(batch_size_), TIndex(num_labels_));
  } else {
    prefetched_label_.Resize(vector<TIndex>(1, batch_size_));
  }

  for (int i = 0; i < additional_output_sizes.size(); ++i) {
    prefetched_additional_outputs_[i].Resize(
        TIndex(batch_size_), TIndex(additional_output_sizes[i]));
  }
}

// Inception-style scale jittering
template <class Context>
bool RandomSizedCropping(cv::Mat* img, const int crop, std::mt19937* randgen) {
  cv::Mat scaled_img;
  bool inception_scale_jitter = false;
  int im_height = img->rows, im_width = img->cols;
  int area = im_height * im_width;
  std::uniform_real_distribution<> area_dis(0.08, 1.0);
  std::uniform_real_distribution<> aspect_ratio_dis(3.0 / 4.0, 4.0 / 3.0);

  cv::Mat cropping;
  for (int i = 0; i < 10; ++i) {
    int target_area = int(ceil(area_dis(*randgen) * area));
    float aspect_ratio = aspect_ratio_dis(*randgen);
    int nh = floor(std::sqrt(((float)target_area / aspect_ratio)));
    int nw = floor(std::sqrt(((float)target_area * aspect_ratio)));
    if (nh >= 1 && nh <= im_height && nw >= 1 && nw <= im_width) {
      int height_offset =
          std::uniform_int_distribution<>(0, im_height - nh)(*randgen);
      int width_offset =
          std::uniform_int_distribution<>(0, im_width - nw)(*randgen);
      cv::Rect ROI(width_offset, height_offset, nw, nh);
      cropping = (*img)(ROI);
      cv::resize(
          cropping, scaled_img, cv::Size(crop, crop), 0, 0, cv::INTER_AREA);
      *img = scaled_img;
      inception_scale_jitter = true;
      break;
    }
  }
  return inception_scale_jitter;
}
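// A worked example of the sampling loop above (illustrative numbers only):
// for a 480 x 640 input, area = 307200. If area_dis draws 0.5 and
// aspect_ratio_dis draws 4/3, then target_area = 153600,
//   nh = floor(sqrt(153600 / (4/3))) = floor(339.4) = 339
//   nw = floor(sqrt(153600 * (4/3))) = floor(452.5) = 452
// so nh * nw ~= target_area and nw / nh ~= 4/3. Both fit inside 480 x 640,
// so a random offset is drawn and the crop is resized to crop x crop. If no
// attempt fits within 10 tries, the function returns false and the caller
// falls back to plain scaling plus random cropping.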
template <class Context>
bool ImageInputOp<Context>::GetImageAndLabelAndInfoFromDBValue(
    const string& value,
    cv::Mat* img,
    PerImageArg& info,
    int item_id,
    std::mt19937* randgen) {
  //
  // recommend using --caffe2_use_fatal_for_enforce=1 when using ImageInputOp
  // as this function runs on a worker thread and the exceptions from
  // CAFFE_ENFORCE are silently dropped by the thread worker functions
  //
  cv::Mat src;

  // Use the default information for images
  info = default_arg_;
  if (use_caffe_datum_) {
    // The input is a caffe datum format.
    caffe::Datum datum;
    CAFFE_ENFORCE(datum.ParseFromString(value));

    prefetched_label_.mutable_data<int>()[item_id] = datum.label();
    if (datum.encoded()) {
      // encoded image in datum.
      src = cv::imdecode(
          cv::Mat(
              1,
              datum.data().size(),
              CV_8UC1,
              const_cast<char*>(datum.data().data())),
          color_ ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
    } else {
      // Raw image in datum.
      CAFFE_ENFORCE(datum.channels() == 3 || datum.channels() == 1);

      int src_c = datum.channels();
      src.create(
          datum.height(), datum.width(), (src_c == 3) ? CV_8UC3 : CV_8UC1);

      if (src_c == 1) {
        memcpy(src.ptr<uchar>(0), datum.data().data(), datum.data().size());
      } else {
        // Datum stores things in CHW order, let's do HWC for images to make
        // things more consistent with conventional image storage.
        for (int c = 0; c < 3; ++c) {
          const char* datum_buffer =
              datum.data().data() + datum.height() * datum.width() * c;
          uchar* ptr = src.ptr<uchar>(0) + c;
          for (int h = 0; h < datum.height(); ++h) {
            for (int w = 0; w < datum.width(); ++w) {
              *ptr = *(datum_buffer++);
              ptr += 3;
            }
          }
        }
      }
    }
  } else {
    // The input is a caffe2 format.
    TensorProtos protos;
    CAFFE_ENFORCE(protos.ParseFromString(value));
    const TensorProto& image_proto = protos.protos(0);
    const TensorProto& label_proto = protos.protos(1);
    vector<TensorProto> additional_output_protos;
    int start = additional_inputs_offset_;
    int end = start + additional_inputs_count_;
    for (int i = start; i < end; ++i) {
      additional_output_protos.push_back(protos.protos(i));
    }

    if (protos.protos_size() == end + 1) {
      // We have bounding box information
      const TensorProto& bounding_proto = protos.protos(end);
      DCHECK_EQ(bounding_proto.data_type(), TensorProto::INT32);
      DCHECK_EQ(bounding_proto.int32_data_size(), 4);
      info.bounding_params.valid = true;
      info.bounding_params.ymin = bounding_proto.int32_data(0);
      info.bounding_params.xmin = bounding_proto.int32_data(1);
      info.bounding_params.height = bounding_proto.int32_data(2);
      info.bounding_params.width = bounding_proto.int32_data(3);
    }

    if (image_proto.data_type() == TensorProto::STRING) {
      // encoded image string.
      DCHECK_EQ(image_proto.string_data_size(), 1);
      const string& encoded_image_str = image_proto.string_data(0);
      int encoded_size = encoded_image_str.size();
      // We use a cv::Mat to wrap the encoded str so we do not need a copy.
      src = cv::imdecode(
          cv::Mat(
              1,
              &encoded_size,
              CV_8UC1,
              const_cast<char*>(encoded_image_str.data())),
          color_ ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
    } else if (image_proto.data_type() == TensorProto::BYTE) {
      // raw image content.
      int src_c = (image_proto.dims_size() == 3) ? image_proto.dims(2) : 1;
      CAFFE_ENFORCE(src_c == 3 || src_c == 1);

      src.create(
          image_proto.dims(0),
          image_proto.dims(1),
          (src_c == 3) ? CV_8UC3 : CV_8UC1);
      memcpy(
          src.ptr<uchar>(0),
          image_proto.byte_data().data(),
          image_proto.byte_data().size());
    } else {
      LOG(FATAL) << "Unknown image data type.";
    }

    if (label_proto.data_type() == TensorProto::FLOAT) {
      if (label_type_ == SINGLE_LABEL ||
          label_type_ == SINGLE_LABEL_WEIGHTED) {
        DCHECK_EQ(label_proto.float_data_size(), 1);
        prefetched_label_.mutable_data<float>()[item_id] =
            label_proto.float_data(0);
      } else if (label_type_ == MULTI_LABEL_SPARSE) {
        float* label_data =
            prefetched_label_.mutable_data<float>() + item_id * num_labels_;
        memset(label_data, 0, sizeof(float) * num_labels_);
        for (int i = 0; i < label_proto.float_data_size(); ++i) {
          label_data[(int)label_proto.float_data(i)] = 1.0;
        }
      } else if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE) {
        const TensorProto& weight_proto = protos.protos(2);
        float* label_data =
            prefetched_label_.mutable_data<float>() + item_id * num_labels_;
        memset(label_data, 0, sizeof(float) * num_labels_);
        for (int i = 0; i < label_proto.float_data_size(); ++i) {
          label_data[(int)label_proto.float_data(i)] =
              weight_proto.float_data(i);
        }
      } else if (label_type_ == MULTI_LABEL_DENSE) {
        CAFFE_ENFORCE(label_proto.float_data_size() == num_labels_);
        float* label_data =
            prefetched_label_.mutable_data<float>() + item_id * num_labels_;
        for (int i = 0; i < label_proto.float_data_size(); ++i) {
          label_data[i] = label_proto.float_data(i);
        }
      } else {
        LOG(ERROR) << "Unknown label type:" << label_type_;
      }
    } else if (label_proto.data_type() == TensorProto::INT32) {
      if (label_type_ == SINGLE_LABEL ||
          label_type_ == SINGLE_LABEL_WEIGHTED) {
        DCHECK_EQ(label_proto.int32_data_size(), 1);
        prefetched_label_.mutable_data<int>()[item_id] =
            label_proto.int32_data(0);
      } else if (label_type_ == MULTI_LABEL_SPARSE) {
        int* label_data =
            prefetched_label_.mutable_data<int>() + item_id * num_labels_;
        memset(label_data, 0, sizeof(int) * num_labels_);
        for (int i = 0; i < label_proto.int32_data_size(); ++i) {
          label_data[label_proto.int32_data(i)] = 1;
        }
      } else if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE) {
        const TensorProto& weight_proto = protos.protos(2);
        float* label_data =
            prefetched_label_.mutable_data<float>() + item_id * num_labels_;
        memset(label_data, 0, sizeof(float) * num_labels_);
        for (int i = 0; i < label_proto.int32_data_size(); ++i) {
          label_data[label_proto.int32_data(i)] = weight_proto.float_data(i);
        }
      } else if (label_type_ == MULTI_LABEL_DENSE) {
        CAFFE_ENFORCE(label_proto.int32_data_size() == num_labels_);
        int* label_data =
            prefetched_label_.mutable_data<int>() + item_id * num_labels_;
        for (int i = 0; i < label_proto.int32_data_size(); ++i) {
          label_data[i] = label_proto.int32_data(i);
        }
      } else {
        LOG(ERROR) << "Unknown label type:" << label_type_;
      }
    } else {
      LOG(FATAL) << "Unsupported label data type.";
    }

    for (int i = 0; i < additional_output_protos.size(); ++i) {
      auto additional_output_proto = additional_output_protos[i];

      if (additional_output_proto.data_type() == TensorProto::FLOAT) {
        float* additional_output =
            prefetched_additional_outputs_[i].template mutable_data<float>() +
            item_id * additional_output_proto.float_data_size();

        for (int j = 0; j < additional_output_proto.float_data_size(); ++j) {
          additional_output[j] = additional_output_proto.float_data(j);
        }
      } else if (additional_output_proto.data_type() == TensorProto::INT32) {
        int* additional_output =
            prefetched_additional_outputs_[i].template mutable_data<int>() +
            item_id * additional_output_proto.int32_data_size();

        for (int j = 0; j < additional_output_proto.int32_data_size(); ++j) {
          additional_output[j] = additional_output_proto.int32_data(j);
        }
      } else if (additional_output_proto.data_type() == TensorProto::INT64) {
        int64_t* additional_output =
            prefetched_additional_outputs_[i]
                .template mutable_data<int64_t>() +
            item_id * additional_output_proto.int64_data_size();

        for (int j = 0; j < additional_output_proto.int64_data_size(); ++j) {
          additional_output[j] = additional_output_proto.int64_data(j);
        }
      } else {
        LOG(FATAL) << "Unsupported output type.";
      }
    }
  }

  //
  // convert source to the color format requested from Op
  //
  int out_c = color_ ? 3 : 1;
  if (out_c == src.channels()) {
    *img = src;
  } else {
    cv::cvtColor(src, *img, (out_c == 1) ? CV_BGR2GRAY : CV_GRAY2BGR);
  }

  // Note(Yangqing): I believe that the mat should be created continuous.
  CAFFE_ENFORCE(img->isContinuous());

  // Sanity check now that we decoded everything

  // Ensure that the bounding box is legit
  if (info.bounding_params.valid &&
      (src.rows < info.bounding_params.ymin + info.bounding_params.height ||
       src.cols < info.bounding_params.xmin + info.bounding_params.width)) {
    info.bounding_params.valid = false;
  }

  // Apply the bounding box if requested
  if (info.bounding_params.valid) {
    // If we reach here, we know the parameters are sane
    cv::Rect bounding_box(
        info.bounding_params.xmin,
        info.bounding_params.ymin,
        info.bounding_params.width,
        info.bounding_params.height);
    *img = (*img)(bounding_box);

    /*
    LOG(INFO) << "Did bounding with ymin:" << info.bounding_params.ymin
              << " xmin:" << info.bounding_params.xmin
              << " height:" << info.bounding_params.height
              << " width:" << info.bounding_params.width << "\n";
    LOG(INFO) << "Bounded matrix: " << img;
    */
  } else {
    // LOG(INFO) << "No bounding\n";
  }

  cv::Mat scaled_img;
  bool inception_scale_jitter = false;
  if (scale_jitter_type_ == INCEPTION_STYLE) {
    if (!is_test_) {
      // Inception-style scale jittering is only used for training
      inception_scale_jitter =
          RandomSizedCropping<Context>(img, crop_, randgen);
      // if a random crop is still not found, do simple random cropping later
    }
  }

  if ((scale_jitter_type_ == NO_SCALE_JITTER) ||
      (scale_jitter_type_ == INCEPTION_STYLE && !inception_scale_jitter)) {
    int scaled_width, scaled_height;
    int scale_to_use = scale_ > 0 ? scale_ : minsize_;

    // set the random minsize
    if (random_scaling_) {
      scale_to_use = std::uniform_int_distribution<>(
          random_scale_[0], random_scale_[1])(*randgen);
    }

    if (warp_) {
      scaled_width = scale_to_use;
      scaled_height = scale_to_use;
    } else if (img->rows > img->cols) {
      scaled_width = scale_to_use;
      scaled_height =
          static_cast<float>(img->rows) * scale_to_use / img->cols;
    } else {
      scaled_height = scale_to_use;
      scaled_width = static_cast<float>(img->cols) * scale_to_use / img->rows;
    }
    if ((scale_ > 0 &&
         (scaled_height != img->rows || scaled_width != img->cols)) ||
        (scaled_height > img->rows || scaled_width > img->cols)) {
      // We rescale in all cases if we are using scale_
      // but only to make the image bigger if using minsize_
      /*
      LOG(INFO) << "Scaling to " << scaled_width << " x " << scaled_height
                << " From " << img->cols << " x " << img->rows;
      */
      cv::resize(
          *img,
          scaled_img,
          cv::Size(scaled_width, scaled_height),
          0,
          0,
          cv::INTER_AREA);
      *img = scaled_img;
    }
  }

  // TODO(Yangqing): return false if any error happens.
  return true;
}
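// For reference, the caffe2-format record parsed above is a serialized
// TensorProtos message. A sketch of producing a compatible record with the
// generated protobuf API (the JPEG bytes and label value are placeholders):
//
//   TensorProtos protos;
//   TensorProto* image = protos.add_protos();
//   image->set_data_type(TensorProto::STRING);
//   image->add_string_data(encoded_jpeg);  // e.g. bytes from cv::imencode
//   TensorProto* label = protos.add_protos();
//   label->set_data_type(TensorProto::INT32);
//   label->add_int32_data(7);
//   std::string value;
//   protos.SerializeToString(&value);
//
// Optional extras follow in order: per-label weights at protos(2) for the
// *_WEIGHTED label types, then additional_inputs_count_ extra tensors, and
// finally an INT32 tensor [ymin, xmin, height, width] describing a bounding
// box.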
// assume HWC order and color channels BGR
template <class Context>
void Saturation(
    float* img,
    const int img_size,
    const float alpha_rand,
    std::mt19937* randgen) {
  float alpha = 1.0f +
      std::uniform_real_distribution<float>(-alpha_rand, alpha_rand)(*randgen);
  // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114
  int p = 0;
  for (int h = 0; h < img_size; ++h) {
    for (int w = 0; w < img_size; ++w) {
      float gray_color = img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f +
          img[3 * p + 2] * 0.299f;
      for (int c = 0; c < 3; ++c) {
        img[3 * p + c] = img[3 * p + c] * alpha + gray_color * (1.0f - alpha);
      }
      p++;
    }
  }
}

// assume HWC order and color channels BGR
template <class Context>
void Brightness(
    float* img,
    const int img_size,
    const float alpha_rand,
    std::mt19937* randgen) {
  float alpha = 1.0f +
      std::uniform_real_distribution<float>(-alpha_rand, alpha_rand)(*randgen);
  int p = 0;
  for (int h = 0; h < img_size; ++h) {
    for (int w = 0; w < img_size; ++w) {
      for (int c = 0; c < 3; ++c) {
        img[p++] *= alpha;
      }
    }
  }
}

// assume HWC order and color channels BGR
template <class Context>
void Contrast(
    float* img,
    const int img_size,
    const float alpha_rand,
    std::mt19937* randgen) {
  float gray_mean = 0;
  int p = 0;
  for (int h = 0; h < img_size; ++h) {
    for (int w = 0; w < img_size; ++w) {
      // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114
      gray_mean += img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f +
          img[3 * p + 2] * 0.299f;
      p++;
    }
  }
  gray_mean /= (img_size * img_size);

  float alpha = 1.0f +
      std::uniform_real_distribution<float>(-alpha_rand, alpha_rand)(*randgen);
  p = 0;
  for (int h = 0; h < img_size; ++h) {
    for (int w = 0; w < img_size; ++w) {
      for (int c = 0; c < 3; ++c) {
        img[p] = img[p] * alpha + gray_mean * (1.0f - alpha);
        p++;
      }
    }
  }
}

// assume HWC order and color channels BGR
template <class Context>
void ColorJitter(
    float* img,
    const int img_size,
    const float saturation,
    const float brightness,
    const float contrast,
    std::mt19937* randgen) {
  std::vector<int> jitter_order{0, 1, 2};
  // shuffle the transform order with a time-based seed:
  unsigned seed =
      std::chrono::system_clock::now().time_since_epoch().count();
  std::shuffle(
      jitter_order.begin(),
      jitter_order.end(),
      std::default_random_engine(seed));

  for (int i = 0; i < 3; ++i) {
    if (jitter_order[i] == 0) {
      Saturation<Context>(img, img_size, saturation, randgen);
    } else if (jitter_order[i] == 1) {
      Brightness<Context>(img, img_size, brightness, randgen);
    } else {
      Contrast<Context>(img, img_size, contrast, randgen);
    }
  }
}

// assume HWC order and color channels BGR
template <class Context>
void ColorLighting(
    float* img,
    const int img_size,
    const float alpha_std,
    const std::vector<std::vector<float>>& eigvecs,
    const std::vector<float>& eigvals,
    std::mt19937* randgen) {
  std::normal_distribution<float> d(0, alpha_std);
  std::vector<float> alphas(3);
  for (int i = 0; i < 3; ++i) {
    alphas[i] = d(*randgen);
  }

  std::vector<float> delta_rgb(3, 0.0);
  for (int i = 0; i < 3; ++i) {
    for (int j = 0; j < 3; ++j) {
      delta_rgb[i] += eigvecs[i][j] * eigvals[j] * alphas[j];
    }
  }

  int p = 0;
  for (int h = 0; h < img_size; ++h) {
    for (int w = 0; w < img_size; ++w) {
      for (int c = 0; c < 3; ++c) {
        img[p++] += delta_rgb[2 - c];
      }
    }
  }
}
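// ColorLighting above is AlexNet-style PCA color augmentation: with
// per-component draws alpha_j ~ N(0, alpha_std), the color shift is
//   delta_rgb[i] = sum_j eigvecs[i][j] * eigvals[j] * alphas[j]
// i.e. delta = E * (lambda .* alpha) for eigenvectors E and eigenvalues
// lambda of the RGB pixel covariance. The hard-coded eigen data in the
// constructor is in RGB order while the image is BGR, which is why the
// apply loop indexes delta_rgb[2 - c].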
// assume HWC order and color channels BGR
// mean subtraction and scaling.
template <class Context>
void ColorNormalization(
    float* img,
    const int img_size,
    const int channels,
    const std::vector<float>& mean,
    const std::vector<float>& std) {
  int p = 0;
  for (int h = 0; h < img_size; ++h) {
    for (int w = 0; w < img_size; ++w) {
      for (int c = 0; c < channels; ++c) {
        img[p] = (img[p] - mean[c]) * std[c];
        p++;
      }
    }
  }
}

// Factored out image transformation
template <class Context>
void TransformImage(
    const cv::Mat& scaled_img,
    const int channels,
    float* image_data,
    const bool color_jitter,
    const float saturation,
    const float brightness,
    const float contrast,
    const bool color_lighting,
    const float color_lighting_std,
    const std::vector<std::vector<float>>& color_lighting_eigvecs,
    const std::vector<float>& color_lighting_eigvals,
    const int crop,
    const bool mirror,
    const std::vector<float>& mean,
    const std::vector<float>& std,
    std::mt19937* randgen,
    std::bernoulli_distribution* mirror_this_image,
    bool is_test = false) {
  CAFFE_ENFORCE_GE(
      scaled_img.rows, crop, "Image height must be bigger than crop.");
  CAFFE_ENFORCE_GE(
      scaled_img.cols, crop, "Image width must be bigger than crop.");

  // find the cropped region, and copy it to the destination matrix
  int width_offset, height_offset;
  if (is_test) {
    width_offset = (scaled_img.cols - crop) / 2;
    height_offset = (scaled_img.rows - crop) / 2;
  } else {
    width_offset =
        std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen);
    height_offset =
        std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen);
  }

  float* image_data_ptr = image_data;
  if (!is_test && mirror && (*mirror_this_image)(*randgen)) {
    // Copy mirrored image.
    for (int h = height_offset; h < height_offset + crop; ++h) {
      for (int w = width_offset + crop - 1; w >= width_offset; --w) {
        const uint8_t* cv_data = scaled_img.ptr(h) + w * channels;
        for (int c = 0; c < channels; ++c) {
          *(image_data_ptr++) = static_cast<float>(cv_data[c]);
        }
      }
    }
  } else {
    // Copy normally.
    for (int h = height_offset; h < height_offset + crop; ++h) {
      for (int w = width_offset; w < width_offset + crop; ++w) {
        const uint8_t* cv_data = scaled_img.ptr(h) + w * channels;
        for (int c = 0; c < channels; ++c) {
          *(image_data_ptr++) = static_cast<float>(cv_data[c]);
        }
      }
    }
  }

  if (color_jitter && channels == 3 && !is_test) {
    ColorJitter<Context>(
        image_data, crop, saturation, brightness, contrast, randgen);
  }
  if (color_lighting && channels == 3 && !is_test) {
    ColorLighting<Context>(
        image_data,
        crop,
        color_lighting_std,
        color_lighting_eigvecs,
        color_lighting_eigvals,
        randgen);
  }

  // Color normalization
  // Mean subtraction and scaling.
  ColorNormalization<Context>(image_data, crop, channels, mean, std);
}
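// A standalone sketch of calling TransformImage directly (assumes an input
// image at least 224 pixels on each side; the path and the mean/std values
// are placeholders):
//
//   cv::Mat img = cv::imread("example.jpg", CV_LOAD_IMAGE_COLOR);
//   std::vector<float> out(224 * 224 * 3);
//   std::vector<float> mean{104.f, 117.f, 123.f};
//   std::vector<float> inv_std{1.f, 1.f, 1.f};  // inverse std, as above
//   std::mt19937 randgen(1234);
//   std::bernoulli_distribution mirror_coin(0.5);
//   TransformImage<CPUContext>(
//       img, 3, out.data(),
//       /*color_jitter=*/false, 0.4f, 0.4f, 0.4f,
//       /*color_lighting=*/false, 0.1f, {}, {},
//       /*crop=*/224, /*mirror=*/true, mean, inv_std,
//       &randgen, &mirror_coin, /*is_test=*/false);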
// Only crop / transpose the image
// leave in uint8_t dataType
template <class Context>
void CropTransposeImage(
    const cv::Mat& scaled_img,
    const int channels,
    uint8_t* cropped_data,
    const int crop,
    const bool mirror,
    std::mt19937* randgen,
    std::bernoulli_distribution* mirror_this_image,
    bool is_test = false) {
  CAFFE_ENFORCE_GE(
      scaled_img.rows, crop, "Image height must be bigger than crop.");
  CAFFE_ENFORCE_GE(
      scaled_img.cols, crop, "Image width must be bigger than crop.");

  // find the cropped region, and copy it to the destination matrix
  int width_offset, height_offset;
  if (is_test) {
    width_offset = (scaled_img.cols - crop) / 2;
    height_offset = (scaled_img.rows - crop) / 2;
  } else {
    width_offset =
        std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen);
    height_offset =
        std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen);
  }

  if (mirror && (*mirror_this_image)(*randgen)) {
    // Copy mirrored image.
    for (int h = height_offset; h < height_offset + crop; ++h) {
      for (int w = width_offset + crop - 1; w >= width_offset; --w) {
        const uint8_t* cv_data = scaled_img.ptr(h) + w * channels;
        for (int c = 0; c < channels; ++c) {
          *(cropped_data++) = cv_data[c];
        }
      }
    }
  } else {
    // Copy normally.
    for (int h = height_offset; h < height_offset + crop; ++h) {
      for (int w = width_offset; w < width_offset + crop; ++w) {
        const uint8_t* cv_data = scaled_img.ptr(h) + w * channels;
        for (int c = 0; c < channels; ++c) {
          *(cropped_data++) = cv_data[c];
        }
      }
    }
  }
}

// Parse datum, decode image, perform transform
// Intended as entry point for binding to thread pool
template <class Context>
void ImageInputOp<Context>::DecodeAndTransform(
    const std::string& value,
    float* image_data,
    int item_id,
    const int channels,
    std::size_t thread_index) {
  CAFFE_ENFORCE((int)thread_index < num_decode_threads_);

  std::bernoulli_distribution mirror_this_image(0.5f);
  std::mt19937* randgen = &(randgen_per_thread_[thread_index]);

  cv::Mat img;
  // Decode the image
  PerImageArg info;
  CHECK(
      GetImageAndLabelAndInfoFromDBValue(value, &img, info, item_id, randgen));

  // Factor out the image transformation
  TransformImage<Context>(
      img,
      channels,
      image_data,
      color_jitter_,
      img_saturation_,
      img_brightness_,
      img_contrast_,
      color_lighting_,
      color_lighting_std_,
      color_lighting_eigvecs_,
      color_lighting_eigvals_,
      crop_,
      mirror_,
      mean_,
      std_,
      randgen,
      &mirror_this_image,
      is_test_);
}

template <class Context>
void ImageInputOp<Context>::DecodeAndTransposeOnly(
    const std::string& value,
    uint8_t* image_data,
    int item_id,
    const int channels,
    std::size_t thread_index) {
  CAFFE_ENFORCE((int)thread_index < num_decode_threads_);

  std::bernoulli_distribution mirror_this_image(0.5f);
  std::mt19937* randgen = &(randgen_per_thread_[thread_index]);

  cv::Mat img;
  // Decode the image
  PerImageArg info;
  CHECK(
      GetImageAndLabelAndInfoFromDBValue(value, &img, info, item_id, randgen));

  // Factor out the image transformation
  CropTransposeImage<Context>(
      img,
      channels,
      image_data,
      crop_,
      mirror_,
      randgen,
      &mirror_this_image,
      is_test_);
}
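// The two Decode* methods above are bound into the thread pool by Prefetch()
// below via std::bind, with std::placeholders::_1 left open for the worker
// thread index that TaskThreadPool::runTaskWithID supplies. An equivalent
// lambda form of the dispatch (a sketch):
//
//   thread_pool_->runTaskWithID(
//       [this, value, image_data, item_id, channels](std::size_t tid) {
//         DecodeAndTransform(value, image_data, item_id, channels, tid);
//       });
//
// The thread index picks a dedicated std::mt19937 out of
// randgen_per_thread_, so decode workers never contend on a shared random
// generator.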
template <class Context>
bool ImageInputOp<Context>::Prefetch() {
  if (!owned_reader_.get()) {
    // if we are not owning the reader, we will get the reader pointer from
    // input. Otherwise the constructor should have already set the reader
    // pointer.
    reader_ = &OperatorBase::Input<db::DBReader>(0);
  }
  const int channels = color_ ? 3 : 1;

  // Call mutable_data() once to allocate the underlying memory.
  if (gpu_transform_) {
    // we'll transfer up in int8, then convert later
    prefetched_image_.mutable_data<uint8_t>();
  } else {
    prefetched_image_.mutable_data<float>();
  }
  prefetched_label_.mutable_data<int>();

  // Prefetching handled with a thread pool of "decode_threads" threads.
  for (int item_id = 0; item_id < batch_size_; ++item_id) {
    std::string key, value;
    cv::Mat img;

    // read data
    reader_->Read(&key, &value);

    // determine label type based on first item
    if (item_id == 0) {
      if (use_caffe_datum_) {
        prefetched_label_.mutable_data<int>();
      } else {
        TensorProtos protos;
        CAFFE_ENFORCE(protos.ParseFromString(value));
        TensorProto_DataType labeldt = protos.protos(1).data_type();
        if (labeldt == TensorProto::INT32) {
          prefetched_label_.mutable_data<int>();
        } else if (labeldt == TensorProto::FLOAT) {
          prefetched_label_.mutable_data<float>();
        } else {
          LOG(FATAL) << "Unsupported label type.";
        }

        for (int i = 0; i < additional_inputs_count_; ++i) {
          int index = additional_inputs_offset_ + i;
          TensorProto additional_output_proto = protos.protos(index);

          if (additional_output_proto.data_type() == TensorProto::FLOAT) {
            prefetched_additional_outputs_[i].template mutable_data<float>();
          } else if (
              additional_output_proto.data_type() == TensorProto::INT32) {
            prefetched_additional_outputs_[i].template mutable_data<int>();
          } else if (
              additional_output_proto.data_type() == TensorProto::INT64) {
            prefetched_additional_outputs_[i]
                .template mutable_data<int64_t>();
          } else {
            LOG(FATAL) << "Unsupported output type.";
          }
        }
      }
    }

    // launch into thread pool for processing
    // TODO: support color jitter and color lighting in gpu_transform
    if (gpu_transform_) {
      // output of decode will still be int8
      uint8_t* image_data = prefetched_image_.mutable_data<uint8_t>() +
          crop_ * crop_ * channels * item_id;
      thread_pool_->runTaskWithID(std::bind(
          &ImageInputOp<Context>::DecodeAndTransposeOnly,
          this,
          std::string(value),
          image_data,
          item_id,
          channels,
          std::placeholders::_1));
    } else {
      float* image_data = prefetched_image_.mutable_data<float>() +
          crop_ * crop_ * channels * item_id;
      thread_pool_->runTaskWithID(std::bind(
          &ImageInputOp<Context>::DecodeAndTransform,
          this,
          std::string(value),
          image_data,
          item_id,
          channels,
          std::placeholders::_1));
    }
  }
  thread_pool_->waitWorkComplete();

  // If the context is not CPUContext, we will need to do a copy in the
  // prefetch function as well.
  if (!std::is_same<Context, CPUContext>::value) {
    prefetched_image_on_device_.CopyFrom(prefetched_image_, &context_);
    prefetched_label_on_device_.CopyFrom(prefetched_label_, &context_);

    for (int i = 0; i < prefetched_additional_outputs_on_device_.size();
         ++i) {
      prefetched_additional_outputs_on_device_[i].CopyFrom(
          prefetched_additional_outputs_[i], &context_);
    }
  }
  return true;
}
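// The GPU path in CopyPrefetched() below keeps the prefetched batch in uint8
// and defers mean/std normalization to TransformOnGPU; the "output_type"
// argument then selects the result type. A sketch of the relevant operator
// arguments (names as read by this operator; values are examples):
//
//   MakeArgument<int>("use_gpu_transform", 1),
//   MakeArgument<string>("output_type", "FLOAT16"),  // or "FLOAT"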
template <class Context>
bool ImageInputOp<Context>::CopyPrefetched() {
  auto* image_output = OperatorBase::Output<Tensor<Context>>(0);
  auto* label_output = OperatorBase::Output<Tensor<Context>>(1);
  vector<Tensor<Context>*> additional_outputs_output;

  for (int i = 2; i < OutputSize(); ++i) {
    additional_outputs_output.push_back(
        OperatorBase::Output<Tensor<Context>>(i));
  }

  // Note(jiayq): The if statement below should be optimized away by the
  // compiler since std::is_same is a constexpr.
  if (std::is_same<Context, CPUContext>::value) {
    image_output->CopyFrom(prefetched_image_, &context_);
    label_output->CopyFrom(prefetched_label_, &context_);

    for (int i = 0; i < additional_outputs_output.size(); ++i) {
      additional_outputs_output[i]->CopyFrom(
          prefetched_additional_outputs_[i], &context_);
    }
  } else {
    // TODO: support color jitter and color lighting in gpu_transform
    if (gpu_transform_) {
      if (!mean_std_copied_) {
        mean_gpu_.Resize(mean_.size());
        std_gpu_.Resize(std_.size());

        context_.template Copy<float, CPUContext, Context>(
            mean_.size(),
            mean_.data(),
            mean_gpu_.template mutable_data<float>());
        context_.template Copy<float, CPUContext, Context>(
            std_.size(),
            std_.data(),
            std_gpu_.template mutable_data<float>());
        mean_std_copied_ = true;
      }
      // GPU transform kernel allows explicitly setting output type
      if (output_type_ == TensorProto_DataType_FLOAT) {
        TransformOnGPU<uint8_t, float, Context>(
            prefetched_image_on_device_,
            image_output,
            mean_gpu_,
            std_gpu_,
            &context_);
      } else if (output_type_ == TensorProto_DataType_FLOAT16) {
        TransformOnGPU<uint8_t, float16, Context>(
            prefetched_image_on_device_,
            image_output,
            mean_gpu_,
            std_gpu_,
            &context_);
      } else {
        return false;
      }
    } else {
      image_output->CopyFrom(prefetched_image_on_device_, &context_);
    }
    label_output->CopyFrom(prefetched_label_on_device_, &context_);

    for (int i = 0; i < additional_outputs_output.size(); ++i) {
      additional_outputs_output[i]->CopyFrom(
          prefetched_additional_outputs_on_device_[i], &context_);
    }
  }
  return true;
}

} // namespace caffe2

#endif // CAFFE2_IMAGE_IMAGE_INPUT_OP_H_