10 files changed, 1399 insertions, 550 deletions
diff --git a/caffe2/operators/roi_align_gradient_op.cc b/caffe2/operators/roi_align_gradient_op.cc
new file mode 100644
index 0000000000..1cc4103a53
--- /dev/null
+++ b/caffe2/operators/roi_align_gradient_op.cc
@@ -0,0 +1,255 @@
+#include "roi_align_gradient_op.h"
+
+#include "caffe2/utils/eigen_utils.h"
+#include "caffe2/utils/math.h"
+
+namespace caffe2 {
+namespace {
+
+template <typename T>
+void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high,
+    const int /*index*/ /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y <= 0) {
+    y = 0;
+  }
+  if (x <= 0) {
+    x = 0;
+  }
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = bottom_data[y_low * width + x_low];
+  // T v2 = bottom_data[y_low * width + x_high];
+  // T v3 = bottom_data[y_high * width + x_low];
+  // T v4 = bottom_data[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+template <class T>
+inline void add(const T& val, T* address) {
+  *address += val;
+}
+
+template <typename T>
+void ROIAlignBackwardFeature(
+    const int nthreads,
+    const T* top_diff,
+    const int /*num_rois*/,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* bottom_diff,
+    const T* bottom_rois,
+    int rois_cols) {
+  DCHECK(rois_cols == 4 || rois_cols == 5);
+
+  for (int index = 0; index < nthreads; index++) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * rois_cols;
+    int roi_batch_ind = 0;
+    if (rois_cols == 5) {
+      roi_batch_ind = offset_bottom_rois[0];
+      offset_bottom_rois++;
+    }
+
+    // Do not using rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+    // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
+    // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
+    // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
+    // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_bottom_diff =
+        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+    int top_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_top_diff = top_diff + top_offset;
+    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height,
+            width,
+            y,
+            x,
+            w1,
+            w2,
+            w3,
+            w4,
+            x_low,
+            x_high,
+            y_low,
+            y_high,
+            index);
+
+        T g1 = top_diff_this_bin * w1 / count;
+        T g2 = top_diff_this_bin * w2 / count;
+        T g3 = top_diff_this_bin * w3 / count;
+        T g4 = top_diff_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          // atomic add is not needed for now since it is single threaded
+          add(static_cast<T>(g1), offset_bottom_diff + y_low * width + x_low);
+          add(static_cast<T>(g2), offset_bottom_diff + y_low * width + x_high);
+          add(static_cast<T>(g3), offset_bottom_diff + y_high * width + x_low);
+          add(static_cast<T>(g4), offset_bottom_diff + y_high * width + x_high);
+        } // if
+      } // ix
+    } // iy
+  } // for
+} // ROIAlignBackward
+
+} // namespace
+
+template <>
+bool RoIAlignGradientOp<float, CPUContext>::RunOnDevice() {
+  auto& X = Input(0); // Input data to pool
+  auto& R = Input(1); // RoIs
+  auto& dY = Input(2); // Gradient of net w.r.t. output of "forward" op
+                       // (aka "gradOutput")
+  auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
+                        // (aka "gradInput")
+
+  CAFFE_ENFORCE_EQ(R.ndim(), 2);
+  // if R has 5 columns, the first column is the index, otherwise 0
+  CAFFE_ENFORCE(R.dim32(1) == 4 || R.dim32(1) == 5);
+
+  dX->ResizeLike(X);
+
+  // Must zero-out dX before accumulating gradients
+  // (TODO): Kaiming - is this safe?
+  math::Set<float, CPUContext>(
+      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
+
+  if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois
+    ROIAlignBackwardFeature<float>(
+        dY.size(),
+        dY.data<float>(),
+        R.dim32(0),
+        spatial_scale_,
+        X.dim32(1),
+        X.dim32(2),
+        X.dim32(3),
+        pooled_height_,
+        pooled_width_,
+        sampling_ratio_,
+        dX->mutable_data<float>(),
+        R.data<float>(),
+        R.dim32(1));
+  }
+  return true;
+}
+
+REGISTER_CPU_OPERATOR(RoIAlignGradient, RoIAlignGradientOp<float, CPUContext>);
+
+// Input: X, rois, dY (aka "gradOutput");
+// Output: dX (aka "gradInput")
+OPERATOR_SCHEMA(RoIAlignGradient)
+    .NumInputs(3)
+    .NumOutputs(1)
+    .Input(0, "X", "See RoIPoolF.")
+    .Input(1, "RoIs", "See RoIPoolF.")
+    .Input(2, "dY", "Gradient of forward output 0 (Y)")
+    .Output(0, "dX", "Gradient of forward input 0 (X)");
+
+namespace {
+
+class GetRoIAlignGradient : public GradientMakerBase {
+  using GradientMakerBase::GradientMakerBase;
+  vector<OperatorDef> GetGradientDefs() override {
+    return SingleGradientDef(
+        "RoIAlignGradient",
+        "",
+        vector<string>{I(0), I(1), GO(0)},
+        vector<string>{GI(0)});
+  }
+};
+
+} // namespace
+
+REGISTER_GRADIENT(RoIAlign, GetRoIAlignGradient);
+
+} // namespace caffe2
diff --git a/caffe2/operators/roi_align_gradient_op.cu b/caffe2/operators/roi_align_gradient_op.cu
new file mode 100644
index 0000000000..702b8c7102
--- /dev/null
+++ b/caffe2/operators/roi_align_gradient_op.cu
@@ -0,0 +1,231 @@
+#include "roi_align_gradient_op.h"
+
+#include <stdio.h>
+#include <cfloat>
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/utils/math.h"
+
+namespace caffe2 {
+
+namespace {
+
+template <typename T>
+inline __device__ T gpu_atomic_add(const T val, T* address);
+
+template <>
+inline __device__ float gpu_atomic_add(const float val, float* address) {
+  return atomicAdd(address, val);
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high,
+    const int index /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y <= 0) {
+    y = 0;
+  }
+  if (x <= 0) {
+    x = 0;
+  }
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = bottom_data[y_low * width + x_low];
+  // T v2 = bottom_data[y_low * width + x_high];
+  // T v3 = bottom_data[y_high * width + x_low];
+  // T v4 = bottom_data[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+template <typename T>
+__global__ void RoIAlignBackwardFeature(
+    const int nthreads,
+    const T* top_diff,
+    const int num_rois,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* bottom_diff,
+    const T* bottom_rois) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not using rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_bottom_diff =
+        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+    int top_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_top_diff = top_diff + top_offset;
+    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height,
+            width,
+            y,
+            x,
+            w1,
+            w2,
+            w3,
+            w4,
+            x_low,
+            x_high,
+            y_low,
+            y_high,
+            index);
+
+        T g1 = top_diff_this_bin * w1 / count;
+        T g2 = top_diff_this_bin * w2 / count;
+        T g3 = top_diff_this_bin * w3 / count;
+        T g4 = top_diff_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          gpu_atomic_add(
+              static_cast<T>(g1), offset_bottom_diff + y_low * width + x_low);
+          gpu_atomic_add(
+              static_cast<T>(g2), offset_bottom_diff + y_low * width + x_high);
+          gpu_atomic_add(
+              static_cast<T>(g3), offset_bottom_diff + y_high * width + x_low);
+          gpu_atomic_add(
+              static_cast<T>(g4), offset_bottom_diff + y_high * width + x_high);
+        } // if
+      } // ix
+    } // iy
+  } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignBackward
+
+} // namespace
+
+template <>
+bool RoIAlignGradientOp<float, CUDAContext>::RunOnDevice() {
+  auto& X = Input(0); // Input data to pool
+  auto& R = Input(1); // RoIs
+  auto& dY = Input(2); // Gradient of net w.r.t. output of "forward" op
+                       // (aka "gradOutput")
+  auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
+                        // (aka "gradInput")
+
+  dX->ResizeLike(X);
+
+  // Must zero-out dX before accumulating gradients
+  // (TODO): Kaiming - is this safe?
+  math::Set<float, CUDAContext>(
+      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
+
+  if (dY.size() > 0) { // Handle possibly empty gradient if there were no rois
+    RoIAlignBackwardFeature<float>
+        <<<CAFFE_GET_BLOCKS(dY.size()),
+           CAFFE_CUDA_NUM_THREADS,
+           0,
+           context_.cuda_stream()>>>(
+            dY.size(),
+            dY.data<float>(),
+            R.dim32(0),
+            spatial_scale_,
+            X.dim32(1),
+            X.dim32(2),
+            X.dim32(3),
+            pooled_height_,
+            pooled_width_,
+            sampling_ratio_,
+            dX->mutable_data<float>(),
+            R.data<float>());
+  }
+  return true;
+}
+
+REGISTER_CUDA_OPERATOR(
+    RoIAlignGradient,
+    RoIAlignGradientOp<float, CUDAContext>);
+} // namespace caffe2
diff --git a/caffe2/operators/roi_align_gradient_op.h b/caffe2/operators/roi_align_gradient_op.h
new file mode 100644
index 0000000000..509825fbbf
--- /dev/null
+++ b/caffe2/operators/roi_align_gradient_op.h
@@ -0,0 +1,43 @@
+// Copyright 2004-present Facebook. All Rights Reserved.
+
+#ifndef ROI_ALIGN_OP_H_
+#define ROI_ALIGN_OP_H_
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/logging.h"
+#include "caffe2/core/operator.h"
+
+namespace caffe2 {
+
+template <typename T, class Context>
+class RoIAlignGradientOp final : public Operator<Context> {
+ public:
+  RoIAlignGradientOp(const OperatorDef& def, Workspace* ws)
+      : Operator<Context>(def, ws),
+        spatial_scale_(
+            OperatorBase::GetSingleArgument<float>("spatial_scale", 1.)),
+        pooled_height_(OperatorBase::GetSingleArgument<int>("pooled_h", 1)),
+        pooled_width_(OperatorBase::GetSingleArgument<int>("pooled_w", 1)),
+        sampling_ratio_(
+            OperatorBase::GetSingleArgument<int>("sampling_ratio", -1)) {
+    DCHECK_GT(spatial_scale_, 0);
+    DCHECK_GT(pooled_height_, 0);
+    DCHECK_GT(pooled_width_, 0);
+    DCHECK_GE(sampling_ratio_, 0);
+  }
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+
+  bool RunOnDevice() override {
+    CAFFE_NOT_IMPLEMENTED;
+  }
+
+ protected:
+  float spatial_scale_;
+  int pooled_height_;
+  int pooled_width_;
+  int sampling_ratio_;
+};
+
+} // namespace caffe2
+
+#endif // ROI_ALIGN_OP_H_
diff --git a/caffe2/operators/roi_align_op.cc b/caffe2/operators/roi_align_op.cc
new file mode 100644
index 0000000000..8f5d12ea2b
--- /dev/null
+++ b/caffe2/operators/roi_align_op.cc
@@ -0,0 +1,376 @@
+#include "roi_align_op.h"
+
+#include "caffe2/utils/eigen_utils.h"
+#include "caffe2/utils/math.h"
+
+#ifdef CAFFE2_USE_MKL
+#include "caffe2/mkl/operators/operator_fallback_mkl.h"
+#endif // CAFFE2_USE_MKL
+
+namespace caffe2 {
+namespace {
+
+template <typename T>
+struct PreCalc {
+  int pos1;
+  int pos2;
+  int pos3;
+  int pos4;
+  T w1;
+  T w2;
+  T w3;
+  T w4;
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int iy_upper,
+    const int ix_upper,
+    T roi_start_h,
+    T roi_start_w,
+    T bin_size_h,
+    T bin_size_w,
+    int roi_bin_grid_h,
+    int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc) {
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        const T yy = roi_start_h + ph * bin_size_h +
+            static_cast<T>(iy + .5f) * bin_size_h /
+                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+        for (int ix = 0; ix < ix_upper; ix++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+              static_cast<T>(ix + .5f) * bin_size_w /
+                  static_cast<T>(roi_bin_grid_w);
+
+          T x = xx;
+          T y = yy;
+          // deal with: inverse elements are out of feature map boundary
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc[pre_calc_index] = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = (int)y;
+          int x_low = (int)x;
+          int y_high;
+          int x_high;
+
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indeces
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc[pre_calc_index] = pc;
+
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void ROIAlignForward(
+    const int nthreads,
+    const T* bottom_data,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* bottom_rois,
+    int roi_cols,
+    T* top_data,
+    StorageOrder order) {
+  DCHECK(roi_cols == 4 || roi_cols == 5);
+
+  int n_rois = nthreads / channels / pooled_width / pooled_height;
+  // (n, c, ph, pw) is an element in the pooled output
+  // can be parallelized using omp
+  // #pragma omp parallel for num_threads(32)
+  for (int n = 0; n < n_rois; n++) {
+    int index_n = n * channels * pooled_width * pooled_height;
+
+    // roi could have 4 or 5 columns
+    const T* offset_bottom_rois = bottom_rois + n * roi_cols;
+    int roi_batch_ind = 0;
+    if (roi_cols == 5) {
+      roi_batch_ind = offset_bottom_rois[0];
+      offset_bottom_rois++;
+    }
+
+    // Do not using rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+    // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
+    // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
+    // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
+    // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    // we want to precalculate indeces and weights shared by all chanels,
+    // this is the key point of optimiation
+    std::vector<PreCalc<T>> pre_calc(
+        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+    pre_calc_for_bilinear_interpolate(
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_start_h,
+        roi_start_w,
+        bin_size_h,
+        bin_size_w,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        pre_calc);
+
+    if (order == StorageOrder::NCHW) {
+      for (int c = 0; c < channels; c++) {
+        int index_n_c = index_n + c * pooled_width * pooled_height;
+        const T* offset_bottom_data =
+            bottom_data + (roi_batch_ind * channels + c) * height * width;
+        int pre_calc_index = 0;
+
+        for (int ph = 0; ph < pooled_height; ph++) {
+          for (int pw = 0; pw < pooled_width; pw++) {
+            int index = index_n_c + ph * pooled_width + pw;
+
+            T output_val = 0.;
+            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+                PreCalc<T> pc = pre_calc[pre_calc_index];
+                output_val += pc.w1 * offset_bottom_data[pc.pos1] +
+                    pc.w2 * offset_bottom_data[pc.pos2] +
+                    pc.w3 * offset_bottom_data[pc.pos3] +
+                    pc.w4 * offset_bottom_data[pc.pos4];
+
+                pre_calc_index += 1;
+              }
+            }
+            output_val /= count;
+
+            top_data[index] = output_val;
+          } // for pw
+        } // for ph
+      } // for c
+    } // if nchw
+
+    if (order == StorageOrder::NHWC) {
+      const T* offset_bottom_data =
+          bottom_data + roi_batch_ind * channels * height * width;
+      int pre_calc_index = 0;
+
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          EVecXf output_vals = EVecXf::Zero(channels);
+
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              PreCalc<T> pc = pre_calc[pre_calc_index];
+
+              ConstEigenVectorMap<T> data_1(
+                  offset_bottom_data + channels * pc.pos1, channels);
+              ConstEigenVectorMap<T> data_2(
+                  offset_bottom_data + channels * pc.pos2, channels);
+              ConstEigenVectorMap<T> data_3(
+                  offset_bottom_data + channels * pc.pos3, channels);
+              ConstEigenVectorMap<T> data_4(
+                  offset_bottom_data + channels * pc.pos4, channels);
+
+              output_vals += pc.w1 * data_1 + pc.w2 * data_2 + pc.w3 * data_3 +
+                  pc.w4 * data_4;
+
+              pre_calc_index += 1;
+            }
+          }
+          output_vals /= count;
+
+          int index_nhw = index_n + (ph * pooled_width + pw) * channels;
+          std::memcpy(
+              top_data + index_nhw, output_vals.data(), channels * sizeof(T));
+        } // for pw
+      } // for ph
+    } // if nhwc
+
+  } // for n
+}
+
+} // namespace
+
+template <>
+bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
+  auto& X = Input(0); // Input data to pool, NCHW
+  auto& R = Input(1); // RoIs
+  auto* Y = Output(0); // RoI pooled data
+
+  if (R.size() == 0) {
+    // Handle empty rois
+    if (order_ == StorageOrder::NCHW) {
+      Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
+    } else if (order_ == StorageOrder::NHWC) {
+      Y->Resize(0, pooled_height_, pooled_width_, X.dim32(3));
+    }
+    // The following mutable_data calls are needed to allocate the tensors
+    Y->mutable_data<float>();
+    return true;
+  }
+
+  CAFFE_ENFORCE_EQ(R.ndim(), 2);
+  // if R has 5 columns, the first column is the index, otherwise 0
+  CAFFE_ENFORCE(R.dim32(1) == 4 || R.dim32(1) == 5);
+
+  assert(sampling_ratio_ >= 0);
+
+  if (order_ == StorageOrder::NCHW) {
+    Y->Resize(R.dim32(0), X.dim32(1), pooled_height_, pooled_width_);
+    int output_size = Y->size();
+    ROIAlignForward<float>(
+        output_size,
+        X.data<float>(),
+        spatial_scale_,
+        X.dim32(1),
+        X.dim32(2),
+        X.dim32(3),
+        pooled_height_,
+        pooled_width_,
+        sampling_ratio_,
+        R.data<float>(),
+        R.dim32(1),
+        Y->mutable_data<float>(),
+        order_);
+  } else if (order_ == StorageOrder::NHWC) {
+    Y->Resize(R.dim32(0), pooled_height_, pooled_width_, X.dim32(3));
+    int output_size = Y->size();
+    ROIAlignForward<float>(
+        output_size,
+        X.data<float>(),
+        spatial_scale_,
+        X.dim32(3),
+        X.dim32(1),
+        X.dim32(2),
+        pooled_height_,
+        pooled_width_,
+        sampling_ratio_,
+        R.data<float>(),
+        R.dim32(1),
+        Y->mutable_data<float>(),
+        order_);
+  }
+
+  return true;
+}
+
+REGISTER_CPU_OPERATOR(RoIAlign, RoIAlignOp<float, CPUContext>);
+
+#ifdef CAFFE2_HAS_MKL_DNN
+REGISTER_MKL_OPERATOR(
+    RoIAlign,
+    mkl::MKLFallbackOp<RoIAlignOp<float, CPUContext>>);
+#endif // CAFFE2_HAS_MKL_DNN
+
+// Input: X, rois; Output: Y
+OPERATOR_SCHEMA(RoIAlign)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .SetDoc(R"DOC(
+Region of Interest (RoI) align operation as used in Mask R-CNN.
+)DOC")
+    .Arg(
+        "spatial_scale",
+        "(float) default 1.0; Spatial scale of the input feature map X "
+        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
+        "w.r.t. the input image.")
+    .Arg("pooled_h", "(int) default 1; Pooled output Y's height.")
+    .Arg("pooled_w", "(int) default 1; Pooled output Y's width.")
+    .Arg(
+        "sampling_ratio",
+        "(int) default -1; number of sampling points in the interpolation grid "
+        "used to compute the output value of each pooled output bin. If > 0, "
+        "then exactly sampling_ratio x sampling_ratio grid points are used. If "
+        "<= 0, then an adaptive number of grid points are used (computed as "
+        "ceil(roi_width / pooled_w), and likewise for height).")
+    .Input(0, "X", "4D feature map input of shape (N, C, H, W).")
+    .Input(
+        1,
+        "RoIs",
+        "2D input of shape (R, 5) specifying R RoIs with five columns "
+        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
+        "coordinates are in the coordinate system of the input image.")
+    .Output(
+        0,
+        "Y",
+        "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element "
+        "is a pooled feature map cooresponding to the r-th RoI.");
+
+} // namespace caffe2
diff --git a/caffe2/operators/roi_align_op.cu b/caffe2/operators/roi_align_op.cu
new file mode 100644
index 0000000000..29676f31d6
--- /dev/null
+++ b/caffe2/operators/roi_align_op.cu
@@ -0,0 +1,181 @@
+#include "roi_align_op.h"
+
+#include <stdio.h>
+#include <cfloat>
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/utils/math.h"
+
+namespace caffe2 {
+
+namespace {
+
+template <typename T>
+__device__ T bilinear_interpolate(
+    const T* bottom_data,
+    const int height,
+    const int width,
+    T y,
+    T x,
+    const int index /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    return 0;
+  }
+
+  if (y <= 0) {
+    y = 0;
+  }
+  if (x <= 0) {
+    x = 0;
+  }
+
+  int y_low = (int)y;
+  int x_low = (int)x;
+  int y_high;
+  int x_high;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  // do bilinear interpolation
+  T v1 = bottom_data[y_low * width + x_low];
+  T v2 = bottom_data[y_low * width + x_high];
+  T v3 = bottom_data[y_high * width + x_low];
+  T v4 = bottom_data[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  return val;
+}
+
+template <typename T>
+__global__ void RoIAlignForward(
+    const int nthreads,
+    const T* bottom_data,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* bottom_rois,
+    T* top_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not using rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_bottom_data =
+        bottom_data + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    T output_val = 0.;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T val = bilinear_interpolate(
+            offset_bottom_data, height, width, y, x, index);
+        output_val += val;
+      }
+    }
+    output_val /= count;
+
+    top_data[index] = output_val;
+  }
+}
+
+} // namespace
+
+template <>
+bool RoIAlignOp<float, CUDAContext>::RunOnDevice() {
+  auto& X = Input(0); // Input data to pool
+  auto& R = Input(1); // RoIs
+  auto* Y = Output(0); // RoI pooled data
+
+  if (R.size() == 0) {
+    // Handle empty rois
+    Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
+    // The following mutable_data calls are needed to allocate the tensors
+    Y->mutable_data<float>();
+    return true;
+  }
+
+  assert(sampling_ratio_ >= 0);
+
+  Y->Resize(R.dim32(0), X.dim32(1), pooled_height_, pooled_width_);
+  int output_size = Y->size();
+  RoIAlignForward<float>
+      <<<CAFFE_GET_BLOCKS(output_size),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          output_size,
+          X.data<float>(),
+          spatial_scale_,
+          X.dim32(1),
+          X.dim32(2),
+          X.dim32(3),
+          pooled_height_,
+          pooled_width_,
+          sampling_ratio_,
+          R.data<float>(),
+          Y->mutable_data<float>());
+  return true;
+}
+
+REGISTER_CUDA_OPERATOR(RoIAlign, RoIAlignOp<float, CUDAContext>);
+} // namespace caffe2
diff --git a/caffe2/operators/roi_align_op.h b/caffe2/operators/roi_align_op.h
new file mode 100644
index 0000000000..fc6f67c392
--- /dev/null
+++ b/caffe2/operators/roi_align_op.h
@@ -0,0 +1,47 @@
+// Copyright 2004-present Facebook. All Rights Reserved.
+
+#ifndef ROI_ALIGN_OP_H_
+#define ROI_ALIGN_OP_H_
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/logging.h"
+#include "caffe2/core/operator.h"
+
+namespace caffe2 {
+
+template <typename T, class Context>
+class RoIAlignOp final : public Operator<Context> {
+ public:
+  RoIAlignOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws),
+        order_(StringToStorageOrder(
+            OperatorBase::GetSingleArgument<string>("order", "NCHW"))),
+        spatial_scale_(
+            OperatorBase::GetSingleArgument<float>("spatial_scale", 1.)),
+        pooled_height_(OperatorBase::GetSingleArgument<int>("pooled_h", 1)),
+        pooled_width_(OperatorBase::GetSingleArgument<int>("pooled_w", 1)),
+        sampling_ratio_(
+            OperatorBase::GetSingleArgument<int>("sampling_ratio", -1)) {
+    DCHECK_GT(spatial_scale_, 0);
+    DCHECK_GT(pooled_height_, 0);
+    DCHECK_GT(pooled_width_, 0);
+    DCHECK_GE(sampling_ratio_, 0);
+    DCHECK(order_ == StorageOrder::NCHW || order_ == StorageOrder::NHWC);
+  }
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+
+  bool RunOnDevice() override {
+    CAFFE_NOT_IMPLEMENTED;
+  }
+
+ protected:
+  StorageOrder order_;
+  float spatial_scale_;
+  int pooled_height_;
+  int pooled_width_;
+  int sampling_ratio_;
+};
+
+} // namespace caffe2
+
+#endif // ROI_ALIGN_OP_H_
diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc
new file mode 100644
index 0000000000..b72738b640
--- /dev/null
+++ b/caffe2/operators/roi_align_op_gpu_test.cc
@@ -0,0 +1,266 @@
+#include "caffe2/utils/eigen_utils.h"
+#include "roi_align_op.h"
+
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/core/flags.h"
+#include "caffe2/utils/math.h"
+#include "gtest/gtest.h"
+
+namespace caffe2 {
+namespace {
+
+template <class Context>
+void AddConstInput(
+    const vector<TIndex>& shape,
+    const float value,
+    const string& name,
+    Context* context,
+    Workspace* ws) {
+  Blob* blob = ws->CreateBlob(name);
+  auto* tensor = blob->GetMutable<Tensor<Context>>();
+  tensor->Resize(shape);
+  math::Set<float, Context>(
+      tensor->size(), value, tensor->template mutable_data<float>(), context);
+  return;
+}
+
+template <class Context>
+void AddInput(
+    const vector<TIndex>& shape,
+    const vector<float>& values,
+    const string& name,
+    Workspace* ws);
+
+template <>
+void AddInput<CPUContext>(
+    const vector<TIndex>& shape,
+    const vector<float>& values,
+    const string& name,
+    Workspace* ws) {
+  Blob* blob = ws->CreateBlob(name);
+  auto* tensor = blob->GetMutable<TensorCPU>();
+  tensor->Resize(shape);
+  EigenVectorMap<float> tensor_vec(
+      tensor->mutable_data<float>(), tensor->size());
+  tensor_vec.array() = utils::AsEArrXt(values);
+}
+
+template <>
+void AddInput<CUDAContext>(
+    const vector<TIndex>& shape,
+    const vector<float>& values,
+    const string& name,
+    Workspace* ws) {
+  TensorCPU tmp(shape);
+  EigenVectorMap<float> tmp_vec(tmp.mutable_data<float>(), tmp.size());
+  tmp_vec.array() = utils::AsEArrXt(values);
+
+  Blob* blob = ws->CreateBlob(name);
+  auto* tensor = blob->template GetMutable<Tensor<CUDAContext>>();
+  tensor->CopyFrom(tmp);
+}
+
+template <class Context>
+DeviceType GetDeviceType() {
+  return CPU;
+}
+template <>
+DeviceType GetDeviceType<CUDAContext>() {
+  return CUDA;
+}
+
+int randInt(int a, int b) {
+  static std::random_device rd;
+  static std::mt19937 gen(rd());
+  return std::uniform_int_distribution<int>(a, b)(gen);
+}
+
+struct TestParams {
+  int N;
+  int C;
+  int H;
+  int W;
+  int n_rois;
+  vector<float> rois_array;
+};
+
+template <class Context>
+void CreateAndRun(
+    TensorCPU* outResult,
+    const string& order,
+    const TestParams& test_params,
+    bool random_test) {
+  Workspace ws;
+  Context context;
+
+  if (random_test) {
+    const int N = test_params.N;
+    const int C = test_params.C;
+    const int H = test_params.H;
+    const int W = test_params.W;
+    vector<float> features(N * C * H * W);
+    std::iota(features.begin(), features.end(), 0);
+    // utils::AsEArrXt(features) /= features.size();
+    AddInput<Context>(vector<TIndex>{N, C, H, W}, features, "X", &ws);
+    const int n_rois = test_params.n_rois;
+    const vector<float>& rois = test_params.rois_array;
+    AddInput<Context>(vector<TIndex>{n_rois, 5}, rois, "R", &ws);
+  } else {
+    const int N = 2;
+    const int C = 3;
+    const int H = 100;
+    const int W = 110;
+    vector<float> features(N * C * H * W);
+    std::iota(features.begin(), features.end(), 0);
+    // utils::AsEArrXt(features) /= features.size();
+    AddInput<Context>(vector<TIndex>{N, C, H, W}, features, "X", &ws);
+    vector<float> rois{0, 0,           0,           79,          59,
+                       0, 0,           5.0005703,   52.63237,    43.69501495,
+                       0, 24.13628387, 7.51243401,  79,          46.06628418,
+                       0, 0,           7.50924301,  68.47792816, 46.03357315,
+                       0, 0,           23.09477997, 51.61448669, 59,
+                       0, 0,           39.52141571, 52.44710541, 59,
+                       0, 23.57396317, 29.98791885, 79,          59,
+                       0, 0,           41.90219116, 79,          59,
+                       0, 0,           23.30098343, 79,          59};
+    AddInput<Context>(vector<TIndex>{9, 5}, rois, "R", &ws);
+  }
+
+  std::vector<unique_ptr<OperatorBase>> ops;
+  EXPECT_TRUE(order == "NCHW" || order == "NHWC");
+  if (order == "NCHW") {
+    OperatorDef def;
+    def.set_name("test");
+    def.set_type("RoIAlign");
+    def.add_input("X");
+    def.add_input("R");
+    def.add_output("Y");
+    def.mutable_device_option()->set_device_type(GetDeviceType<Context>());
+    def.add_arg()->CopyFrom(MakeArgument("spatial_scale", 1.0f / 16.0f));
+    def.add_arg()->CopyFrom(MakeArgument("pooled_h", 6));
+    def.add_arg()->CopyFrom(MakeArgument("pooled_w", 8));
+    def.add_arg()->CopyFrom(MakeArgument("sampling_ratio", 2));
+
+    ops.push_back(CreateOperator(def, &ws));
+  } else if (order == "NHWC") {
+    OperatorDef def_roialign;
+    def_roialign.set_name("test");
+    def_roialign.set_type("RoIAlign");
+    def_roialign.add_input("X_NHWC");
+    def_roialign.add_input("R");
+    def_roialign.add_output("Y_NHWC");
+    def_roialign.mutable_device_option()->set_device_type(
+        GetDeviceType<Context>());
+    def_roialign.add_arg()->CopyFrom(
+        MakeArgument("spatial_scale", 1.0f / 16.0f));
+    def_roialign.add_arg()->CopyFrom(MakeArgument("pooled_h", 6));
+    def_roialign.add_arg()->CopyFrom(MakeArgument("pooled_w", 8));
+    def_roialign.add_arg()->CopyFrom(MakeArgument("sampling_ratio", 2));
+    def_roialign.add_arg()->CopyFrom(MakeArgument<string>("order", "NHWC"));
+
+    OperatorDef def_x;
+    def_x.set_name("test_x");
+    def_x.set_type("NCHW2NHWC");
+    def_x.add_input("X");
+    def_x.add_output("X_NHWC");
+    def_x.mutable_device_option()->set_device_type(GetDeviceType<Context>());
+
+    OperatorDef def_y;
+    def_y.set_name("test_y");
+    def_y.set_type("NHWC2NCHW");
+    def_y.add_input("Y_NHWC");
+    def_y.add_output("Y");
+    def_y.mutable_device_option()->set_device_type(GetDeviceType<Context>());
+
+    ops.push_back(CreateOperator(def_x, &ws));
+    ops.push_back(CreateOperator(def_roialign, &ws));
+    ops.push_back(CreateOperator(def_y, &ws));
+  }
+
+  for (auto const& op : ops) {
+    EXPECT_NE(nullptr, op.get());
+    EXPECT_TRUE(op->Run());
+  }
+
+  Blob* Y_blob = ws.GetBlob("Y");
+  EXPECT_NE(nullptr, Y_blob);
+
+  auto& Y = Y_blob->Get<Tensor<Context>>();
+  outResult->CopyFrom(Y, &context);
+}
+
+} // namespace
+
+TEST(RoiAlignTest, CheckCPUGPUEqual) {
+  if (!caffe2::HasCudaGPU())
+    return;
+
+  TensorCPU y_cpu;
+  TensorCPU y_gpu;
+  TensorCPU y_cpu_nhwc;
+
+  // tests using FAIR example
+  {
+    TestParams test_params;
+    CreateAndRun<CPUContext>(&y_cpu, "NCHW", test_params, false);
+    CreateAndRun<CUDAContext>(&y_gpu, "NCHW", test_params, false);
+    CreateAndRun<CPUContext>(&y_cpu_nhwc, "NHWC", test_params, false);
+
+    EXPECT_EQ(y_cpu.dims(), y_gpu.dims());
+    EXPECT_EQ(y_cpu.dims(), y_cpu_nhwc.dims());
+    ConstEigenVectorMap<float> y_cpu_vec(y_cpu.data<float>(), y_cpu.size());
+    ConstEigenVectorMap<float> y_gpu_vec(y_gpu.data<float>(), y_gpu.size());
+    ConstEigenVectorMap<float> y_cpu_nhwc_vec(
+        y_cpu_nhwc.data<float>(), y_cpu_nhwc.size());
+    int max_diff_idx = -1;
+    (y_cpu_vec - y_gpu_vec).cwiseAbs().maxCoeff(&max_diff_idx);
+    EXPECT_FLOAT_EQ(y_cpu_vec[max_diff_idx], y_gpu_vec[max_diff_idx]);
+
+    max_diff_idx = -1;
+    (y_cpu_vec - y_cpu_nhwc_vec).cwiseAbs().maxCoeff(&max_diff_idx);
+    EXPECT_FLOAT_EQ(y_cpu_vec[max_diff_idx], y_cpu_nhwc_vec[max_diff_idx]);
+  }
+
+  // random tests
+  const int random_test_numbers = 100;
+  for (int i = 0; i < random_test_numbers; i++) {
+    const int N = randInt(1, 5);
+    const int C = randInt(1, 5);
+    const int H = randInt(1, 50);
+    const int W = randInt(1, 50);
+    const int n_rois = randInt(0, 30);
+    vector<float> rois_array;
+    for (int n = 0; n < n_rois; n++) {
+      rois_array.push_back(randInt(0, N - 1));
+      int w1 = randInt(-20, W + 20);
+      int w2 = randInt(-20, W + 20);
+      int h1 = randInt(-20, H + 20);
+      int h2 = randInt(-20, H + 20);
+      rois_array.push_back(std::min(w1, w2));
+      rois_array.push_back(std::max(h1, h2));
+      rois_array.push_back(std::min(w1, w2));
+      rois_array.push_back(std::max(h1, h2));
+    }
+    TestParams test_params{N, C, H, W, n_rois, rois_array};
+
+    CreateAndRun<CPUContext>(&y_cpu, "NCHW", test_params, true);
+    CreateAndRun<CUDAContext>(&y_gpu, "NCHW", test_params, true);
+    CreateAndRun<CPUContext>(&y_cpu_nhwc, "NHWC", test_params, true);
+
+    EXPECT_EQ(y_cpu.dims(), y_gpu.dims());
+    EXPECT_EQ(y_cpu.dims(), y_cpu_nhwc.dims());
+    ConstEigenVectorMap<float> y_cpu_vec(y_cpu.data<float>(), y_cpu.size());
+    ConstEigenVectorMap<float> y_gpu_vec(y_gpu.data<float>(), y_gpu.size());
+    ConstEigenVectorMap<float> y_cpu_nhwc_vec(
+        y_cpu_nhwc.data<float>(), y_cpu_nhwc.size());
+    int max_diff_idx = -1;
+    (y_cpu_vec - y_gpu_vec).cwiseAbs().maxCoeff(&max_diff_idx);
+    EXPECT_FLOAT_EQ(y_cpu_vec[max_diff_idx], y_gpu_vec[max_diff_idx]);
+
+    max_diff_idx = -1;
+    (y_cpu_vec - y_cpu_nhwc_vec).cwiseAbs().maxCoeff(&max_diff_idx);
+    EXPECT_FLOAT_EQ(y_cpu_vec[max_diff_idx], y_cpu_nhwc_vec[max_diff_idx]);
+  }
+}
+
+} // namespace caffe2
diff --git a/modules/detectron/roi_align_op.cc b/modules/detectron/roi_align_op.cc
deleted file mode 100644
index 38094ff210..0000000000
--- a/modules/detectron/roi_align_op.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "roi_align_op.h"
-
-namespace caffe2 {
-
-REGISTER_CPU_OPERATOR(RoIAlign, RoIAlignOp<float, CPUContext>);
-REGISTER_CPU_OPERATOR(RoIAlignGradient, RoIAlignGradientOp<float, CPUContext>);
-
-OPERATOR_SCHEMA(RoIAlign)
-    .NumInputs(2)
-    .NumOutputs(1)
-    .SetDoc(R"DOC(
-Region of Interest (RoI) align operation as used in Mask R-CNN.
-)DOC")
-    .Arg(
-        "spatial_scale",
-        "(float) default 1.0; Spatial scale of the input feature map X "
-        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
-        "w.r.t. the input image.")
-    .Arg(
-        "pooled_h",
-        "(int) default 1; Pooled output Y's height.")
-    .Arg(
-        "pooled_w",
-        "(int) default 1; Pooled output Y's width.")
-    .Arg(
-        "sampling_ratio",
-        "(int) default -1; number of sampling points in the interpolation grid "
-        "used to compute the output value of each pooled output bin. If > 0, "
-        "then exactly sampling_ratio x sampling_ratio grid points are used. If "
-        "<= 0, then an adaptive number of grid points are used (computed as "
-        "ceil(roi_width / pooled_w), and likewise for height)."
-    )
-    .Input(
-        0,
-        "X",
-        "4D feature map input of shape (N, C, H, W).")
-    .Input(
-        1,
-        "RoIs",
-        "2D input of shape (R, 5) specifying R RoIs with five columns "
-        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
-        "coordinates are in the coordinate system of the input image.")
-    .Output(
-        0,
-        "Y",
-        "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element "
-        "is a pooled feature map cooresponding to the r-th RoI.");
-
-OPERATOR_SCHEMA(RoIAlignGradient)
-    .NumInputs(3)
-    .NumOutputs(1)
-    .Input(
-        0,
-        "X",
-        "See RoIPoolF.")
-    .Input(
-        1,
-        "RoIs",
-        "See RoIPoolF.")
-    .Input(
-        2,
-        "dY",
-        "Gradient of forward output 0 (Y)")
-    .Output(
-        0,
-        "dX",
-        "Gradient of forward input 0 (X)");
-
-class GetRoIAlignGradient : public GradientMakerBase {
-  using GradientMakerBase::GradientMakerBase;
-  vector<OperatorDef> GetGradientDefs() override {
-    return SingleGradientDef(
-        "RoIAlignGradient",
-        "",
-        vector<string>{I(0), I(1), GO(0)},
-        vector<string>{GI(0)});
-  }
-};
-
-REGISTER_GRADIENT(RoIAlign, GetRoIAlignGradient);
-
-} // namespace caffe2
diff --git a/modules/detectron/roi_align_op.cu b/modules/detectron/roi_align_op.cu
deleted file mode 100644
index 01d67f63b0..0000000000
--- a/modules/detectron/roi_align_op.cu
+++ /dev/null
@@ -1,363 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// RoIAlign for Mask R-CNN
-// This is the per-cell centered algined versio of RoIAlign.
-// This is the official version.
-
-#include <cfloat>
-
-#include "caffe2/core/context_gpu.h"
-#include "roi_align_op.h"
-
-#include <stdio.h>
-
-namespace caffe2 {
-
-namespace {
-
-template <typename T>
-inline __device__ T gpu_atomic_add(const T val, T* address);
-
-template <>
-inline __device__
-float gpu_atomic_add(const float val, float* address) {
-  return atomicAdd(address, val);
-}
-
-template <typename T>
-__device__ T bilinear_interpolate(const T* bottom_data,
-    const int height, const int width,
-    T y, T x,
-    const int index /* index for debug only*/) {
-
-  // deal with cases that inverse elements are out of feature map boundary
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    //empty
-    return 0;
-  }
-
-  if (y <= 0) y = 0;
-  if (x <= 0) x = 0;
-
-  int y_low = (int) y;
-  int x_low = (int) x;
-  int y_high;
-  int x_high;
-
-  if (y_low >= height - 1) {
-    y_high = y_low = height - 1;
-    y = (T) y_low;
-  } else {
-    y_high = y_low + 1;
-  }
-
-  if (x_low >= width - 1) {
-    x_high = x_low = width - 1;
-    x = (T) x_low;
-  } else {
-    x_high = x_low + 1;
-  }
-
-  T ly = y - y_low;
-  T lx = x - x_low;
-  T hy = 1. - ly, hx = 1. - lx;
-  // do bilinear interpolation
-  T v1 = bottom_data[y_low * width + x_low];
-  T v2 = bottom_data[y_low * width + x_high];
-  T v3 = bottom_data[y_high * width + x_low];
-  T v4 = bottom_data[y_high * width + x_high];
-  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
-  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-
-  return val;
-}
-
-template <typename T>
-__global__ void RoIAlignForward(const int nthreads, const T* bottom_data,
-    const T spatial_scale, const int channels,
-    const int height, const int width,
-    const int pooled_height, const int pooled_width,
-    const int sampling_ratio,
-    const T* bottom_rois, T* top_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    // (n, c, ph, pw) is an element in the pooled output
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
-
-    const T* offset_bottom_rois = bottom_rois + n * 5;
-    int roi_batch_ind = offset_bottom_rois[0];
-
-    // Do not using rounding; this implementation detail is critical
-    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
-    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
-    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
-    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
-    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
-    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
-    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
-    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
-
-    // Force malformed ROIs to be 1x1
-    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
-    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
-    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-    const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;
-
-    // We use roi_bin_grid to sample the grid and mimic integral
-    int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
-    int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
-
-    // We do average (integral) pooling inside a bin
-    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
-
-    T output_val = 0.;
-    for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
-    {
-      const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
-      for (int ix = 0; ix < roi_bin_grid_w; ix ++)
-      {
-        const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
-
-        T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index);
-        output_val += val;
-      }
-    }
-    output_val /= count;
-
-    top_data[index] = output_val;
-  }
-}
-
-template <typename T>
-__device__ void bilinear_interpolate_gradient(
-    const int height, const int width,
-    T y, T x,
-    T & w1, T & w2, T & w3, T & w4,
-    int & x_low, int & x_high, int & y_low, int & y_high,
-    const int index /* index for debug only*/) {
-
-  // deal with cases that inverse elements are out of feature map boundary
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    //empty
-    w1 = w2 = w3 = w4 = 0.;
-    x_low = x_high = y_low = y_high = -1;
-    return;
-  }
-
-  if (y <= 0) y = 0;
-  if (x <= 0) x = 0;
-
-  y_low = (int) y;
-  x_low = (int) x;
-
-  if (y_low >= height - 1) {
-    y_high = y_low = height - 1;
-    y = (T) y_low;
-  } else {
-    y_high = y_low + 1;
-  }
-
-  if (x_low >= width - 1) {
-    x_high = x_low = width - 1;
-    x = (T) x_low;
-  } else {
-    x_high = x_low + 1;
-  }
-
-  T ly = y - y_low;
-  T lx = x - x_low;
-  T hy = 1. - ly, hx = 1. - lx;
-
-  // reference in forward
-  // T v1 = bottom_data[y_low * width + x_low];
-  // T v2 = bottom_data[y_low * width + x_high];
-  // T v3 = bottom_data[y_high * width + x_low];
-  // T v4 = bottom_data[y_high * width + x_high];
-  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-
-  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
-  return;
-}
-
-template <typename T>
-__global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff,
-    const int num_rois, const T spatial_scale,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width,
-    const int sampling_ratio,
-    T* bottom_diff,
-    const T* bottom_rois) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    // (n, c, ph, pw) is an element in the pooled output
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
-
-    const T* offset_bottom_rois = bottom_rois + n * 5;
-    int roi_batch_ind = offset_bottom_rois[0];
-
-    // Do not using rounding; this implementation detail is critical
-    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
-    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
-    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
-    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
-    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
-    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
-    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
-    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
-
-    // Force malformed ROIs to be 1x1
-    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
-    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
-    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-    T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;
-
-    int top_offset    = (n * channels + c) * pooled_height * pooled_width;
-    const T* offset_top_diff = top_diff + top_offset;
-    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
-
-    // We use roi_bin_grid to sample the grid and mimic integral
-    int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
-    int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
-
-    // We do average (integral) pooling inside a bin
-    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
-
-    for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
-    {
-      const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
-      for (int ix = 0; ix < roi_bin_grid_w; ix ++)
-      {
-        const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
-
-        T w1, w2, w3, w4;
-        int x_low, x_high, y_low, y_high;
-
-        bilinear_interpolate_gradient(height, width, y, x,
-            w1, w2, w3, w4,
-            x_low, x_high, y_low, y_high,
-            index);
-
-        T g1 = top_diff_this_bin * w1 / count;
-        T g2 = top_diff_this_bin * w2 / count;
-        T g3 = top_diff_this_bin * w3 / count;
-        T g4 = top_diff_this_bin * w4 / count;
-
-        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0)
-        {
-          gpu_atomic_add(static_cast<T>(g1), offset_bottom_diff + y_low * width + x_low);
-          gpu_atomic_add(static_cast<T>(g2), offset_bottom_diff + y_low * width + x_high);
-          gpu_atomic_add(static_cast<T>(g3), offset_bottom_diff + y_high * width + x_low);
-          gpu_atomic_add(static_cast<T>(g4), offset_bottom_diff + y_high * width + x_high);
-        } // if
-      } // ix
-    } // iy
-  } // CUDA_1D_KERNEL_LOOP
-} // RoIAlignBackward
-
-
-} // namespace
-
-template<>
-bool RoIAlignOp<float, CUDAContext>::RunOnDevice() {
-  auto& X = Input(0);  // Input data to pool
-  auto& R = Input(1);  // RoIs
-  auto* Y = Output(0); // RoI pooled data
-
-  if (R.size() == 0) {
-    // Handle empty rois
-    Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
-    // The following mutable_data calls are needed to allocate the tensors
-    Y->mutable_data<float>();
-    return true;
-  }
-
-  assert(sampling_ratio_ >= 0);
-
-  Y->Resize(R.dim32(0), X.dim32(1), pooled_height_, pooled_width_);
-  int output_size = Y->size();
-  RoIAlignForward<float>
-      <<<CAFFE_GET_BLOCKS(output_size),
-         CAFFE_CUDA_NUM_THREADS,
-         0,
-         context_.cuda_stream()>>>(
-          output_size,
-          X.data<float>(),
-          spatial_scale_,
-          X.dim32(1),
-          X.dim32(2),
-          X.dim32(3),
-          pooled_height_,
-          pooled_width_,
-          sampling_ratio_,
-          R.data<float>(),
-          Y->mutable_data<float>());
-  return true;
-}
-
-template<>
-bool RoIAlignGradientOp<float, CUDAContext>::RunOnDevice() {
-  auto& X  = Input(0);  // Input data to pool
-  auto& R  = Input(1);  // RoIs
-  auto& dY = Input(2);  // Gradient of net w.r.t. output of "forward" op
-                        // (aka "gradOutput")
-  auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
-                        // (aka "gradInput")
-
-  dX->ResizeLike(X);
-
-  // Must zero-out dX before accumulating gradients
-  math::Set<float, CUDAContext>(
-      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
-
-  if (dY.size() > 0) {  // Handle possibly empty gradient if there were no rois
-    RoIAlignBackwardFeature<float>
-        <<<CAFFE_GET_BLOCKS(dY.size()),
-           CAFFE_CUDA_NUM_THREADS,
-           0,
-           context_.cuda_stream()>>>(
-            dY.size(),
-            dY.data<float>(),
-            R.dim32(0),
-            spatial_scale_,
-            X.dim32(1),
-            X.dim32(2),
-            X.dim32(3),
-            pooled_height_,
-            pooled_width_,
-            sampling_ratio_,
-            dX->mutable_data<float>(),
-            R.data<float>());
-  }
-  return true;
-}
-
-
-REGISTER_CUDA_OPERATOR(RoIAlign,
-                       RoIAlignOp<float, CUDAContext>);
-REGISTER_CUDA_OPERATOR(RoIAlignGradient,
-                       RoIAlignGradientOp<float, CUDAContext>);
-} // namespace caffe2
diff --git a/modules/detectron/roi_align_op.h b/modules/detectron/roi_align_op.h
deleted file mode 100644
index 8283d4b242..0000000000
--- a/modules/detectron/roi_align_op.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ROI_ALIGN_OP_H_
-#define ROI_ALIGN_OP_H_
-
-#include "caffe2/core/context.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/utils/math.h"
-
-namespace caffe2 {
-
-template <typename T, class Context>
-class RoIAlignOp final : public Operator<Context> {
- public:
-  RoIAlignOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<Context>(operator_def, ws),
-        spatial_scale_(
-            OperatorBase::GetSingleArgument<float>("spatial_scale", 1.)),
-        pooled_height_(OperatorBase::GetSingleArgument<int>("pooled_h", 1)),
-        pooled_width_(OperatorBase::GetSingleArgument<int>("pooled_w", 1)),
-        sampling_ratio_(
-            OperatorBase::GetSingleArgument<int>("sampling_ratio", -1)) {
-    DCHECK_GT(spatial_scale_, 0);
-    DCHECK_GT(pooled_height_, 0);
-    DCHECK_GT(pooled_width_, 0);
-    DCHECK_GE(sampling_ratio_, 0);
-  }
-  USE_OPERATOR_CONTEXT_FUNCTIONS;
-
-  bool RunOnDevice() override {
-    // No CPU implementation for now
-    CAFFE_NOT_IMPLEMENTED;
-  }
-
- protected:
-  float spatial_scale_;
-  int pooled_height_;
-  int pooled_width_;
-  int sampling_ratio_;
-};
-
-template <typename T, class Context>
-class RoIAlignGradientOp final : public Operator<Context> {
- public:
-  RoIAlignGradientOp(const OperatorDef& def, Workspace* ws)
-      : Operator<Context>(def, ws),
-        spatial_scale_(
-            OperatorBase::GetSingleArgument<float>("spatial_scale", 1.)),
-        pooled_height_(OperatorBase::GetSingleArgument<int>("pooled_h", 1)),
-        pooled_width_(OperatorBase::GetSingleArgument<int>("pooled_w", 1)),
-        sampling_ratio_(
-            OperatorBase::GetSingleArgument<int>("sampling_ratio", -1)) {
-    DCHECK_GT(spatial_scale_, 0);
-    DCHECK_GT(pooled_height_, 0);
-    DCHECK_GT(pooled_width_, 0);
-    DCHECK_GE(sampling_ratio_, 0);
-  }
-  USE_OPERATOR_CONTEXT_FUNCTIONS;
-
-  bool RunOnDevice() override {
-    // No CPU implementation for now
-    CAFFE_NOT_IMPLEMENTED;
-  }
-
- protected:
-  float spatial_scale_;
-  int pooled_height_;
-  int pooled_width_;
-  int sampling_ratio_;
-};
-
-} // namespace caffe2
-
-#endif // ROI_ALIGN_OP_H_