author | Peter Yeh <pyeh@amd.com> | 2018-07-19 00:11:20 -0700 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2018-07-19 00:26:06 -0700 |
commit | 54db14e390f2d32e1212870b872c96fbca71c48a (patch) | |
tree | 3c1430b42ce4800527e4ddec45cbf66cb3060fbc /caffe2/operators/hip | |
parent | 45f0d05202f0506f39dad7e5cc537b5851dbb298 (diff) | |
HIP Operators Generator--> HipOpG (#9322)
Summary:
The goal of this PR is to add the infrastructure to convert (hipify) CUDA ops into [HIP](https://github.com/ROCm-Developer-Tools/HIP) ops at **compile** time.
Note that HIP ops, which are portable C++ code, can run on both AMD and NVIDIA platforms.
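For readers unfamiliar with HIP, the sketch below is a minimal standalone example of what hipified device code looks like; it is illustrative only and not taken from this PR or from the Caffe2 operators the generator produces. The same source compiles with hipcc for AMD GPUs and, through HIP's CUDA back end, for NVIDIA GPUs, which is why hipified operators remain portable.

```cpp
// Minimal standalone HIP example (illustrative only, not from this PR).
// The kernel body is ordinary device C++; the CUDA-style launch
// `AddOne<<<grid, block>>>(...)` becomes hipLaunchKernelGGL after hipify.
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void AddOne(const int n, float* x) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    x[i] += 1.0f;
  }
}

int main() {
  const int n = 8;
  float host[n];
  for (int i = 0; i < n; ++i) {
    host[i] = static_cast<float>(i);
  }

  float* device = nullptr;
  hipMalloc(reinterpret_cast<void**>(&device), n * sizeof(float));
  hipMemcpy(device, host, n * sizeof(float), hipMemcpyHostToDevice);

  // Launch one block of n threads on the default stream.
  hipLaunchKernelGGL(AddOne, dim3(1), dim3(n), 0, 0, n, device);

  hipMemcpy(host, device, n * sizeof(float), hipMemcpyDeviceToHost);
  hipFree(device);

  for (int i = 0; i < n; ++i) {
    printf("%g ", host[i]);  // prints 1 2 ... 8
  }
  printf("\n");
  return 0;
}
```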
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9322
Differential Revision: D8884707
Pulled By: bddppq
fbshipit-source-id: dabc6319546002c308c10528238e6684f7aef0f8
Diffstat (limited to 'caffe2/operators/hip')
-rw-r--r-- | caffe2/operators/hip/operator_fallback_hip.h | 114 |
-rw-r--r-- | caffe2/operators/hip/operator_fallback_hip_test.cc | 80 |
2 files changed, 0 insertions, 194 deletions
diff --git a/caffe2/operators/hip/operator_fallback_hip.h b/caffe2/operators/hip/operator_fallback_hip.h
deleted file mode 100644
index 62e5fe8f01..0000000000
--- a/caffe2/operators/hip/operator_fallback_hip.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifndef CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
-#define CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/hip/context_hip.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/proto/caffe2.pb.h"
-
-namespace caffe2 {
-
-/**
- * @brief A templated class to allow one to wrap a CPU operator as a CUDA
- * operator.
- *
- * This class can be used when one does not have the CUDA implementation ready
- * yet for an operator. Essentially, what this op does is to automatically
- * deal with data copy for you. Plausibly, this causes a lot of overhead and
- * is not optimal, so you should use this operator mostly for quick prototyping
- * purpose.
- *
- * All the input and output of the original operator should be TensorCPU.
- *
- * Example usage: if you have a class MyMagicOp that is CPU based, and you use
- * the registration code
- *     REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp);
- * to register the CPU side, you can create its corresponding GPU operator
- * (with performance hits of course) via
- *     REGISTER_HIP_OPERATOR(MyMagic,
- *                           GPUFallbackOp<MyMagicOp>);
- *
- * Advanced usage: if you want to have some specific outputs never copied, you
- * can use the SkipOutputCopy template argument to do that. For example, if
- * MyMagic produces two outputs and the first output is always going to live on
- * the CPU, you can do
- *     REGISTER_HIP_OPERATOR(MyMagic,
- *                           GPUFallbackOp<MyMagicOp, SkipIndices<0>>);
- */
-template <class CPUOp, typename SkipOutputCopy = SkipIndices<>>
-class GPUFallbackOp final : public Operator<HIPContext> {
- public:
-  USE_OPERATOR_FUNCTIONS(HIPContext);
-  GPUFallbackOp(const OperatorDef& def, Workspace* ws)
-      : Operator<HIPContext>(def, ws) {
-    CAFFE_ENFORCE_EQ(def.device_option().device_type(), HIP);
-    OperatorDef base_def_(def);
-    // base_def_ runs on CPU, so we will set its device option to CPU.
-    base_def_.clear_device_option();
-    base_def_.mutable_device_option()->set_device_type(CPU);
-    // Set up the symbols for the local workspace.
-    for (const string& name : def.input()) {
-      local_input_blobs_.push_back(local_ws_.CreateBlob(name));
-      CHECK_NOTNULL(local_input_blobs_.back());
-    }
-    base_op_.reset(new CPUOp(base_def_, &local_ws_));
-    for (const string& name : def.output()) {
-      local_output_blobs_.push_back(local_ws_.GetBlob(name));
-      CHECK_NOTNULL(local_output_blobs_.back());
-    }
-  }
-
-  bool RunOnDevice() override {
-    bool need_sync = false;
-    for (int i = 0; i < InputSize(); ++i) {
-      if (OperatorBase::InputIsType<TensorHIP>(i)) {
-        local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
-            Input(i), &context_);
-        need_sync = true;
-      } else {
-        VLOG(1) << "Input " << i << " is not TensorHIP. Skipping copy.";
-        // Note(jiayq): This removes a const but conceptually
-        // local_input_blobs will only be used as const blob input for the
-        // base op so we are still fine.
-        local_input_blobs_[i]->ShareExternal(
-            const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
-            OperatorBase::Inputs()[i]->meta());
-      }
-    }
-
-    // Sync to make sure copies are done.
-    if (need_sync) {
-      context_.FinishDeviceComputation();
-    }
-
-    if (!base_op_->Run()) {
-      LOG(ERROR) << "Base op run failed in GPUFallbackOp. Def: "
-                 << ProtoDebugString(this->debug_def());
-      return false;
-    }
-    for (int i = 0; i < OutputSize(); ++i) {
-      if (SkipOutputCopy::Contains(i)) {
-        VLOG(1) << "Copy output: index " << i << " skipped.";
-        continue;
-      }
-      CAFFE_ENFORCE(
-          local_output_blobs_[i]->template IsType<TensorCPU>(),
-          "GPU fallback op currently does not support non-TensorCPU "
-          "output type who needs copying.");
-      Output(i)->CopyFrom(
-          local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
-    }
-    return true;
-  }
-
- protected:
-  Workspace local_ws_;
-  vector<Blob*> local_input_blobs_;
-  vector<Blob*> local_output_blobs_;
-  std::unique_ptr<CPUOp> base_op_;
-};
-
-} // namespace caffe2
-
-#endif // CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
diff --git a/caffe2/operators/hip/operator_fallback_hip_test.cc b/caffe2/operators/hip/operator_fallback_hip_test.cc
deleted file mode 100644
index 4a074c35f8..0000000000
--- a/caffe2/operators/hip/operator_fallback_hip_test.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-#include <iostream>
-
-#include <gtest/gtest.h>
-#include "caffe2/core/operator.h"
-#include "caffe2/operators/hip/operator_fallback_hip.h"
-
-namespace caffe2 {
-
-class IncrementByOneOp final : public Operator<CPUContext> {
- public:
-  IncrementByOneOp(const OperatorDef& def, Workspace* ws)
-      : Operator<CPUContext>(def, ws) {}
-  bool RunOnDevice() {
-    const auto& in = Input(0);
-    auto* out = Output(0);
-    out->ResizeLike(in);
-    const float* in_data = in.template data<float>();
-    float* out_data = out->template mutable_data<float>();
-    for (int i = 0; i < in.size(); ++i) {
-      out_data[i] = in_data[i] + 1.f;
-    }
-    return true;
-  }
-};
-
-OPERATOR_SCHEMA(IncrementByOne)
-    .NumInputs(1)
-    .NumOutputs(1)
-    .AllowInplace({{0, 0}});
-
-REGISTER_CPU_OPERATOR(IncrementByOne, IncrementByOneOp);
-REGISTER_HIP_OPERATOR(IncrementByOne, GPUFallbackOp<IncrementByOneOp>);
-
-TEST(OperatorFallbackTest, IncrementByOneOp) {
-  OperatorDef op_def = CreateOperatorDef(
-      "IncrementByOne", "", vector<string>{"X"}, vector<string>{"X"});
-  Workspace ws;
-  TensorCPU source_tensor(vector<TIndex>{2, 3});
-  for (int i = 0; i < 6; ++i) {
-    source_tensor.mutable_data<float>()[i] = i;
-  }
-  ws.CreateBlob("X")->GetMutable<TensorCPU>()->CopyFrom(source_tensor);
-  unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
-  EXPECT_TRUE(op.get() != nullptr);
-  EXPECT_TRUE(op->Run());
-  const TensorCPU& output = ws.GetBlob("X")->Get<TensorCPU>();
-  EXPECT_EQ(output.ndim(), 2);
-  EXPECT_EQ(output.dim(0), 2);
-  EXPECT_EQ(output.dim(1), 3);
-  for (int i = 0; i < 6; ++i) {
-    EXPECT_EQ(output.data<float>()[i], i + 1);
-  }
-}
-
-TEST(OperatorFallbackTest, GPUIncrementByOneOp) {
-  if (!HasHipGPU())
-    return;
-  OperatorDef op_def = CreateOperatorDef(
-      "IncrementByOne", "", vector<string>{"X"}, vector<string>{"X"});
-  op_def.mutable_device_option()->set_device_type(HIP);
-  Workspace ws;
-  TensorCPU source_tensor(vector<TIndex>{2, 3});
-  for (int i = 0; i < 6; ++i) {
-    source_tensor.mutable_data<float>()[i] = i;
-  }
-  ws.CreateBlob("X")->GetMutable<TensorHIP>()->CopyFrom(source_tensor);
-  unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
-  EXPECT_TRUE(op.get() != nullptr);
-  EXPECT_TRUE(op->Run());
-  const TensorHIP& output = ws.GetBlob("X")->Get<TensorHIP>();
-  TensorCPU output_cpu(output);
-  EXPECT_EQ(output.ndim(), 2);
-  EXPECT_EQ(output.dim(0), 2);
-  EXPECT_EQ(output.dim(1), 3);
-  for (int i = 0; i < 6; ++i) {
-    EXPECT_EQ(output_cpu.data<float>()[i], i + 1);
-  }
-}
-
-} // namespace caffe2