author | Peter Yeh <pyeh@amd.com> | 2018-07-19 00:11:20 -0700 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2018-07-19 00:26:06 -0700 |
commit | 54db14e390f2d32e1212870b872c96fbca71c48a (patch) | |
tree | 3c1430b42ce4800527e4ddec45cbf66cb3060fbc /caffe2/operators/hip | |
parent | 45f0d05202f0506f39dad7e5cc537b5851dbb298 (diff) | |
HIP Operators Generator--> HipOpG (#9322)
Summary:
The goal of this PR is to add the infrastructure to convert (hipify) CUDA ops into [HIP](https://github.com/ROCm-Developer-Tools/HIP) ops at **compile** time.
Note that HIP ops, which are portable C++ code, can run on both AMD and NVIDIA platforms.
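For readers unfamiliar with HIP, the sketch below is a minimal standalone example of what hipified device code looks like; it is illustrative only and not taken from this PR or from the Caffe2 operators the generator produces. The same source compiles with hipcc for AMD GPUs and, through HIP's CUDA back end, for NVIDIA GPUs, which is why hipified operators remain portable.

```cpp
// Minimal standalone HIP example (illustrative only, not from this PR).
// The kernel body is ordinary device C++; the CUDA-style launch
// `AddOne<<<grid, block>>>(...)` becomes hipLaunchKernelGGL after hipify.
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void AddOne(const int n, float* x) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    x[i] += 1.0f;
  }
}

int main() {
  const int n = 8;
  float host[n];
  for (int i = 0; i < n; ++i) {
    host[i] = static_cast<float>(i);
  }

  float* device = nullptr;
  hipMalloc(reinterpret_cast<void**>(&device), n * sizeof(float));
  hipMemcpy(device, host, n * sizeof(float), hipMemcpyHostToDevice);

  // Launch one block of n threads on the default stream.
  hipLaunchKernelGGL(AddOne, dim3(1), dim3(n), 0, 0, n, device);

  hipMemcpy(host, device, n * sizeof(float), hipMemcpyDeviceToHost);
  hipFree(device);

  for (int i = 0; i < n; ++i) {
    printf("%g ", host[i]);  // prints 1 2 ... 8
  }
  printf("\n");
  return 0;
}
```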
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9322
Differential Revision: D8884707
Pulled By: bddppq
fbshipit-source-id: dabc6319546002c308c10528238e6684f7aef0f8
Diffstat (limited to 'caffe2/operators/hip')
-rw-r--r-- | caffe2/operators/hip/operator_fallback_hip.h | 114 |
-rw-r--r-- | caffe2/operators/hip/operator_fallback_hip_test.cc | 80 |
2 files changed, 0 insertions, 194 deletions
diff --git a/caffe2/operators/hip/operator_fallback_hip.h b/caffe2/operators/hip/operator_fallback_hip.h
deleted file mode 100644
index 62e5fe8f01..0000000000
--- a/caffe2/operators/hip/operator_fallback_hip.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifndef CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
-#define CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/hip/context_hip.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/proto/caffe2.pb.h"
-
-namespace caffe2 {
-
-/**
- * @brief A templated class to allow one to wrap a CPU operator as a CUDA
- * operator.
- *
- * This class can be used when one does not have the CUDA implementation ready
- * yet for an operator. Essentially, what this op does is to automatically
- * deal with data copy for you. Plausibly, this causes a lot of overhead and
- * is not optimal, so you should use this operator mostly for quick prototyping
- * purpose.
- *
- * All the input and output of the original operator should be TensorCPU.
- *
- * Example usage: if you have a class MyMagicOp that is CPU based, and you use
- * the registration code
- *     REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp);
- * to register the CPU side, you can create its corresponding GPU operator
- * (with performance hits of course) via
- *     REGISTER_HIP_OPERATOR(MyMagic,
- *                           GPUFallbackOp<MyMagicOp>);
- *
- * Advanced usage: if you want to have some specific outputs never copied, you
- * can use the SkipOutputCopy template argument to do that. For example, if
- * MyMagic produces two outputs and the first output is always going to live on
- * the CPU, you can do
- *     REGISTER_HIP_OPERATOR(MyMagic,
- *                           GPUFallbackOp<MyMagicOp, SkipIndices<0>>);
- */
-template <class CPUOp, typename SkipOutputCopy = SkipIndices<>>
-class GPUFallbackOp final : public Operator<HIPContext> {
- public:
-  USE_OPERATOR_FUNCTIONS(HIPContext);
-  GPUFallbackOp(const OperatorDef& def, Workspace* ws)
-      : Operator<HIPContext>(def, ws) {
-    CAFFE_ENFORCE_EQ(def.device_option().device_type(), HIP);
-    OperatorDef base_def_(def);
-    // base_def_ runs on CPU, so we will set its device option to CPU.
-    base_def_.clear_device_option();
-    base_def_.mutable_device_option()->set_device_type(CPU);
-    // Set up the symbols for the local workspace.
-    for (const string& name : def.input()) {
-      local_input_blobs_.push_back(local_ws_.CreateBlob(name));
-      CHECK_NOTNULL(local_input_blobs_.back());
-    }
-    base_op_.reset(new CPUOp(base_def_, &local_ws_));
-    for (const string& name : def.output()) {
-      local_output_blobs_.push_back(local_ws_.GetBlob(name));
-      CHECK_NOTNULL(local_output_blobs_.back());
-    }
-  }
-
-  bool RunOnDevice() override {
-    bool need_sync = false;
-    for (int i = 0; i < InputSize(); ++i) {
-      if (OperatorBase::InputIsType<TensorHIP>(i)) {
-        local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
-            Input(i), &context_);
-        need_sync = true;
-      } else {
-        VLOG(1) << "Input " << i << " is not TensorHIP. Skipping copy.";
-        // Note(jiayq): This removes a const but conceptually
-        // local_input_blobs will only be used as const blob input for the
-        // base op so we are still fine.
-        local_input_blobs_[i]->ShareExternal(
-            const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
-            OperatorBase::Inputs()[i]->meta());
-      }
-    }
-
-    // Sync to make sure copies are done.
-    if (need_sync) {
-      context_.FinishDeviceComputation();
-    }
-
-    if (!base_op_->Run()) {
-      LOG(ERROR) << "Base op run failed in GPUFallbackOp. Def: "
-                 << ProtoDebugString(this->debug_def());
-      return false;
-    }
-    for (int i = 0; i < OutputSize(); ++i) {
-      if (SkipOutputCopy::Contains(i)) {
-        VLOG(1) << "Copy output: index " << i << " skipped.";
-        continue;
-      }
-      CAFFE_ENFORCE(
-          local_output_blobs_[i]->template IsType<TensorCPU>(),
-          "GPU fallback op currently does not support non-TensorCPU "
-          "output type who needs copying.");
-      Output(i)->CopyFrom(
-          local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
-    }
-    return true;
-  }
-
- protected:
-  Workspace local_ws_;
-  vector<Blob*> local_input_blobs_;
-  vector<Blob*> local_output_blobs_;
-  std::unique_ptr<CPUOp> base_op_;
-};
-
-} // namespace caffe2
-
-#endif // CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
diff --git a/caffe2/operators/hip/operator_fallback_hip_test.cc b/caffe2/operators/hip/operator_fallback_hip_test.cc
deleted file mode 100644
index 4a074c35f8..0000000000
--- a/caffe2/operators/hip/operator_fallback_hip_test.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-#include <iostream>
-
-#include <gtest/gtest.h>
-#include "caffe2/core/operator.h"
-#include "caffe2/operators/hip/operator_fallback_hip.h"
-
-namespace caffe2 {
-
-class IncrementByOneOp final : public Operator<CPUContext> {
- public:
-  IncrementByOneOp(const OperatorDef& def, Workspace* ws)
-      : Operator<CPUContext>(def, ws) {}
-  bool RunOnDevice() {
-    const auto& in = Input(0);
-    auto* out = Output(0);
-    out->ResizeLike(in);
-    const float* in_data = in.template data<float>();
-    float* out_data = out->template mutable_data<float>();
-    for (int i = 0; i < in.size(); ++i) {
-      out_data[i] = in_data[i] + 1.f;
-    }
-    return true;
-  }
-};
-
-OPERATOR_SCHEMA(IncrementByOne)
-    .NumInputs(1)
-    .NumOutputs(1)
-    .AllowInplace({{0, 0}});
-
-REGISTER_CPU_OPERATOR(IncrementByOne, IncrementByOneOp);
-REGISTER_HIP_OPERATOR(IncrementByOne, GPUFallbackOp<IncrementByOneOp>);
-
-TEST(OperatorFallbackTest, IncrementByOneOp) {
-  OperatorDef op_def = CreateOperatorDef(
-      "IncrementByOne", "", vector<string>{"X"}, vector<string>{"X"});
-  Workspace ws;
-  TensorCPU source_tensor(vector<TIndex>{2, 3});
-  for (int i = 0; i < 6; ++i) {
-    source_tensor.mutable_data<float>()[i] = i;
-  }
-  ws.CreateBlob("X")->GetMutable<TensorCPU>()->CopyFrom(source_tensor);
-  unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
-  EXPECT_TRUE(op.get() != nullptr);
-  EXPECT_TRUE(op->Run());
-  const TensorCPU& output = ws.GetBlob("X")->Get<TensorCPU>();
-  EXPECT_EQ(output.ndim(), 2);
-  EXPECT_EQ(output.dim(0), 2);
-  EXPECT_EQ(output.dim(1), 3);
-  for (int i = 0; i < 6; ++i) {
-    EXPECT_EQ(output.data<float>()[i], i + 1);
-  }
-}
-
-TEST(OperatorFallbackTest, GPUIncrementByOneOp) {
-  if (!HasHipGPU())
-    return;
-  OperatorDef op_def = CreateOperatorDef(
-      "IncrementByOne", "", vector<string>{"X"}, vector<string>{"X"});
-  op_def.mutable_device_option()->set_device_type(HIP);
-  Workspace ws;
-  TensorCPU source_tensor(vector<TIndex>{2, 3});
-  for (int i = 0; i < 6; ++i) {
-    source_tensor.mutable_data<float>()[i] = i;
-  }
-  ws.CreateBlob("X")->GetMutable<TensorHIP>()->CopyFrom(source_tensor);
-  unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
-  EXPECT_TRUE(op.get() != nullptr);
-  EXPECT_TRUE(op->Run());
-  const TensorHIP& output = ws.GetBlob("X")->Get<TensorHIP>();
-  TensorCPU output_cpu(output);
-  EXPECT_EQ(output.ndim(), 2);
-  EXPECT_EQ(output.dim(0), 2);
-  EXPECT_EQ(output.dim(1), 3);
-  for (int i = 0; i < 6; ++i) {
-    EXPECT_EQ(output_cpu.data<float>()[i], i + 1);
-  }
-}
-
-} // namespace caffe2