summaryrefslogtreecommitdiff
path: root/caffe2/operators/hip
diff options
context:
space:
mode:
authorPeter Yeh <pyeh@amd.com>2018-07-19 00:11:20 -0700
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>2018-07-19 00:26:06 -0700
commit54db14e390f2d32e1212870b872c96fbca71c48a (patch)
tree3c1430b42ce4800527e4ddec45cbf66cb3060fbc /caffe2/operators/hip
parent45f0d05202f0506f39dad7e5cc537b5851dbb298 (diff)
downloadpytorch-54db14e390f2d32e1212870b872c96fbca71c48a.tar.gz
pytorch-54db14e390f2d32e1212870b872c96fbca71c48a.tar.bz2
pytorch-54db14e390f2d32e1212870b872c96fbca71c48a.zip
HIP Operators Generator--> HipOpG (#9322)
Summary: The goal of this PR is to add an infrastructure to convert (hipify) CUDA ops into [HIP](https://github.com/ROCm-Developer-Tools/HIP) ops at **compile** time. Note that HIP ops, which are portable c++ code, can run on AMD and NVIDIA platform. Pull Request resolved: https://github.com/pytorch/pytorch/pull/9322 Differential Revision: D8884707 Pulled By: bddppq fbshipit-source-id: dabc6319546002c308c10528238e6684f7aef0f8
Diffstat (limited to 'caffe2/operators/hip')
-rw-r--r--caffe2/operators/hip/operator_fallback_hip.h114
-rw-r--r--caffe2/operators/hip/operator_fallback_hip_test.cc80
2 files changed, 0 insertions, 194 deletions
diff --git a/caffe2/operators/hip/operator_fallback_hip.h b/caffe2/operators/hip/operator_fallback_hip.h
deleted file mode 100644
index 62e5fe8f01..0000000000
--- a/caffe2/operators/hip/operator_fallback_hip.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifndef CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
-#define CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/hip/context_hip.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/proto/caffe2.pb.h"
-
-namespace caffe2 {
-
-/**
- * @brief A templated class to allow one to wrap a CPU operator as a CUDA
- * operator.
- *
- * This class can be used when one does not have the CUDA implementation ready
- * yet for an operator. Essentially, what this op does is to automatically
- * deal with data copy for you. Plausibly, this causes a lot of overhead and
- * is not optimal, so you should use this operator mostly for quick prototyping
- * purpose.
- *
- * All the input and output of the original operator should be TensorCPU.
- *
- * Example usage: if you have a class MyMagicOp that is CPU based, and you use
- * the registration code
- * REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp);
- * to register the CPU side, you can create its corresponding GPU operator
- * (with performance hits of course) via
- * REGISTER_HIP_OPERATOR(MyMagic,
- * GPUFallbackOp<MyMagicOp>);
- *
- * Advanced usage: if you want to have some specific outputs never copied, you
- * can use the SkipOutputCopy template argument to do that. For example, if
- * MyMagic produces two outputs and the first output is always going to live on
- * the CPU, you can do
- * REGISTER_HIP_OPERATOR(MyMagic,
- * GPUFallbackOp<MyMagicOp, SkipIndices<0>>);
- */
-template <class CPUOp, typename SkipOutputCopy = SkipIndices<>>
-class GPUFallbackOp final : public Operator<HIPContext> {
- public:
- USE_OPERATOR_FUNCTIONS(HIPContext);
- GPUFallbackOp(const OperatorDef& def, Workspace* ws)
- : Operator<HIPContext>(def, ws) {
- CAFFE_ENFORCE_EQ(def.device_option().device_type(), HIP);
- OperatorDef base_def_(def);
- // base_def_ runs on CPU, so we will set its device option to CPU.
- base_def_.clear_device_option();
- base_def_.mutable_device_option()->set_device_type(CPU);
- // Set up the symbols for the local workspace.
- for (const string& name : def.input()) {
- local_input_blobs_.push_back(local_ws_.CreateBlob(name));
- CHECK_NOTNULL(local_input_blobs_.back());
- }
- base_op_.reset(new CPUOp(base_def_, &local_ws_));
- for (const string& name : def.output()) {
- local_output_blobs_.push_back(local_ws_.GetBlob(name));
- CHECK_NOTNULL(local_output_blobs_.back());
- }
- }
-
- bool RunOnDevice() override {
- bool need_sync = false;
- for (int i = 0; i < InputSize(); ++i) {
- if (OperatorBase::InputIsType<TensorHIP>(i)) {
- local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
- Input(i), &context_);
- need_sync = true;
- } else {
- VLOG(1) << "Input " << i << " is not TensorHIP. Skipping copy.";
- // Note(jiayq): This removes a const but conceptually
- // local_input_blobs will only be used as const blob input for the
- // base op so we are still fine.
- local_input_blobs_[i]->ShareExternal(
- const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
- OperatorBase::Inputs()[i]->meta());
- }
- }
-
- // Sync to make sure copies are done.
- if (need_sync) {
- context_.FinishDeviceComputation();
- }
-
- if (!base_op_->Run()) {
- LOG(ERROR) << "Base op run failed in GPUFallbackOp. Def: "
- << ProtoDebugString(this->debug_def());
- return false;
- }
- for (int i = 0; i < OutputSize(); ++i) {
- if (SkipOutputCopy::Contains(i)) {
- VLOG(1) << "Copy output: index " << i << " skipped.";
- continue;
- }
- CAFFE_ENFORCE(
- local_output_blobs_[i]->template IsType<TensorCPU>(),
- "GPU fallback op currently does not support non-TensorCPU "
- "output type who needs copying.");
- Output(i)->CopyFrom(
- local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
- }
- return true;
- }
-
- protected:
- Workspace local_ws_;
- vector<Blob*> local_input_blobs_;
- vector<Blob*> local_output_blobs_;
- std::unique_ptr<CPUOp> base_op_;
-};
-
-} // namespace caffe2
-
-#endif // CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
diff --git a/caffe2/operators/hip/operator_fallback_hip_test.cc b/caffe2/operators/hip/operator_fallback_hip_test.cc
deleted file mode 100644
index 4a074c35f8..0000000000
--- a/caffe2/operators/hip/operator_fallback_hip_test.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-#include <iostream>
-
-#include <gtest/gtest.h>
-#include "caffe2/core/operator.h"
-#include "caffe2/operators/hip/operator_fallback_hip.h"
-
-namespace caffe2 {
-
-class IncrementByOneOp final : public Operator<CPUContext> {
- public:
- IncrementByOneOp(const OperatorDef& def, Workspace* ws)
- : Operator<CPUContext>(def, ws) {}
- bool RunOnDevice() {
- const auto& in = Input(0);
- auto* out = Output(0);
- out->ResizeLike(in);
- const float* in_data = in.template data<float>();
- float* out_data = out->template mutable_data<float>();
- for (int i = 0; i < in.size(); ++i) {
- out_data[i] = in_data[i] + 1.f;
- }
- return true;
- }
-};
-
-OPERATOR_SCHEMA(IncrementByOne)
- .NumInputs(1)
- .NumOutputs(1)
- .AllowInplace({{0, 0}});
-
-REGISTER_CPU_OPERATOR(IncrementByOne, IncrementByOneOp);
-REGISTER_HIP_OPERATOR(IncrementByOne, GPUFallbackOp<IncrementByOneOp>);
-
-TEST(OperatorFallbackTest, IncrementByOneOp) {
- OperatorDef op_def = CreateOperatorDef(
- "IncrementByOne", "", vector<string>{"X"}, vector<string>{"X"});
- Workspace ws;
- TensorCPU source_tensor(vector<TIndex>{2, 3});
- for (int i = 0; i < 6; ++i) {
- source_tensor.mutable_data<float>()[i] = i;
- }
- ws.CreateBlob("X")->GetMutable<TensorCPU>()->CopyFrom(source_tensor);
- unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
- EXPECT_TRUE(op.get() != nullptr);
- EXPECT_TRUE(op->Run());
- const TensorCPU& output = ws.GetBlob("X")->Get<TensorCPU>();
- EXPECT_EQ(output.ndim(), 2);
- EXPECT_EQ(output.dim(0), 2);
- EXPECT_EQ(output.dim(1), 3);
- for (int i = 0; i < 6; ++i) {
- EXPECT_EQ(output.data<float>()[i], i + 1);
- }
-}
-
-TEST(OperatorFallbackTest, GPUIncrementByOneOp) {
- if (!HasHipGPU())
- return;
- OperatorDef op_def = CreateOperatorDef(
- "IncrementByOne", "", vector<string>{"X"}, vector<string>{"X"});
- op_def.mutable_device_option()->set_device_type(HIP);
- Workspace ws;
- TensorCPU source_tensor(vector<TIndex>{2, 3});
- for (int i = 0; i < 6; ++i) {
- source_tensor.mutable_data<float>()[i] = i;
- }
- ws.CreateBlob("X")->GetMutable<TensorHIP>()->CopyFrom(source_tensor);
- unique_ptr<OperatorBase> op(CreateOperator(op_def, &ws));
- EXPECT_TRUE(op.get() != nullptr);
- EXPECT_TRUE(op->Run());
- const TensorHIP& output = ws.GetBlob("X")->Get<TensorHIP>();
- TensorCPU output_cpu(output);
- EXPECT_EQ(output.ndim(), 2);
- EXPECT_EQ(output.dim(0), 2);
- EXPECT_EQ(output.dim(1), 3);
- for (int i = 0; i < 6; ++i) {
- EXPECT_EQ(output_cpu.data<float>()[i], i + 1);
- }
-}
-
-} // namespace caffe2