diff options
author | Hao Lu <hlu@fb.com> | 2017-09-11 16:25:03 -0700 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2017-09-11 16:32:41 -0700 |
commit | 98173850b2ee6d7be9da177cb55913f571826745 (patch) | |
tree | 37f10987fd1f34096d27ca8e70ddb177c5628098 | |
parent | ebf7784840b8b23627f41d0906813cbffe0f7082 (diff) | |
download | pytorch-98173850b2ee6d7be9da177cb55913f571826745.tar.gz pytorch-98173850b2ee6d7be9da177cb55913f571826745.tar.bz2 pytorch-98173850b2ee6d7be9da177cb55913f571826745.zip |
Make iOS segmentation net run with OpenGL
Reviewed By: fricc33
Differential Revision: D5803411
fbshipit-source-id: d208771d59f99b4f95ce67849baf369c14e66b37
-rw-r--r-- | caffe2/mobile/caffe2/mobile/contrib/opengl/core/rewrite_net.cc | 368 | ||||
-rw-r--r-- | caffe2/mobile/contrib/opengl/core/rewrite_net.cc | 8 | ||||
-rw-r--r-- | caffe2/share/caffe2/mobile/contrib/opengl/core/rewrite_net.cc | 368 | ||||
-rw-r--r-- | caffe2/share/contrib/opengl/core/rewrite_net.cc | 368 | ||||
-rw-r--r-- | contrib/opengl/core/rewrite_net.cc | 368 |
5 files changed, 1480 insertions, 0 deletions
diff --git a/caffe2/mobile/caffe2/mobile/contrib/opengl/core/rewrite_net.cc b/caffe2/mobile/caffe2/mobile/contrib/opengl/core/rewrite_net.cc new file mode 100644 index 0000000000..c69cb7b947 --- /dev/null +++ b/caffe2/mobile/caffe2/mobile/contrib/opengl/core/rewrite_net.cc @@ -0,0 +1,368 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#include "rewrite_net.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/proto_utils.h" +#include <unordered_map> +#include <unordered_set> + +#ifdef CAFFE2_ANDROID +#include "../android/AndroidGLContext.h" +#endif + +namespace caffe2 { + +struct Analysis { + struct SSA { + using BlobVersions = std::unordered_map<std::string, size_t>; + BlobVersions inVersions; + BlobVersions outVersions; + }; + std::vector<SSA> ssa; + std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages; +}; + +static Analysis analyzeNet(const NetDef& net) { + Analysis::SSA::BlobVersions frontier; + Analysis analysis; + + auto play = [&](size_t i, const OperatorDef& op) { + Analysis::SSA::BlobVersions inVersions; + for (const auto& s : op.input()) { + inVersions[s] = frontier[s]; + analysis.inUsages[s][frontier[s]].push_back(i); + } + Analysis::SSA::BlobVersions outVersions; + for (const auto& s : op.output()) { + if (frontier.find(s) != frontier.end()) { + frontier[s] += 1; + } + outVersions[s] = frontier[s]; + } + analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions}); + }; + + for (auto i = 0; i < net.op_size(); ++i) { + play(i, net.op(i)); + } + return analysis; +} + +static void insertCopyToGPUOp(NetDef& predictNet, const std::string& cpu_blob) { + auto* op = predictNet.add_op(); + op->set_name("CopyToOpenGL"); + op->set_type("CopyToOpenGL"); + op->add_input(cpu_blob); + op->add_output(cpu_blob + "_M"); +} + +static void insertCopyFromGPUOp(NetDef& predictNet, const std::string& cpu_blob) { + // add argument "is_last" to the last op to signal this is the last operator before the + // CopyFromOpenGL op + auto* last_op = predictNet.mutable_op(predictNet.op_size() - 1); + auto* arg = last_op->add_arg(); + arg->set_name("is_last"); + arg->set_i(1); + + auto* op = predictNet.add_op(); + op->set_name("CopyFromOpenGL"); + op->set_type("CopyFromOpenGL"); + op->add_input(cpu_blob + "_M"); + op->add_output(cpu_blob); +} + +static NetDef insertInputOutputCopyOps(const NetDef& def, std::unordered_set<std::string>& glOps) { + // Do some validation of the outputs. For this version, we require: + // - a single input (first element of external_input()) is consumed by the NetDef + // - a single output (first element of external_output()) is produced by the NetDef. + // - the input is consumed by def.op(0), and this is the only consumer. + // - the output is produced by def.op(-1). + CAFFE_ENFORCE_GE(def.external_input_size(), 1); + CAFFE_ENFORCE_GE(def.external_output_size(), 1); + auto analysis = analyzeNet(def); + // enforce a single use of the input blob. + CAFFE_ENFORCE_GE(def.op_size(), 1); + + const auto& inputBlob = def.external_input(0); + // Enforce that the input blob has a single usage - in the first operator. + CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0})); + // Enforce that the external_output(0) blob is produced by the last operator in this sequence. + const auto& outputBlob = def.external_output(0); + CAFFE_ENFORCE(analysis.ssa.back().outVersions.find(outputBlob) != + analysis.ssa.back().outVersions.end()); + const auto& outputBlobVersion = analysis.ssa.back().outVersions[outputBlob]; + // This should hold true by definition of the SSA analysis. + CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) == + analysis.inUsages[outputBlob].end()); + + NetDef mdef; + mdef.CopyFrom(def); + mdef.clear_op(); + + std::unordered_map<std::string, std::set<size_t>> cpu_blobs, gpu_blobs; + cpu_blobs[def.external_input(0)].insert(0); + + for (auto i = 0; i < def.op_size(); i++) { + const auto& currentOp = def.op(i); + if (glOps.count(currentOp.type()) > 0) { + // OpenGL Op + // insert copyToOpenGLOp + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (cpu_blobs[input].count(version) > 0) { + insertCopyToGPUOp(mdef, input); + gpu_blobs[input].insert(version); + cpu_blobs[input].erase(version); + } + // Only the first input should be OpenGL texture + // Otherwise, copyToOpenGLOp will be inserted for the weights, + // which are outputs of QuantDecode + if (currentOp.type().find("OpenGLConv") == 0) { + if (j == 0) { + break; + } + } + } + + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + + // swap input blob + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (gpu_blobs[input].count(version) > 0) { + op->set_input(j, input + "_M"); + } + } + + // swap output blob + for (auto j = 0; j < currentOp.output_size(); j++) { + auto& output = currentOp.output(j); + auto version = analysis.ssa[i].outVersions[output]; + op->set_output(j, output + "_M"); + gpu_blobs[output].insert(version); + } + // insert copyFromOpenGLOp after the last op if the last op is an OpenGL op + if (i == def.op_size() - 1) { + insertCopyFromGPUOp(mdef, currentOp.output(0)); + } + } else { + // CPU Op + // insert copyFromOpenGLOp + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (gpu_blobs[input].count(version) > 0) { + insertCopyFromGPUOp(mdef, input); + } + } + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + for (auto j = 0; j < currentOp.output_size(); j++) { + auto& output = currentOp.output(j); + auto version = analysis.ssa[i].outVersions[output]; + cpu_blobs[output].insert(version); + } + } + } + return mdef; +} + +static bool tryFuseAdjacentOps(const OperatorDef& currentOp, + const OperatorDef& nextOp, + OperatorDef* fusedOp, + std::unordered_set<std::string>& glOps) { + // Check for possible invalid opportunities. + if (currentOp.output_size() != 1 || nextOp.output_size() != 1) { + return false; + } + // The fused op cannot be inplace + if (currentOp.output(0) != nextOp.input(0) || currentOp.input(0) == nextOp.output(0)) { + return false; + } + + static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = { + {{"OpenGLInstanceNorm", "OpenGLPRelu"}, "OpenGLInstanceNormPRelu"}, + {{"OpenGLConv", "OpenGLPRelu"}, "OpenGLConvPRelu"}, + {{"OpenGLConv", "OpenGLRelu"}, "OpenGLConvRelu"}, + {{"OpenGLConvTranspose", "OpenGLPRelu"}, "OpenGLConvTransposePRelu"}}; + auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()}); + if (it == fusionOpportunities.end()) { + return false; + } + + glOps.insert(it->second); + fusedOp->CopyFrom(currentOp); + fusedOp->set_output(0, nextOp.output(0)); + fusedOp->set_type(it->second); + for (auto i = 1; i < nextOp.input_size(); i++) { + fusedOp->add_input(nextOp.input(i)); + } + return true; +} + +static NetDef runOpenGLFusion(const NetDef& def, std::unordered_set<std::string>& glOps) { + CHECK_GE(def.op_size(), 1); + NetDef mdef; + mdef.CopyFrom(def); + mdef.clear_op(); + auto i = 0; + + while (i < def.op_size()) { + if (i == def.op_size() - 1) { + VLOG(2) << "Last operator, skipping"; + auto* op = mdef.add_op(); + op->CopyFrom(def.op(i)); + i += 1; + continue; + } + + const auto& currentOp = def.op(i); + const auto& nextOp = def.op(i + 1); + OperatorDef fusedOp; + if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp, glOps)) { + VLOG(2) << "Found an adjacent fusion for: " << currentOp.type() << ", " << nextOp.type(); + // We can fuse. + auto* op = mdef.add_op(); + op->CopyFrom(fusedOp); + i += 2; + continue; + } + VLOG(2) << "No fusion available for: " << currentOp.type() << ", " << nextOp.type(); + // Just emit the current type. + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + i += 1; + } + return mdef; +} + +void dumpDefForOpenGL(const NetDef& d) { + for (const auto& op : d.op()) { + LOG(INFO) << op.input(0) << " -> " << op.type() << " -> " << op.output(0); + } +} + +// // For debugging +// void dumpDefForOpenGL(const NetDef &net) { +// for (const auto &op : net.op()) { +// printf("***Operator: %s\n", op.type().c_str()); +// for (auto input : op.input()) { +// printf("\tInput: %s\n", input.c_str()); +// } +// +// for (auto output : op.output()) { +// printf("\tOutput: %s\n", output.c_str()); +// } +// } +//} + +NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling, bool runFusion) { + CAFFE_ENFORCE_GE(predictNet.op_size(), 1); + NetDef net; + net.CopyFrom(predictNet); + + std::unordered_map<std::string, std::string> replacements( + {{"OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess", + useTextureInput ? "OpenGLTextureToTextureStylizerPreprocess" + : "OpenGLTensorToTextureStylizerPreprocess"}, + {"OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess", + useTextureInput ? "OpenGLTextureToTextureStylizerDeprocess" + : "OpenGLTextureToTensorStylizerDeprocess"}}); + + std::unordered_set<std::string> openGLOps; // Used to insert copy ops + bool needCopyOps = false; + + const auto& opKeyList = CPUOperatorRegistry()->Keys(); + auto opKeySet = std::set<std::string>(opKeyList.begin(), opKeyList.end()); + +#ifdef CAFFE2_ANDROID + // TODO: debug InstanceNorm models on Mali devices + AndroidGLContext* context = (AndroidGLContext*)GLContext::getGLContext(); + if (context->get_platform() == Mali) { + opKeySet.erase("OpenGLInstanceNorm"); + opKeySet.erase("OpenGLInstanceNormPRelu"); + } +#endif + for (auto i = 0; i < net.op_size(); ++i) { + auto* op = net.mutable_op(i); + string openGLOp = std::string("OpenGL") + op->type(); + if (replacements.count(openGLOp) > 0) { + openGLOp = replacements[openGLOp]; + } + + if (opKeySet.find(openGLOp) != opKeySet.end()) { + op->set_type(openGLOp); + openGLOps.insert(openGLOp); + + if (useTiling) { + auto* arg = op->add_arg(); + arg->set_name("tiling"); + arg->set_i(1); + } + } else { + needCopyOps = true; + } + } + + if (useTextureInput && needCopyOps) { + CAFFE_THROW("OpenGL operator missing"); + } + + if (runFusion) { + net = runOpenGLFusion(net, openGLOps); + } + + if (net.op(0).type() == replacements["OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess"]) { + // For end-to-end testing + if (net.op(net.op_size() - 1).type() != + replacements["OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess"]) { + auto* last_op = net.mutable_op(net.op_size() - 1); + auto output = last_op->output(0) + "M"; + last_op->set_output(0, output); + auto* copy_op = net.add_op(); + copy_op->set_name("CopyFromOpenGL"); + copy_op->set_type("CopyFromOpenGL"); + copy_op->add_input(output); + // rename output blob in case input and output blob has the same name + copy_op->add_output(net.external_output(0)); + } + } else { + if (!useTextureInput) { + needCopyOps = true; + } + } + + // copy ops are needed when the input is not a texture + if (needCopyOps) { + // For non style transfer cases + net = insertInputOutputCopyOps(net, openGLOps); + } + + return net; +} + +bool tryConvertToOpenGL(const NetDef& initNet, + const NetDef& predictNet, + NetDef* glPredictNet, + bool useTextureInput, + bool useTiling, + bool runFusion) { + try { + // Throws if unsupported operators are found. + *glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput, useTiling, runFusion); + dumpDefForOpenGL(*glPredictNet); + // Throws if unsupported parameters are found. + Workspace ws; + ws.RunNetOnce(initNet); + ws.CreateNet(*glPredictNet); + LOG(INFO) << "OpenGL is successfully enabled"; + return true; + } catch (const std::exception& e) { + LOG(ERROR) << "Caught exception trying to convert NetDef to OpenGL: " << e.what(); + return false; + } +} +} // namespace caffe2 diff --git a/caffe2/mobile/contrib/opengl/core/rewrite_net.cc b/caffe2/mobile/contrib/opengl/core/rewrite_net.cc index 569312a33b..c69cb7b947 100644 --- a/caffe2/mobile/contrib/opengl/core/rewrite_net.cc +++ b/caffe2/mobile/contrib/opengl/core/rewrite_net.cc @@ -115,6 +115,14 @@ static NetDef insertInputOutputCopyOps(const NetDef& def, std::unordered_set<std gpu_blobs[input].insert(version); cpu_blobs[input].erase(version); } + // Only the first input should be OpenGL texture + // Otherwise, copyToOpenGLOp will be inserted for the weights, + // which are outputs of QuantDecode + if (currentOp.type().find("OpenGLConv") == 0) { + if (j == 0) { + break; + } + } } auto* op = mdef.add_op(); diff --git a/caffe2/share/caffe2/mobile/contrib/opengl/core/rewrite_net.cc b/caffe2/share/caffe2/mobile/contrib/opengl/core/rewrite_net.cc new file mode 100644 index 0000000000..c69cb7b947 --- /dev/null +++ b/caffe2/share/caffe2/mobile/contrib/opengl/core/rewrite_net.cc @@ -0,0 +1,368 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#include "rewrite_net.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/proto_utils.h" +#include <unordered_map> +#include <unordered_set> + +#ifdef CAFFE2_ANDROID +#include "../android/AndroidGLContext.h" +#endif + +namespace caffe2 { + +struct Analysis { + struct SSA { + using BlobVersions = std::unordered_map<std::string, size_t>; + BlobVersions inVersions; + BlobVersions outVersions; + }; + std::vector<SSA> ssa; + std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages; +}; + +static Analysis analyzeNet(const NetDef& net) { + Analysis::SSA::BlobVersions frontier; + Analysis analysis; + + auto play = [&](size_t i, const OperatorDef& op) { + Analysis::SSA::BlobVersions inVersions; + for (const auto& s : op.input()) { + inVersions[s] = frontier[s]; + analysis.inUsages[s][frontier[s]].push_back(i); + } + Analysis::SSA::BlobVersions outVersions; + for (const auto& s : op.output()) { + if (frontier.find(s) != frontier.end()) { + frontier[s] += 1; + } + outVersions[s] = frontier[s]; + } + analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions}); + }; + + for (auto i = 0; i < net.op_size(); ++i) { + play(i, net.op(i)); + } + return analysis; +} + +static void insertCopyToGPUOp(NetDef& predictNet, const std::string& cpu_blob) { + auto* op = predictNet.add_op(); + op->set_name("CopyToOpenGL"); + op->set_type("CopyToOpenGL"); + op->add_input(cpu_blob); + op->add_output(cpu_blob + "_M"); +} + +static void insertCopyFromGPUOp(NetDef& predictNet, const std::string& cpu_blob) { + // add argument "is_last" to the last op to signal this is the last operator before the + // CopyFromOpenGL op + auto* last_op = predictNet.mutable_op(predictNet.op_size() - 1); + auto* arg = last_op->add_arg(); + arg->set_name("is_last"); + arg->set_i(1); + + auto* op = predictNet.add_op(); + op->set_name("CopyFromOpenGL"); + op->set_type("CopyFromOpenGL"); + op->add_input(cpu_blob + "_M"); + op->add_output(cpu_blob); +} + +static NetDef insertInputOutputCopyOps(const NetDef& def, std::unordered_set<std::string>& glOps) { + // Do some validation of the outputs. For this version, we require: + // - a single input (first element of external_input()) is consumed by the NetDef + // - a single output (first element of external_output()) is produced by the NetDef. + // - the input is consumed by def.op(0), and this is the only consumer. + // - the output is produced by def.op(-1). + CAFFE_ENFORCE_GE(def.external_input_size(), 1); + CAFFE_ENFORCE_GE(def.external_output_size(), 1); + auto analysis = analyzeNet(def); + // enforce a single use of the input blob. + CAFFE_ENFORCE_GE(def.op_size(), 1); + + const auto& inputBlob = def.external_input(0); + // Enforce that the input blob has a single usage - in the first operator. + CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0})); + // Enforce that the external_output(0) blob is produced by the last operator in this sequence. + const auto& outputBlob = def.external_output(0); + CAFFE_ENFORCE(analysis.ssa.back().outVersions.find(outputBlob) != + analysis.ssa.back().outVersions.end()); + const auto& outputBlobVersion = analysis.ssa.back().outVersions[outputBlob]; + // This should hold true by definition of the SSA analysis. + CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) == + analysis.inUsages[outputBlob].end()); + + NetDef mdef; + mdef.CopyFrom(def); + mdef.clear_op(); + + std::unordered_map<std::string, std::set<size_t>> cpu_blobs, gpu_blobs; + cpu_blobs[def.external_input(0)].insert(0); + + for (auto i = 0; i < def.op_size(); i++) { + const auto& currentOp = def.op(i); + if (glOps.count(currentOp.type()) > 0) { + // OpenGL Op + // insert copyToOpenGLOp + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (cpu_blobs[input].count(version) > 0) { + insertCopyToGPUOp(mdef, input); + gpu_blobs[input].insert(version); + cpu_blobs[input].erase(version); + } + // Only the first input should be OpenGL texture + // Otherwise, copyToOpenGLOp will be inserted for the weights, + // which are outputs of QuantDecode + if (currentOp.type().find("OpenGLConv") == 0) { + if (j == 0) { + break; + } + } + } + + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + + // swap input blob + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (gpu_blobs[input].count(version) > 0) { + op->set_input(j, input + "_M"); + } + } + + // swap output blob + for (auto j = 0; j < currentOp.output_size(); j++) { + auto& output = currentOp.output(j); + auto version = analysis.ssa[i].outVersions[output]; + op->set_output(j, output + "_M"); + gpu_blobs[output].insert(version); + } + // insert copyFromOpenGLOp after the last op if the last op is an OpenGL op + if (i == def.op_size() - 1) { + insertCopyFromGPUOp(mdef, currentOp.output(0)); + } + } else { + // CPU Op + // insert copyFromOpenGLOp + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (gpu_blobs[input].count(version) > 0) { + insertCopyFromGPUOp(mdef, input); + } + } + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + for (auto j = 0; j < currentOp.output_size(); j++) { + auto& output = currentOp.output(j); + auto version = analysis.ssa[i].outVersions[output]; + cpu_blobs[output].insert(version); + } + } + } + return mdef; +} + +static bool tryFuseAdjacentOps(const OperatorDef& currentOp, + const OperatorDef& nextOp, + OperatorDef* fusedOp, + std::unordered_set<std::string>& glOps) { + // Check for possible invalid opportunities. + if (currentOp.output_size() != 1 || nextOp.output_size() != 1) { + return false; + } + // The fused op cannot be inplace + if (currentOp.output(0) != nextOp.input(0) || currentOp.input(0) == nextOp.output(0)) { + return false; + } + + static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = { + {{"OpenGLInstanceNorm", "OpenGLPRelu"}, "OpenGLInstanceNormPRelu"}, + {{"OpenGLConv", "OpenGLPRelu"}, "OpenGLConvPRelu"}, + {{"OpenGLConv", "OpenGLRelu"}, "OpenGLConvRelu"}, + {{"OpenGLConvTranspose", "OpenGLPRelu"}, "OpenGLConvTransposePRelu"}}; + auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()}); + if (it == fusionOpportunities.end()) { + return false; + } + + glOps.insert(it->second); + fusedOp->CopyFrom(currentOp); + fusedOp->set_output(0, nextOp.output(0)); + fusedOp->set_type(it->second); + for (auto i = 1; i < nextOp.input_size(); i++) { + fusedOp->add_input(nextOp.input(i)); + } + return true; +} + +static NetDef runOpenGLFusion(const NetDef& def, std::unordered_set<std::string>& glOps) { + CHECK_GE(def.op_size(), 1); + NetDef mdef; + mdef.CopyFrom(def); + mdef.clear_op(); + auto i = 0; + + while (i < def.op_size()) { + if (i == def.op_size() - 1) { + VLOG(2) << "Last operator, skipping"; + auto* op = mdef.add_op(); + op->CopyFrom(def.op(i)); + i += 1; + continue; + } + + const auto& currentOp = def.op(i); + const auto& nextOp = def.op(i + 1); + OperatorDef fusedOp; + if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp, glOps)) { + VLOG(2) << "Found an adjacent fusion for: " << currentOp.type() << ", " << nextOp.type(); + // We can fuse. + auto* op = mdef.add_op(); + op->CopyFrom(fusedOp); + i += 2; + continue; + } + VLOG(2) << "No fusion available for: " << currentOp.type() << ", " << nextOp.type(); + // Just emit the current type. + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + i += 1; + } + return mdef; +} + +void dumpDefForOpenGL(const NetDef& d) { + for (const auto& op : d.op()) { + LOG(INFO) << op.input(0) << " -> " << op.type() << " -> " << op.output(0); + } +} + +// // For debugging +// void dumpDefForOpenGL(const NetDef &net) { +// for (const auto &op : net.op()) { +// printf("***Operator: %s\n", op.type().c_str()); +// for (auto input : op.input()) { +// printf("\tInput: %s\n", input.c_str()); +// } +// +// for (auto output : op.output()) { +// printf("\tOutput: %s\n", output.c_str()); +// } +// } +//} + +NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling, bool runFusion) { + CAFFE_ENFORCE_GE(predictNet.op_size(), 1); + NetDef net; + net.CopyFrom(predictNet); + + std::unordered_map<std::string, std::string> replacements( + {{"OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess", + useTextureInput ? "OpenGLTextureToTextureStylizerPreprocess" + : "OpenGLTensorToTextureStylizerPreprocess"}, + {"OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess", + useTextureInput ? "OpenGLTextureToTextureStylizerDeprocess" + : "OpenGLTextureToTensorStylizerDeprocess"}}); + + std::unordered_set<std::string> openGLOps; // Used to insert copy ops + bool needCopyOps = false; + + const auto& opKeyList = CPUOperatorRegistry()->Keys(); + auto opKeySet = std::set<std::string>(opKeyList.begin(), opKeyList.end()); + +#ifdef CAFFE2_ANDROID + // TODO: debug InstanceNorm models on Mali devices + AndroidGLContext* context = (AndroidGLContext*)GLContext::getGLContext(); + if (context->get_platform() == Mali) { + opKeySet.erase("OpenGLInstanceNorm"); + opKeySet.erase("OpenGLInstanceNormPRelu"); + } +#endif + for (auto i = 0; i < net.op_size(); ++i) { + auto* op = net.mutable_op(i); + string openGLOp = std::string("OpenGL") + op->type(); + if (replacements.count(openGLOp) > 0) { + openGLOp = replacements[openGLOp]; + } + + if (opKeySet.find(openGLOp) != opKeySet.end()) { + op->set_type(openGLOp); + openGLOps.insert(openGLOp); + + if (useTiling) { + auto* arg = op->add_arg(); + arg->set_name("tiling"); + arg->set_i(1); + } + } else { + needCopyOps = true; + } + } + + if (useTextureInput && needCopyOps) { + CAFFE_THROW("OpenGL operator missing"); + } + + if (runFusion) { + net = runOpenGLFusion(net, openGLOps); + } + + if (net.op(0).type() == replacements["OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess"]) { + // For end-to-end testing + if (net.op(net.op_size() - 1).type() != + replacements["OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess"]) { + auto* last_op = net.mutable_op(net.op_size() - 1); + auto output = last_op->output(0) + "M"; + last_op->set_output(0, output); + auto* copy_op = net.add_op(); + copy_op->set_name("CopyFromOpenGL"); + copy_op->set_type("CopyFromOpenGL"); + copy_op->add_input(output); + // rename output blob in case input and output blob has the same name + copy_op->add_output(net.external_output(0)); + } + } else { + if (!useTextureInput) { + needCopyOps = true; + } + } + + // copy ops are needed when the input is not a texture + if (needCopyOps) { + // For non style transfer cases + net = insertInputOutputCopyOps(net, openGLOps); + } + + return net; +} + +bool tryConvertToOpenGL(const NetDef& initNet, + const NetDef& predictNet, + NetDef* glPredictNet, + bool useTextureInput, + bool useTiling, + bool runFusion) { + try { + // Throws if unsupported operators are found. + *glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput, useTiling, runFusion); + dumpDefForOpenGL(*glPredictNet); + // Throws if unsupported parameters are found. + Workspace ws; + ws.RunNetOnce(initNet); + ws.CreateNet(*glPredictNet); + LOG(INFO) << "OpenGL is successfully enabled"; + return true; + } catch (const std::exception& e) { + LOG(ERROR) << "Caught exception trying to convert NetDef to OpenGL: " << e.what(); + return false; + } +} +} // namespace caffe2 diff --git a/caffe2/share/contrib/opengl/core/rewrite_net.cc b/caffe2/share/contrib/opengl/core/rewrite_net.cc new file mode 100644 index 0000000000..c69cb7b947 --- /dev/null +++ b/caffe2/share/contrib/opengl/core/rewrite_net.cc @@ -0,0 +1,368 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#include "rewrite_net.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/proto_utils.h" +#include <unordered_map> +#include <unordered_set> + +#ifdef CAFFE2_ANDROID +#include "../android/AndroidGLContext.h" +#endif + +namespace caffe2 { + +struct Analysis { + struct SSA { + using BlobVersions = std::unordered_map<std::string, size_t>; + BlobVersions inVersions; + BlobVersions outVersions; + }; + std::vector<SSA> ssa; + std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages; +}; + +static Analysis analyzeNet(const NetDef& net) { + Analysis::SSA::BlobVersions frontier; + Analysis analysis; + + auto play = [&](size_t i, const OperatorDef& op) { + Analysis::SSA::BlobVersions inVersions; + for (const auto& s : op.input()) { + inVersions[s] = frontier[s]; + analysis.inUsages[s][frontier[s]].push_back(i); + } + Analysis::SSA::BlobVersions outVersions; + for (const auto& s : op.output()) { + if (frontier.find(s) != frontier.end()) { + frontier[s] += 1; + } + outVersions[s] = frontier[s]; + } + analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions}); + }; + + for (auto i = 0; i < net.op_size(); ++i) { + play(i, net.op(i)); + } + return analysis; +} + +static void insertCopyToGPUOp(NetDef& predictNet, const std::string& cpu_blob) { + auto* op = predictNet.add_op(); + op->set_name("CopyToOpenGL"); + op->set_type("CopyToOpenGL"); + op->add_input(cpu_blob); + op->add_output(cpu_blob + "_M"); +} + +static void insertCopyFromGPUOp(NetDef& predictNet, const std::string& cpu_blob) { + // add argument "is_last" to the last op to signal this is the last operator before the + // CopyFromOpenGL op + auto* last_op = predictNet.mutable_op(predictNet.op_size() - 1); + auto* arg = last_op->add_arg(); + arg->set_name("is_last"); + arg->set_i(1); + + auto* op = predictNet.add_op(); + op->set_name("CopyFromOpenGL"); + op->set_type("CopyFromOpenGL"); + op->add_input(cpu_blob + "_M"); + op->add_output(cpu_blob); +} + +static NetDef insertInputOutputCopyOps(const NetDef& def, std::unordered_set<std::string>& glOps) { + // Do some validation of the outputs. For this version, we require: + // - a single input (first element of external_input()) is consumed by the NetDef + // - a single output (first element of external_output()) is produced by the NetDef. + // - the input is consumed by def.op(0), and this is the only consumer. + // - the output is produced by def.op(-1). + CAFFE_ENFORCE_GE(def.external_input_size(), 1); + CAFFE_ENFORCE_GE(def.external_output_size(), 1); + auto analysis = analyzeNet(def); + // enforce a single use of the input blob. + CAFFE_ENFORCE_GE(def.op_size(), 1); + + const auto& inputBlob = def.external_input(0); + // Enforce that the input blob has a single usage - in the first operator. + CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0})); + // Enforce that the external_output(0) blob is produced by the last operator in this sequence. + const auto& outputBlob = def.external_output(0); + CAFFE_ENFORCE(analysis.ssa.back().outVersions.find(outputBlob) != + analysis.ssa.back().outVersions.end()); + const auto& outputBlobVersion = analysis.ssa.back().outVersions[outputBlob]; + // This should hold true by definition of the SSA analysis. + CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) == + analysis.inUsages[outputBlob].end()); + + NetDef mdef; + mdef.CopyFrom(def); + mdef.clear_op(); + + std::unordered_map<std::string, std::set<size_t>> cpu_blobs, gpu_blobs; + cpu_blobs[def.external_input(0)].insert(0); + + for (auto i = 0; i < def.op_size(); i++) { + const auto& currentOp = def.op(i); + if (glOps.count(currentOp.type()) > 0) { + // OpenGL Op + // insert copyToOpenGLOp + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (cpu_blobs[input].count(version) > 0) { + insertCopyToGPUOp(mdef, input); + gpu_blobs[input].insert(version); + cpu_blobs[input].erase(version); + } + // Only the first input should be OpenGL texture + // Otherwise, copyToOpenGLOp will be inserted for the weights, + // which are outputs of QuantDecode + if (currentOp.type().find("OpenGLConv") == 0) { + if (j == 0) { + break; + } + } + } + + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + + // swap input blob + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (gpu_blobs[input].count(version) > 0) { + op->set_input(j, input + "_M"); + } + } + + // swap output blob + for (auto j = 0; j < currentOp.output_size(); j++) { + auto& output = currentOp.output(j); + auto version = analysis.ssa[i].outVersions[output]; + op->set_output(j, output + "_M"); + gpu_blobs[output].insert(version); + } + // insert copyFromOpenGLOp after the last op if the last op is an OpenGL op + if (i == def.op_size() - 1) { + insertCopyFromGPUOp(mdef, currentOp.output(0)); + } + } else { + // CPU Op + // insert copyFromOpenGLOp + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (gpu_blobs[input].count(version) > 0) { + insertCopyFromGPUOp(mdef, input); + } + } + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + for (auto j = 0; j < currentOp.output_size(); j++) { + auto& output = currentOp.output(j); + auto version = analysis.ssa[i].outVersions[output]; + cpu_blobs[output].insert(version); + } + } + } + return mdef; +} + +static bool tryFuseAdjacentOps(const OperatorDef& currentOp, + const OperatorDef& nextOp, + OperatorDef* fusedOp, + std::unordered_set<std::string>& glOps) { + // Check for possible invalid opportunities. + if (currentOp.output_size() != 1 || nextOp.output_size() != 1) { + return false; + } + // The fused op cannot be inplace + if (currentOp.output(0) != nextOp.input(0) || currentOp.input(0) == nextOp.output(0)) { + return false; + } + + static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = { + {{"OpenGLInstanceNorm", "OpenGLPRelu"}, "OpenGLInstanceNormPRelu"}, + {{"OpenGLConv", "OpenGLPRelu"}, "OpenGLConvPRelu"}, + {{"OpenGLConv", "OpenGLRelu"}, "OpenGLConvRelu"}, + {{"OpenGLConvTranspose", "OpenGLPRelu"}, "OpenGLConvTransposePRelu"}}; + auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()}); + if (it == fusionOpportunities.end()) { + return false; + } + + glOps.insert(it->second); + fusedOp->CopyFrom(currentOp); + fusedOp->set_output(0, nextOp.output(0)); + fusedOp->set_type(it->second); + for (auto i = 1; i < nextOp.input_size(); i++) { + fusedOp->add_input(nextOp.input(i)); + } + return true; +} + +static NetDef runOpenGLFusion(const NetDef& def, std::unordered_set<std::string>& glOps) { + CHECK_GE(def.op_size(), 1); + NetDef mdef; + mdef.CopyFrom(def); + mdef.clear_op(); + auto i = 0; + + while (i < def.op_size()) { + if (i == def.op_size() - 1) { + VLOG(2) << "Last operator, skipping"; + auto* op = mdef.add_op(); + op->CopyFrom(def.op(i)); + i += 1; + continue; + } + + const auto& currentOp = def.op(i); + const auto& nextOp = def.op(i + 1); + OperatorDef fusedOp; + if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp, glOps)) { + VLOG(2) << "Found an adjacent fusion for: " << currentOp.type() << ", " << nextOp.type(); + // We can fuse. + auto* op = mdef.add_op(); + op->CopyFrom(fusedOp); + i += 2; + continue; + } + VLOG(2) << "No fusion available for: " << currentOp.type() << ", " << nextOp.type(); + // Just emit the current type. + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + i += 1; + } + return mdef; +} + +void dumpDefForOpenGL(const NetDef& d) { + for (const auto& op : d.op()) { + LOG(INFO) << op.input(0) << " -> " << op.type() << " -> " << op.output(0); + } +} + +// // For debugging +// void dumpDefForOpenGL(const NetDef &net) { +// for (const auto &op : net.op()) { +// printf("***Operator: %s\n", op.type().c_str()); +// for (auto input : op.input()) { +// printf("\tInput: %s\n", input.c_str()); +// } +// +// for (auto output : op.output()) { +// printf("\tOutput: %s\n", output.c_str()); +// } +// } +//} + +NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling, bool runFusion) { + CAFFE_ENFORCE_GE(predictNet.op_size(), 1); + NetDef net; + net.CopyFrom(predictNet); + + std::unordered_map<std::string, std::string> replacements( + {{"OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess", + useTextureInput ? "OpenGLTextureToTextureStylizerPreprocess" + : "OpenGLTensorToTextureStylizerPreprocess"}, + {"OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess", + useTextureInput ? "OpenGLTextureToTextureStylizerDeprocess" + : "OpenGLTextureToTensorStylizerDeprocess"}}); + + std::unordered_set<std::string> openGLOps; // Used to insert copy ops + bool needCopyOps = false; + + const auto& opKeyList = CPUOperatorRegistry()->Keys(); + auto opKeySet = std::set<std::string>(opKeyList.begin(), opKeyList.end()); + +#ifdef CAFFE2_ANDROID + // TODO: debug InstanceNorm models on Mali devices + AndroidGLContext* context = (AndroidGLContext*)GLContext::getGLContext(); + if (context->get_platform() == Mali) { + opKeySet.erase("OpenGLInstanceNorm"); + opKeySet.erase("OpenGLInstanceNormPRelu"); + } +#endif + for (auto i = 0; i < net.op_size(); ++i) { + auto* op = net.mutable_op(i); + string openGLOp = std::string("OpenGL") + op->type(); + if (replacements.count(openGLOp) > 0) { + openGLOp = replacements[openGLOp]; + } + + if (opKeySet.find(openGLOp) != opKeySet.end()) { + op->set_type(openGLOp); + openGLOps.insert(openGLOp); + + if (useTiling) { + auto* arg = op->add_arg(); + arg->set_name("tiling"); + arg->set_i(1); + } + } else { + needCopyOps = true; + } + } + + if (useTextureInput && needCopyOps) { + CAFFE_THROW("OpenGL operator missing"); + } + + if (runFusion) { + net = runOpenGLFusion(net, openGLOps); + } + + if (net.op(0).type() == replacements["OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess"]) { + // For end-to-end testing + if (net.op(net.op_size() - 1).type() != + replacements["OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess"]) { + auto* last_op = net.mutable_op(net.op_size() - 1); + auto output = last_op->output(0) + "M"; + last_op->set_output(0, output); + auto* copy_op = net.add_op(); + copy_op->set_name("CopyFromOpenGL"); + copy_op->set_type("CopyFromOpenGL"); + copy_op->add_input(output); + // rename output blob in case input and output blob has the same name + copy_op->add_output(net.external_output(0)); + } + } else { + if (!useTextureInput) { + needCopyOps = true; + } + } + + // copy ops are needed when the input is not a texture + if (needCopyOps) { + // For non style transfer cases + net = insertInputOutputCopyOps(net, openGLOps); + } + + return net; +} + +bool tryConvertToOpenGL(const NetDef& initNet, + const NetDef& predictNet, + NetDef* glPredictNet, + bool useTextureInput, + bool useTiling, + bool runFusion) { + try { + // Throws if unsupported operators are found. + *glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput, useTiling, runFusion); + dumpDefForOpenGL(*glPredictNet); + // Throws if unsupported parameters are found. + Workspace ws; + ws.RunNetOnce(initNet); + ws.CreateNet(*glPredictNet); + LOG(INFO) << "OpenGL is successfully enabled"; + return true; + } catch (const std::exception& e) { + LOG(ERROR) << "Caught exception trying to convert NetDef to OpenGL: " << e.what(); + return false; + } +} +} // namespace caffe2 diff --git a/contrib/opengl/core/rewrite_net.cc b/contrib/opengl/core/rewrite_net.cc new file mode 100644 index 0000000000..c69cb7b947 --- /dev/null +++ b/contrib/opengl/core/rewrite_net.cc @@ -0,0 +1,368 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#include "rewrite_net.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/proto_utils.h" +#include <unordered_map> +#include <unordered_set> + +#ifdef CAFFE2_ANDROID +#include "../android/AndroidGLContext.h" +#endif + +namespace caffe2 { + +struct Analysis { + struct SSA { + using BlobVersions = std::unordered_map<std::string, size_t>; + BlobVersions inVersions; + BlobVersions outVersions; + }; + std::vector<SSA> ssa; + std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages; +}; + +static Analysis analyzeNet(const NetDef& net) { + Analysis::SSA::BlobVersions frontier; + Analysis analysis; + + auto play = [&](size_t i, const OperatorDef& op) { + Analysis::SSA::BlobVersions inVersions; + for (const auto& s : op.input()) { + inVersions[s] = frontier[s]; + analysis.inUsages[s][frontier[s]].push_back(i); + } + Analysis::SSA::BlobVersions outVersions; + for (const auto& s : op.output()) { + if (frontier.find(s) != frontier.end()) { + frontier[s] += 1; + } + outVersions[s] = frontier[s]; + } + analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions}); + }; + + for (auto i = 0; i < net.op_size(); ++i) { + play(i, net.op(i)); + } + return analysis; +} + +static void insertCopyToGPUOp(NetDef& predictNet, const std::string& cpu_blob) { + auto* op = predictNet.add_op(); + op->set_name("CopyToOpenGL"); + op->set_type("CopyToOpenGL"); + op->add_input(cpu_blob); + op->add_output(cpu_blob + "_M"); +} + +static void insertCopyFromGPUOp(NetDef& predictNet, const std::string& cpu_blob) { + // add argument "is_last" to the last op to signal this is the last operator before the + // CopyFromOpenGL op + auto* last_op = predictNet.mutable_op(predictNet.op_size() - 1); + auto* arg = last_op->add_arg(); + arg->set_name("is_last"); + arg->set_i(1); + + auto* op = predictNet.add_op(); + op->set_name("CopyFromOpenGL"); + op->set_type("CopyFromOpenGL"); + op->add_input(cpu_blob + "_M"); + op->add_output(cpu_blob); +} + +static NetDef insertInputOutputCopyOps(const NetDef& def, std::unordered_set<std::string>& glOps) { + // Do some validation of the outputs. For this version, we require: + // - a single input (first element of external_input()) is consumed by the NetDef + // - a single output (first element of external_output()) is produced by the NetDef. + // - the input is consumed by def.op(0), and this is the only consumer. + // - the output is produced by def.op(-1). + CAFFE_ENFORCE_GE(def.external_input_size(), 1); + CAFFE_ENFORCE_GE(def.external_output_size(), 1); + auto analysis = analyzeNet(def); + // enforce a single use of the input blob. + CAFFE_ENFORCE_GE(def.op_size(), 1); + + const auto& inputBlob = def.external_input(0); + // Enforce that the input blob has a single usage - in the first operator. + CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0})); + // Enforce that the external_output(0) blob is produced by the last operator in this sequence. + const auto& outputBlob = def.external_output(0); + CAFFE_ENFORCE(analysis.ssa.back().outVersions.find(outputBlob) != + analysis.ssa.back().outVersions.end()); + const auto& outputBlobVersion = analysis.ssa.back().outVersions[outputBlob]; + // This should hold true by definition of the SSA analysis. + CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) == + analysis.inUsages[outputBlob].end()); + + NetDef mdef; + mdef.CopyFrom(def); + mdef.clear_op(); + + std::unordered_map<std::string, std::set<size_t>> cpu_blobs, gpu_blobs; + cpu_blobs[def.external_input(0)].insert(0); + + for (auto i = 0; i < def.op_size(); i++) { + const auto& currentOp = def.op(i); + if (glOps.count(currentOp.type()) > 0) { + // OpenGL Op + // insert copyToOpenGLOp + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (cpu_blobs[input].count(version) > 0) { + insertCopyToGPUOp(mdef, input); + gpu_blobs[input].insert(version); + cpu_blobs[input].erase(version); + } + // Only the first input should be OpenGL texture + // Otherwise, copyToOpenGLOp will be inserted for the weights, + // which are outputs of QuantDecode + if (currentOp.type().find("OpenGLConv") == 0) { + if (j == 0) { + break; + } + } + } + + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + + // swap input blob + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (gpu_blobs[input].count(version) > 0) { + op->set_input(j, input + "_M"); + } + } + + // swap output blob + for (auto j = 0; j < currentOp.output_size(); j++) { + auto& output = currentOp.output(j); + auto version = analysis.ssa[i].outVersions[output]; + op->set_output(j, output + "_M"); + gpu_blobs[output].insert(version); + } + // insert copyFromOpenGLOp after the last op if the last op is an OpenGL op + if (i == def.op_size() - 1) { + insertCopyFromGPUOp(mdef, currentOp.output(0)); + } + } else { + // CPU Op + // insert copyFromOpenGLOp + for (auto j = 0; j < currentOp.input_size(); j++) { + auto& input = currentOp.input(j); + auto version = analysis.ssa[i].inVersions[input]; + if (gpu_blobs[input].count(version) > 0) { + insertCopyFromGPUOp(mdef, input); + } + } + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + for (auto j = 0; j < currentOp.output_size(); j++) { + auto& output = currentOp.output(j); + auto version = analysis.ssa[i].outVersions[output]; + cpu_blobs[output].insert(version); + } + } + } + return mdef; +} + +static bool tryFuseAdjacentOps(const OperatorDef& currentOp, + const OperatorDef& nextOp, + OperatorDef* fusedOp, + std::unordered_set<std::string>& glOps) { + // Check for possible invalid opportunities. + if (currentOp.output_size() != 1 || nextOp.output_size() != 1) { + return false; + } + // The fused op cannot be inplace + if (currentOp.output(0) != nextOp.input(0) || currentOp.input(0) == nextOp.output(0)) { + return false; + } + + static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = { + {{"OpenGLInstanceNorm", "OpenGLPRelu"}, "OpenGLInstanceNormPRelu"}, + {{"OpenGLConv", "OpenGLPRelu"}, "OpenGLConvPRelu"}, + {{"OpenGLConv", "OpenGLRelu"}, "OpenGLConvRelu"}, + {{"OpenGLConvTranspose", "OpenGLPRelu"}, "OpenGLConvTransposePRelu"}}; + auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()}); + if (it == fusionOpportunities.end()) { + return false; + } + + glOps.insert(it->second); + fusedOp->CopyFrom(currentOp); + fusedOp->set_output(0, nextOp.output(0)); + fusedOp->set_type(it->second); + for (auto i = 1; i < nextOp.input_size(); i++) { + fusedOp->add_input(nextOp.input(i)); + } + return true; +} + +static NetDef runOpenGLFusion(const NetDef& def, std::unordered_set<std::string>& glOps) { + CHECK_GE(def.op_size(), 1); + NetDef mdef; + mdef.CopyFrom(def); + mdef.clear_op(); + auto i = 0; + + while (i < def.op_size()) { + if (i == def.op_size() - 1) { + VLOG(2) << "Last operator, skipping"; + auto* op = mdef.add_op(); + op->CopyFrom(def.op(i)); + i += 1; + continue; + } + + const auto& currentOp = def.op(i); + const auto& nextOp = def.op(i + 1); + OperatorDef fusedOp; + if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp, glOps)) { + VLOG(2) << "Found an adjacent fusion for: " << currentOp.type() << ", " << nextOp.type(); + // We can fuse. + auto* op = mdef.add_op(); + op->CopyFrom(fusedOp); + i += 2; + continue; + } + VLOG(2) << "No fusion available for: " << currentOp.type() << ", " << nextOp.type(); + // Just emit the current type. + auto* op = mdef.add_op(); + op->CopyFrom(currentOp); + i += 1; + } + return mdef; +} + +void dumpDefForOpenGL(const NetDef& d) { + for (const auto& op : d.op()) { + LOG(INFO) << op.input(0) << " -> " << op.type() << " -> " << op.output(0); + } +} + +// // For debugging +// void dumpDefForOpenGL(const NetDef &net) { +// for (const auto &op : net.op()) { +// printf("***Operator: %s\n", op.type().c_str()); +// for (auto input : op.input()) { +// printf("\tInput: %s\n", input.c_str()); +// } +// +// for (auto output : op.output()) { +// printf("\tOutput: %s\n", output.c_str()); +// } +// } +//} + +NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool useTextureInput, bool useTiling, bool runFusion) { + CAFFE_ENFORCE_GE(predictNet.op_size(), 1); + NetDef net; + net.CopyFrom(predictNet); + + std::unordered_map<std::string, std::string> replacements( + {{"OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess", + useTextureInput ? "OpenGLTextureToTextureStylizerPreprocess" + : "OpenGLTensorToTextureStylizerPreprocess"}, + {"OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess", + useTextureInput ? "OpenGLTextureToTextureStylizerDeprocess" + : "OpenGLTextureToTensorStylizerDeprocess"}}); + + std::unordered_set<std::string> openGLOps; // Used to insert copy ops + bool needCopyOps = false; + + const auto& opKeyList = CPUOperatorRegistry()->Keys(); + auto opKeySet = std::set<std::string>(opKeyList.begin(), opKeyList.end()); + +#ifdef CAFFE2_ANDROID + // TODO: debug InstanceNorm models on Mali devices + AndroidGLContext* context = (AndroidGLContext*)GLContext::getGLContext(); + if (context->get_platform() == Mali) { + opKeySet.erase("OpenGLInstanceNorm"); + opKeySet.erase("OpenGLInstanceNormPRelu"); + } +#endif + for (auto i = 0; i < net.op_size(); ++i) { + auto* op = net.mutable_op(i); + string openGLOp = std::string("OpenGL") + op->type(); + if (replacements.count(openGLOp) > 0) { + openGLOp = replacements[openGLOp]; + } + + if (opKeySet.find(openGLOp) != opKeySet.end()) { + op->set_type(openGLOp); + openGLOps.insert(openGLOp); + + if (useTiling) { + auto* arg = op->add_arg(); + arg->set_name("tiling"); + arg->set_i(1); + } + } else { + needCopyOps = true; + } + } + + if (useTextureInput && needCopyOps) { + CAFFE_THROW("OpenGL operator missing"); + } + + if (runFusion) { + net = runOpenGLFusion(net, openGLOps); + } + + if (net.op(0).type() == replacements["OpenGLPackedInt8BGRANHWCToNCHWCStylizerPreprocess"]) { + // For end-to-end testing + if (net.op(net.op_size() - 1).type() != + replacements["OpenGLBRGNCHWCToPackedInt8BGRAStylizerDeprocess"]) { + auto* last_op = net.mutable_op(net.op_size() - 1); + auto output = last_op->output(0) + "M"; + last_op->set_output(0, output); + auto* copy_op = net.add_op(); + copy_op->set_name("CopyFromOpenGL"); + copy_op->set_type("CopyFromOpenGL"); + copy_op->add_input(output); + // rename output blob in case input and output blob has the same name + copy_op->add_output(net.external_output(0)); + } + } else { + if (!useTextureInput) { + needCopyOps = true; + } + } + + // copy ops are needed when the input is not a texture + if (needCopyOps) { + // For non style transfer cases + net = insertInputOutputCopyOps(net, openGLOps); + } + + return net; +} + +bool tryConvertToOpenGL(const NetDef& initNet, + const NetDef& predictNet, + NetDef* glPredictNet, + bool useTextureInput, + bool useTiling, + bool runFusion) { + try { + // Throws if unsupported operators are found. + *glPredictNet = rewritePredictNetForOpenGL(predictNet, useTextureInput, useTiling, runFusion); + dumpDefForOpenGL(*glPredictNet); + // Throws if unsupported parameters are found. + Workspace ws; + ws.RunNetOnce(initNet); + ws.CreateNet(*glPredictNet); + LOG(INFO) << "OpenGL is successfully enabled"; + return true; + } catch (const std::exception& e) { + LOG(ERROR) << "Caught exception trying to convert NetDef to OpenGL: " << e.what(); + return false; + } +} +} // namespace caffe2 |