path: root/inference-engine/src/gna_plugin/gna_plugin.hpp
Diffstat (limited to 'inference-engine/src/gna_plugin/gna_plugin.hpp')
-rw-r--r--   inference-engine/src/gna_plugin/gna_plugin.hpp   488
1 files changed, 488 insertions, 0 deletions
diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp
new file mode 100644
index 000000000..53365d7a6
--- /dev/null
+++ b/inference-engine/src/gna_plugin/gna_plugin.hpp
@@ -0,0 +1,488 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "cpp_interfaces/base/ie_plugin_base.hpp"
+#include "dnn.h"
+#include "gna_memory.hpp"
+#include "gna_device.hpp"
+#include <map>
+#include <list>
+#include <string>
+#include <utility>
+#include <memory>
+#include <vector>
+#include <tuple>
+#include <gna-api-status.h>
+#include <gna-api.h>
+#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
+#include <cpp_interfaces/impl/ie_plugin_internal.hpp>
+#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
+#include <graph_tools.hpp>
+#include "gna_allocator.hpp"
+#include "gna_api_wrapper.hpp"
+
+namespace GNAPluginNS {
+
+void ConvertToInt16(int16_t *ptr_dst,
+ const float *ptr_src,
+ const uint32_t num_rows,
+ const uint32_t num_columns,
+ const float scale_factor);
+void ConvertToFloat(float *ptr_dst,
+ int32_t *ptr_src,
+ const uint32_t num_rows,
+ const uint32_t num_columns,
+ const float scale_factor);
+
+int16_t ConvertFloatToInt16(float src);
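+
+// A minimal illustrative sketch (an assumption, not the plugin's actual implementation):
+// ConvertFloatToInt16 presumably rounds an already-scaled float and saturates it to the int16 range:
+//
+//   int16_t ConvertFloatToInt16(float src) {
+//       float rounded = (src > 0.0f) ? (src + 0.5f) : (src - 0.5f);  // round to nearest
+//       if (rounded > 32767.0f) return 32767;                        // saturate high
+//       if (rounded < -32768.0f) return -32768;                      // saturate low
+//       return static_cast<int16_t>(rounded);
+//   }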
+
+class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::enable_shared_from_this<GNAPlugin> {
+ protected:
+ AmIntelDnn dnn;
+ using dnn_ptr = std::shared_ptr<CPPWrapper<intel_nnet_type_t>>;
+
+ /**
+ * @brief copy of the nnet structure and an indicator that the related infer request has not yet been synced
+ */
+ std::vector<std::tuple<dnn_ptr, int32_t, InferenceEngine::BlobMap>> nnets;
+
+ intel_dnn_orientation_t orientation_in = kDnnUnknownOrientation;
+ intel_dnn_orientation_t orientation_out = kDnnUnknownOrientation;
+ double input_scale_factor = 1.0;
+ double output_scale_factor = 1.0;
+ uint32_t num_rotate_rows = 0;
+ uint32_t num_rotate_columns = 0;
+
+
+ uint32_t num_feature_maps = 1;
+ uint32_t num_memory_bytes = 0;
+
+ std::vector<void *> ptr_inputs_global;
+ std::vector<void *> ptr_outputs_global;
+
+ int16_t *ptr_int_inputs = NULL;
+ int32_t *ptr_int_outputs = NULL;
+ uint32_t *ptr_active_indices = NULL;
+ uint32_t num_active_indices = 0;
+ uint32_t num_group_in = 0;
+ uint32_t num_bytes_weight = 0;
+ uint32_t num_bytes_per_output = 0;
+
+ bool use_dynamic_quantization = false;
+ bool compact_mode = true;
+ bool exclusive_async_requests = false;
+ bool uniformPwlDesign = false;
+ uint8_t gna_lib_async_threads_num = 1;
+ bool gna_openmp_multithreading = false;
+ // precision of GNA hardware model
+ InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16;
+
+ bool performance_counting = false;
+ int bytes_alllocated_for_input = 0;
+ intel_dnn_number_type_t output_type = kDnnInt;
+ std::string utterance_name;
+
+ // internal types
+ enum LayerType {
+ Input,
+ Convolution,
+ ReLU,
+ LeakyReLU,
+ Sigmoid,
+ TanH,
+ Activation,
+ Pooling,
+ FullyConnected,
+ InnerProduct,
+ Reshape,
+ Split,
+ Slice,
+ Eltwise,
+ ScaleShift,
+ Clamp,
+ Concat,
+ Copy,
+ Permute,
+ Memory,
+ Power,
+ Crop,
+ NO_TYPE
+ };
+
+ public:
+ explicit GNAPlugin(const std::map<std::string, std::string>& configMap);
+ /**
+ * @brief construct from an AOT model rather than from a CNN network
+ */
+ GNAPlugin() = default;
+
+ void LoadNetwork(InferenceEngine::ICNNNetwork &network) override;
+ using InferenceEngine::IInferencePluginInternal::Infer;
+
+ void Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) override;
+ void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) override;
+ void AddExtension(InferenceEngine::IExtensionPtr extension) override;
+ void SetConfig(const std::map<std::string, std::string> &config) override;
+ void LoadNetwork(InferenceEngine::IExecutableNetwork::Ptr &executableNetwork,
+ InferenceEngine::ICNNNetwork &network,
+ const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
+ void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result) override;
+ void SetLogCallback(InferenceEngine::IErrorListener &listener) override {}
+ void Reset();
+ /**
+ * @deprecated Use the version with config parameter
+ */
+ void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
+ InferenceEngine::QueryNetworkResult &res) const override { }
+ void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
+ const std::map<std::string, std::string>& config,
+ InferenceEngine::QueryNetworkResult &res) const override { }
+ uint32_t QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result);
+ void Wait(uint32_t idx = 0);
+
+ uint32_t QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result);
+ /**
+ * @brief waits for the given gna sync point and writes the scores into the result blob
+ * @param sync - gna sync point to wait for
+ * @param result - blob that receives the output scores
+ */
+ void Wait(uint32_t sync, InferenceEngine::Blob &result);
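+
+ // Hypothetical client-side usage sketch of the queue/wait pair above; the configuration,
+ // network, and blob map names are assumptions for illustration only:
+ //
+ //   GNAPlugin plugin(config);
+ //   plugin.LoadNetwork(network);
+ //   uint32_t sync = plugin.QueueInference(inputBlobs, resultBlobs);  // enqueue the request
+ //   plugin.Wait(sync);                                               // block until scores are ready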
+
+ void Export(const std::string &fileName);
+ InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName
+ , const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
+ InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName);
+
+
+ bool IsExclusiveAsyncRequests() { return exclusive_async_requests; }
+
+ /**
+ * utilities to provide input and output blobs externally, to be used by InferenceEngine request API clients
+ */
+ InferenceEngine::Blob::Ptr GetInputBlob(InferenceEngine::Precision precision);
+ InferenceEngine::Blob::Ptr GetOutputBlob(InferenceEngine::Precision precision);
+ /**
+ * helpers to provide inputs/outputs info for an AOT network
+ */
+ InferenceEngine::InputsDataMap GetInputs() {return inputsDataMap;}
+ InferenceEngine::OutputsDataMap GetOutputs() {return outputsDataMap;}
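+
+ // Hypothetical AOT flow sketch using the helpers above; the model file name and the blob
+ // precisions are assumptions for illustration only:
+ //
+ //   GNAPlugin plugin;                          // default-constructed for the AOT path
+ //   plugin.ImportNetwork("model.gna");         // hypothetical model file name
+ //   auto in  = plugin.GetInputBlob(InferenceEngine::Precision::FP32);
+ //   auto out = plugin.GetOutputBlob(InferenceEngine::Precision::FP32);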
+ /**
+ * QueryState API
+ * @return
+ */
+ std::vector<InferenceEngine::IMemoryStateInternal::Ptr> QueryState();
+
+ protected:
+ uint32_t num_cnn_rows_out = 0;
+ bool done = false;
+ std::string dumpXNNPath;
+ intel_gna_proc_t gna_proc_type = static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE);
+
+ void DumpXNNToFile() const;
+ void CreateLayerPrimitive(InferenceEngine::CNNLayerPtr);
+ void AffinePrimitive(InferenceEngine::CNNLayerPtr, bool isDiag = false);
+ void DiagonalPrimitive(InferenceEngine::CNNLayerPtr);
+ void ConvolutionPrimitive(InferenceEngine::CNNLayerPtr);
+ void PermutePrimitive(InferenceEngine::CNNLayerPtr);
+ void PoolingPrimitive(InferenceEngine::CNNLayerPtr);
+ void PowerPrimitive(InferenceEngine::CNNLayerPtr);
+ void ConcatPrimitive(InferenceEngine::CNNLayerPtr);
+ void CropPrimitive(InferenceEngine::CNNLayerPtr);
+ void EltwisePrimitive(InferenceEngine::CNNLayerPtr);
+ void SplitPrimitive(InferenceEngine::CNNLayerPtr);
+ void SlicePrimitive(InferenceEngine::CNNLayerPtr);
+ void PWLPrimitive(InferenceEngine::CNNLayerPtr);
+ void CopyPrimitive(InferenceEngine::CNNLayerPtr);
+ bool AreLayersSupported(InferenceEngine::ICNNNetwork& network, std::string& errMessage);
+ LayerType LayerTypeFromStr(std::string const &str);
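+
+ // A hedged sketch of what LayerTypeFromStr might do (the real mapping lives in the .cpp file);
+ // shown only to illustrate how the LayerType enum above is used:
+ //
+ //   static const std::map<std::string, LayerType> kTypes = {
+ //       {"Input", Input}, {"Convolution", Convolution}, {"FullyConnected", FullyConnected},
+ //       {"Split", Split}, {"Concat", Concat}, {"Memory", Memory}
+ //   };
+ //   auto it = kTypes.find(str);
+ //   return it == kTypes.end() ? NO_TYPE : it->second;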
+ /**
+ * maps the type of connection to input and output layers; also stores the gna pointer for the allocation request
+ */
+ class GNAMemoryLayer {
+ InferenceEngine::CNNLayerPtr inputLayer;
+ InferenceEngine::CNNLayerPtr outputLayer;
+ public:
+ GNAMemoryLayer(InferenceEngine::CNNLayerPtr inLayer, InferenceEngine::CNNLayerPtr outLayer) :
+ inputLayer(inLayer), outputLayer(outLayer) {
+ }
+
+ InferenceEngine::CNNLayerPtr getInput() { return inputLayer; }
+ InferenceEngine::CNNLayerPtr getOutput() { return outputLayer; }
+
+ /**
+ * pointer to gna memory request
+ */
+ void *gna_ptr = nullptr;
+ /**
+ * gna memory of this size is reserved
+ */
+ size_t reserved_size = 0;
+ /**
+ * offset of the reserved gna memory from gna_ptr
+ */
+ size_t reserved_offset = 0;
+ };
+
+ class GNAConcatLayer {
+ InferenceEngine::CNNLayerPtr concatLayer;
+
+ public:
+ explicit GNAConcatLayer(InferenceEngine::CNNLayerPtr layer) :
+ concatLayer(layer)
+ {}
+
+ InferenceEngine::CNNLayerPtr getConcat() { return concatLayer; }
+ /**
+ * pointer to gna memory request
+ */
+ void *gna_ptr = nullptr;
+ /**
+ * gna memory of this size is reserved for concat
+ */
+ size_t reserved_size = 0;
+ bool output_allocation_flag = false;
+ /**
+ * per-input offsets of connected layers from gna_ptr
+ */
+ struct ConcatConnectedLayerInfo {
+ ConcatConnectedLayerInfo(const std::string& n,
+ size_t o) :
+ name(n),
+ offset(o) {}
+ std::string name = "";
+ size_t offset = 0;
+ };
+
+ std::vector<ConcatConnectedLayerInfo> concatInputLayers;
+ };
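+
+ // Hypothetical bookkeeping sketch (the actual filling is done in fillConcatConnections):
+ // each concat input is recorded with its running byte offset inside the shared buffer.
+ // The CNN layer, the input list, and inputSizeInBytes() are illustrative names only:
+ //
+ //   GNAConcatLayer concat(concatCnnLayer);
+ //   size_t offset = 0;
+ //   for (auto &input : inputLayers) {
+ //       concat.concatInputLayers.emplace_back(input->name, offset);
+ //       offset += inputSizeInBytes(input);
+ //   }
+ //   concat.reserved_size = offset;  // total bytes reserved for the concat buffer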
+
+ // Split, Slice
+ class GNASplitLayer {
+ InferenceEngine::CNNLayerPtr splitLayer;
+
+ public:
+ explicit GNASplitLayer(InferenceEngine::CNNLayerPtr layer) :
+ splitLayer(layer),
+ splitInputLayer()
+ {}
+
+ InferenceEngine::CNNLayerPtr getSplit() { return splitLayer; }
+ /**
+ * gna memory of this size is reserved for split
+ */
+ size_t reserved_size = 0;
+ bool output_allocation_flag = false;
+ /**
+ * per-output offsets of connected layers within the split buffer
+ */
+ struct SplitConnectedLayerInfo {
+ SplitConnectedLayerInfo() {}
+ SplitConnectedLayerInfo(const std::string& n,
+ size_t o,
+ size_t p) :
+ name(n),
+ offset(o),
+ pure_size(p) {}
+
+ SplitConnectedLayerInfo& operator=
+ (SplitConnectedLayerInfo const& layerInfo) {
+ this->name = layerInfo.name;
+ this->offset = layerInfo.offset;
+ this->pure_size = layerInfo.pure_size;
+ return *this;
+ }
+ std::string name = "";
+ size_t offset = 0;
+ size_t pure_size = 0;
+ };
+ SplitConnectedLayerInfo splitInputLayer;
+ std::vector<SplitConnectedLayerInfo> splitOutputLayers;
+ };
+
+ class GNACropLayer {
+ InferenceEngine::CNNLayerPtr cropLayer;
+
+ public:
+ explicit GNACropLayer(InferenceEngine::CNNLayerPtr layer) :
+ cropLayer(layer)
+ {}
+
+ InferenceEngine::CNNLayerPtr getCrop() { return cropLayer; }
+ /**
+ * pointer to the beginning of the cropped gna memory
+ */
+ void *gna_ptr = nullptr;
+ };
+ using MemoryConnection = std::list<std::pair<std::string, GNAMemoryLayer>>;
+ using ConcatConnection = std::map<std::string, GNAConcatLayer>;
+ using SplitConnection = std::map<std::string, GNASplitLayer>;
+ using CropConnection = std::map<std::string, GNACropLayer>;
+ // layers with extra storage for connections and additional
+ // non-trivial processing
+ MemoryConnection memory_connection;
+ ConcatConnection concat_connection;
+ SplitConnection split_connection;
+ CropConnection crop_connection;
+ void fillMemoryConnections(std::map<std::string,
+ std::vector<InferenceEngine::CNNLayerPtr>> &memoryPairs);
+
+ void fillConcatConnections(InferenceEngine::CNNLayerPtr layer);
+ void fillSplitConnections(InferenceEngine::CNNLayerPtr layer);
+ /**
+ * maps a layer name to its dnn component; due to topological sorting, previous nodes will already be initialized
+ */
+ using DnnComponentsForLayer = std::list<std::pair<std::string, intel_dnn_component_t>>;
+ DnnComponentsForLayer dnnComponentsForLayer;
+
+ /**
+ * @brief returns the corresponding dnn component for a topology layer
+ * @param __layer - topology layer to look up
+ * @return pointer to the matching dnn component
+ */
+ intel_dnn_component_t * findDnnLayer(InferenceEngine::CNNLayerPtr __layer);
+
+ using allocator_type = PolymorphAllocator<uint8_t>;
+ using gna_memory_type = GNAMemory<allocator_type>;
+
+ std::unique_ptr<GNADeviceHelper> gnadevice;
+ /**
+ * @brief size of RW segment without extra memory for parallel execution
+ */
+ uint32_t rwSegmentSize = 0;
+ std::unique_ptr<gna_memory_type> gnamem;
+
+ /**
+ * Connects either a memory output or a generic output to a layer
+ * @param layer - layer pointer
+ * @param ptr_outputs - pointer to the location where the output data pointer is stored
+ * @param ptr_inputs - pointer to the input data location of the connected layer
+ * @param sz - size of the output blob in bytes
+ */
+ void connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr_outputs, void *ptr_inputs, size_t sz);
+ /**
+ * Connects a certain input to this layer
+ * @param layer - layer that we connect the input to
+ * @param pVoid - pointer that holds the current layer pointer in the gna_mem request
+ * @param num_data_bytes_in - input size in bytes
+ * @param offset - number of bytes to advance in the buffer
+ * @param idx - index of the input port that we are connecting
+ * @return layer used as input
+ */
+ struct ConnectionDetails {
+ InferenceEngine::CNNLayerPtr input;
+ bool needTransposeWeights = false;
+ InferenceEngine::CNNLayerPtr permute;
+ ConnectionDetails(InferenceEngine::CNNLayerPtr input,
+ bool bTranspose = false,
+ InferenceEngine::CNNLayerPtr permute = nullptr)
+ : input(input)
+ , needTransposeWeights(bTranspose)
+ , permute(permute) {
+ }
+ };
+ ConnectionDetails connectInput(InferenceEngine::CNNLayerPtr layer,
+ void *pVoid,
+ size_t num_data_bytes_in,
+ size_t offset = 0,
+ int idx = 0);
+
+ void ImportFrames(void *ptr_dst,
+ const void *ptr_src,
+ InferenceEngine::Precision input_precision,
+ intel_dnn_orientation_t orientation,
+ uint32_t num_frames,
+ uint32_t num_group,
+ uint32_t num_vector_elements,
+ uint32_t num_vector_stride);
+
+ void ExportScores(void *ptr_dst,
+ void *ptr_src,
+ intel_dnn_orientation_t orientation,
+ uint32_t num_frames,
+ uint32_t num_group,
+ uint32_t num_vector_elements,
+ uint32_t num_active_elements,
+ uint32_t num_vector_stride,
+ uint32_t num_bytes_per_element_input,
+ uint32_t num_bytes_per_element);
+
+ friend void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst,
+ const float *ptr_src,
+ const uint32_t num_rows,
+ const uint32_t num_columns,
+ const float scale_factor);
+ friend void GNAPluginNS::ConvertToFloat(float *ptr_dst,
+ int32_t *ptr_src,
+ const uint32_t num_rows,
+ const uint32_t num_columns,
+ const float scale_factor);
+
+ friend int16_t GNAPluginNS::ConvertFloatToInt16(float src);
+
+ template <typename T, typename U>
+ void copyInputData(T *dst,
+ const U *src,
+ uint32_t num_frames,
+ uint32_t num_group,
+ uint32_t num_vector_elements,
+ uint32_t num_vector_stride,
+ intel_dnn_orientation_t orientation);
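+
+ // Illustrative sketch only (assumed semantics, not the actual template body): for an
+ // interleaved destination orientation the copy transposes frame-major source data into
+ // element-major destination data, roughly:
+ //
+ //   for (uint32_t i = 0; i < num_frames; i++) {
+ //       for (uint32_t j = 0; j < num_vector_elements; j++) {
+ //           dst[j * num_group + i] = static_cast<T>(src[i * num_vector_stride + j]);
+ //       }
+ //   }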
+
+ template <typename T, typename U>
+ void copyInputDataWithSplit(T *const dst,
+ const U *src,
+ const GNASplitLayer& splitInfo,
+ size_t precision_size);
+ /**
+ * @brief GNA affine layers always have an activation attached, while IR layers do not
+ * @param layers - layers of the copied network, ready for quantisation
+ */
+ void insertIdentityLayer(std::vector<InferenceEngine::CNNLayerPtr> &layers);
+
+ /**
+ * @brief GNA convolution layers have a deinterleaved orientation, while affine layers do not,
+ * so permute layers need to be inserted between convolution and affine layers,
+ * or removed if they are already present in the topology
+ * @param layers
+ */
+ void applyOrientations(std::vector<InferenceEngine::CNNLayerPtr> &layers);
+
+
+ /**
+ * @brief search for a specific pattern in the graph (6 layers are replaced by a single one)
+ * @param layers
+ */
+ void substitutePRelu(std::vector<InferenceEngine::CNNLayerPtr> &layers);
+
+ std::vector<InferenceEngine::CNNLayerPtr> getCandidatesForIdentityInsertion(const InferenceEngine::CNNLayerPtr layer);
+
+ /**
+ * diagonal layer insertion is required in cases where an activation is followed by split layers,
+ * or by any other topology-changing layers
+ */
+ void insertDiagonalLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);
+
+ /**
+ * @brief MaxPool can be reordered with activation; on GNA the strategy is to have conv->maxpool->activation,
+ * which means maxpool receives 4-byte values and produces 4-byte values
+ */
+ void reorderMaxPool(std::vector<InferenceEngine::CNNLayerPtr> & layers);
+
+ /**
+ * copy layer insertion is required in cases where an input layer does not have output memory
+ */
+ void insertCopyLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);
+
+ intel_dnn_component_t * find_first_unused_input(InferenceEngine::CNNLayerPtr current);
+
+ InferenceEngine::SizeVector inputDims;
+ InferenceEngine::InputsDataMap inputsDataMap;
+
+ InferenceEngine::SizeVector outputDims;
+ InferenceEngine::OutputsDataMap outputsDataMap;
+};
+} // namespace GNAPluginNS