Diffstat (limited to 'inference-engine/src/gna_plugin/gna_plugin.hpp')
-rw-r--r-- | inference-engine/src/gna_plugin/gna_plugin.hpp | 488 |
1 files changed, 488 insertions, 0 deletions
diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp
new file mode 100644
index 000000000..53365d7a6
--- /dev/null
+++ b/inference-engine/src/gna_plugin/gna_plugin.hpp
@@ -0,0 +1,488 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "cpp_interfaces/base/ie_plugin_base.hpp"
+#include "dnn.h"
+#include "gna_memory.hpp"
+#include "gna_device.hpp"
+#include <map>
+#include <list>
+#include <string>
+#include <utility>
+#include <memory>
+#include <vector>
+#include <tuple>
+#include <gna-api-status.h>
+#include <gna-api.h>
+#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
+#include <cpp_interfaces/impl/ie_plugin_internal.hpp>
+#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
+#include <graph_tools.hpp>
+#include "gna_allocator.hpp"
+#include "gna_api_wrapper.hpp"
+
+namespace GNAPluginNS {
+
+void ConvertToInt16(int16_t *ptr_dst,
+                    const float *ptr_src,
+                    const uint32_t num_rows,
+                    const uint32_t num_columns,
+                    const float scale_factor);
+void ConvertToFloat(float *ptr_dst,
+                    int32_t *ptr_src,
+                    const uint32_t num_rows,
+                    const uint32_t num_columns,
+                    const float scale_factor);
+
+int16_t ConvertFloatToInt16(float src);
+
+class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::enable_shared_from_this<GNAPlugin> {
+ protected:
+    AmIntelDnn dnn;
+    using dnn_ptr = std::shared_ptr<CPPWrapper<intel_nnet_type_t>>;
+
+    /**
+     * @brief copy of the nnet structure and an indicator that the related infer request has not yet been synced
+     */
+    std::vector<std::tuple<dnn_ptr, int32_t, InferenceEngine::BlobMap>> nnets;
+
+    intel_dnn_orientation_t orientation_in = kDnnUnknownOrientation;
+    intel_dnn_orientation_t orientation_out = kDnnUnknownOrientation;
+    double input_scale_factor = 1.0;
+    double output_scale_factor = 1.0;
+    uint32_t num_rotate_rows = 0;
+    uint32_t num_rotate_columns = 0;
+
+
+    uint32_t num_feature_maps = 1;
+    uint32_t num_memory_bytes;
+
+    std::vector<void *> ptr_inputs_global;
+    std::vector<void *> ptr_outputs_global;
+
+    int16_t *ptr_int_inputs = NULL;
+    int32_t *ptr_int_outputs = NULL;
+    uint32_t *ptr_active_indices = NULL;
+    uint32_t num_active_indices = 0;
+    uint32_t num_group_in = 0;
+    uint32_t num_bytes_weight;
+    uint32_t num_bytes_per_output = 0;
+
+    bool use_dynamic_quantization = false;
+    bool compact_mode = true;
+    bool exclusive_async_requests = false;
+    bool uniformPwlDesign = false;
+    uint8_t gna_lib_async_threads_num = 1;
+    bool gna_openmp_multithreading = false;
+    // precision of the GNA hardware model
+    InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16;
+
+    bool performance_counting = false;
+    int bytes_alllocated_for_input = 0;
+    intel_dnn_number_type_t output_type = kDnnInt;
+    std::string utterance_name;
+
+    // internal types
+    enum LayerType {
+        Input,
+        Convolution,
+        ReLU,
+        LeakyReLU,
+        Sigmoid,
+        TanH,
+        Activation,
+        Pooling,
+        FullyConnected,
+        InnerProduct,
+        Reshape,
+        Split,
+        Slice,
+        Eltwise,
+        ScaleShift,
+        Clamp,
+        Concat,
+        Copy,
+        Permute,
+        Memory,
+        Power,
+        Crop,
+        NO_TYPE
+    };
+
+ public:
+    explicit GNAPlugin(const std::map<std::string, std::string>& configMap);
+    /**
+     * @brief construct from AOT rather than from a CNN network
+     */
+    GNAPlugin() = default;
+
+    void LoadNetwork(InferenceEngine::ICNNNetwork &network) override;
+    using InferenceEngine::IInferencePluginInternal::Infer;
+
+    void Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) override;
+    void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) override;
+    void AddExtension(InferenceEngine::IExtensionPtr extension) override;
+    void SetConfig(const std::map<std::string, std::string> &config) override;
+    void LoadNetwork(InferenceEngine::IExecutableNetwork::Ptr &executableNetwork,
+                     InferenceEngine::ICNNNetwork &network,
+                     const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
+    void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result) override;
+    void SetLogCallback(InferenceEngine::IErrorListener &listener) override {};
+    void Reset();
+    /**
+     * @deprecated Use the version with the config parameter
+     */
+    void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
+                      InferenceEngine::QueryNetworkResult &res) const override { }
+    void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
+                      const std::map<std::string, std::string>& config,
+                      InferenceEngine::QueryNetworkResult &res) const override { }
+    uint32_t QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result);
+    void Wait(uint32_t idx = 0);
+
+    uint32_t QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result);
+    /**
+     * @param sync - gna sync point to wait on
+     * @param result - output blob filled once the request completes
+     */
+    void Wait(uint32_t sync, InferenceEngine::Blob &result);
+
+    void Export(const std::string &fileName);
+    InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName
+        , const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
+    InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName);
+
+
+    bool IsExclusiveAsyncRequests() { return exclusive_async_requests; }
+
+    /**
+     * utilities to provide input and output blobs externally, to be used by InferenceEngine request API clients
+     */
+    InferenceEngine::Blob::Ptr GetInputBlob(InferenceEngine::Precision precision);
+    InferenceEngine::Blob::Ptr GetOutputBlob(InferenceEngine::Precision precision);
+    /**
+     * helpers to provide inputs and outputs info for an AOT network
+     */
+    InferenceEngine::InputsDataMap GetInputs() {return inputsDataMap;}
+    InferenceEngine::OutputsDataMap GetOutputs() {return outputsDataMap;}
+    /**
+     * QueryState API
+     * @return
+     */
+    std::vector<InferenceEngine::IMemoryStateInternal::Ptr> QueryState();
+
+ protected:
+    uint32_t num_cnn_rows_out = 0;
+    bool done = false;
+    std::string dumpXNNPath;
+    intel_gna_proc_t gna_proc_type = static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE);
+
+    void DumpXNNToFile() const;
+    void CreateLayerPrimitive(InferenceEngine::CNNLayerPtr);
+    void AffinePrimitive(InferenceEngine::CNNLayerPtr, bool isDiag = false);
+    void DiagonalPrimitive(InferenceEngine::CNNLayerPtr);
+    void ConvolutionPrimitive(InferenceEngine::CNNLayerPtr);
+    void PermutePrimitive(InferenceEngine::CNNLayerPtr);
+    void PoolingPrimitive(InferenceEngine::CNNLayerPtr);
+    void PowerPrimitive(InferenceEngine::CNNLayerPtr);
+    void ConcatPrimitive(InferenceEngine::CNNLayerPtr);
+    void CropPrimitive(InferenceEngine::CNNLayerPtr);
+    void EltwisePrimitive(InferenceEngine::CNNLayerPtr);
+    void SplitPrimitive(InferenceEngine::CNNLayerPtr);
+    void SlicePrimitive(InferenceEngine::CNNLayerPtr);
+    void PWLPrimitive(InferenceEngine::CNNLayerPtr);
+    void CopyPrimitive(InferenceEngine::CNNLayerPtr);
+    bool AreLayersSupported(InferenceEngine::ICNNNetwork& network, std::string& errMessage);
+    LayerType LayerTypeFromStr(std::string const &str);
+    /**
+     * maps the type of connection to its input and output layers; also stores the gna pointer for the alloc request
+     */
+    class GNAMemoryLayer {
+        InferenceEngine::CNNLayerPtr inputLayer;
+        InferenceEngine::CNNLayerPtr outputLayer;
+     public:
+        GNAMemoryLayer(InferenceEngine::CNNLayerPtr inLayer, InferenceEngine::CNNLayerPtr outLayer) :
+            inputLayer(inLayer), outputLayer(outLayer) {
+        }
+
+        InferenceEngine::CNNLayerPtr getInput() { return inputLayer; }
+        InferenceEngine::CNNLayerPtr getOutput() { return outputLayer; }
+
+        /**
+         * pointer to the gna memory request
+         */
+        void *gna_ptr = nullptr;
+        /**
+         * gna memory of this size is reserved
+         */
+        size_t reserved_size = 0;
+        /**
+         * offset of this gna memory from gna_ptr
+         */
+        size_t reserved_offset = 0;
+    };
+
+    class GNAConcatLayer {
+        InferenceEngine::CNNLayerPtr concatLayer;
+
+     public:
+        explicit GNAConcatLayer(InferenceEngine::CNNLayerPtr layer) :
+            concatLayer(layer)
+        {}
+
+        InferenceEngine::CNNLayerPtr getConcat() { return concatLayer; }
+        /**
+         * pointer to the gna memory request
+         */
+        void *gna_ptr = nullptr;
+        /**
+         * gna memory of this size is reserved for the concat
+         */
+        size_t reserved_size = 0;
+        bool output_allocation_flag = false;
+        /**
+         * offset of this gna memory from gna_ptr
+         */
+        struct ConcatConnectedLayerInfo {
+            ConcatConnectedLayerInfo(const std::string& n,
+                                     size_t o) :
+                                     name(n),
+                                     offset(o) {}
+            std::string name = "";
+            size_t offset = 0;
+        };
+
+        std::vector<ConcatConnectedLayerInfo> concatInputLayers;
+    };
+
+    // Split, Slice
+    class GNASplitLayer {
+        InferenceEngine::CNNLayerPtr splitLayer;
+
+     public:
+        explicit GNASplitLayer(InferenceEngine::CNNLayerPtr layer) :
+            splitLayer(layer),
+            splitInputLayer()
+        {}
+
+        InferenceEngine::CNNLayerPtr getSplit() { return splitLayer; }
+        /**
+         * gna memory of this size is reserved for the split
+         */
+        size_t reserved_size = 0;
+        bool output_allocation_flag = false;
+        /**
+         * offset of this gna memory from gna_ptr
+         */
+        struct SplitConnectedLayerInfo {
+            SplitConnectedLayerInfo() {}
+            SplitConnectedLayerInfo(std::string& n,
+                                    size_t o,
+                                    size_t p) :
+                                    name(n),
+                                    offset(o),
+                                    pure_size(p) {}
+
+            SplitConnectedLayerInfo& operator=
+                    (SplitConnectedLayerInfo const& layerInfo) {
+                this->name = layerInfo.name;
+                this->offset = layerInfo.offset;
+                this->pure_size = layerInfo.pure_size;
+                return *this;
+            }
+            std::string name = "";
+            size_t offset = 0;
+            size_t pure_size = 0;
+        };
+        SplitConnectedLayerInfo splitInputLayer;
+        std::vector<SplitConnectedLayerInfo> splitOutputLayers;
+    };
+
+    class GNACropLayer {
+        InferenceEngine::CNNLayerPtr cropLayer;
+
+     public:
+        explicit GNACropLayer(InferenceEngine::CNNLayerPtr layer) :
+            cropLayer(layer)
+        {}
+
+        InferenceEngine::CNNLayerPtr getCrop() { return cropLayer; }
+        /**
+         * pointer to the beginning of gna cropped memory
+         */
+        void *gna_ptr = nullptr;
+    };
+    using MemoryConnection = std::list<std::pair<std::string, GNAMemoryLayer>>;
+    using ConcatConnection = std::map<std::string, GNAConcatLayer>;
+    using SplitConnection = std::map<std::string, GNASplitLayer>;
+    using CropConnection = std::map<std::string, GNACropLayer>;
+    // layers with extra storage for connections and additional
+    // non-trivial processing
+    MemoryConnection memory_connection;
+    ConcatConnection concat_connection;
+    SplitConnection split_connection;
+    CropConnection crop_connection;
+    void fillMemoryConnections(std::map<std::string,
+                               std::vector<InferenceEngine::CNNLayerPtr>> &memoryPairs);
+
+    void fillConcatConnections(InferenceEngine::CNNLayerPtr layer);
+    void fillSplitConnections(InferenceEngine::CNNLayerPtr layer);
+    /**
+     * maps layer name to dnn component; in topological sort order previous nodes will already be initialized
+     */
+    using DnnComponentsForLayer = std::list<std::pair<std::string, intel_dnn_component_t>>;
+    std::list<std::pair<std::string, intel_dnn_component_t>> dnnComponentsForLayer;
+
+    /**
+     * @brief returns the corresponding dnn layer for a topology layer
+     * @param __layer
+     * @return
+     */
+    intel_dnn_component_t * findDnnLayer(InferenceEngine::CNNLayerPtr __layer);
+
+    using allocator_type = PolymorphAllocator<uint8_t>;
+    using gna_memory_type = GNAMemory<allocator_type>;
+
+    std::unique_ptr<GNADeviceHelper> gnadevice;
+    /**
+     * @brief size of the RW segment without extra memory for parallel execution
+     */
+    uint32_t rwSegmentSize = 0;
+    std::unique_ptr<gna_memory_type> gnamem;
+
+    /**
+     * Connects either a memory output or a generic output to a layer
+     * @param layer - layer pointer
+     * @param ptr_outputs - pointer to pointer where to store output layer information
+     * @param ptr_inputs - pointer associated with the layer input
+     * @param sz - size of the output blob
+     */
+    void connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr_outputs, void *ptr_inputs, size_t sz);
+    /**
+     * Connects a certain input to this layer
+     * @param layer - layer that we connect the input to
+     * @param pVoid - pointer that holds the current layer pointer in the gna_mem request
+     * @param num_data_bytes_in - size of the input data in bytes
+     * @param offset - number of bytes to advance in the buffer
+     * @param idx - index of the input port that we are connecting
+     * @return layer used as input
+     */
+    struct ConnectionDetails {
+        InferenceEngine::CNNLayerPtr input;
+        bool needTransposeWeights = false;
+        InferenceEngine::CNNLayerPtr permute;
+        ConnectionDetails(InferenceEngine::CNNLayerPtr input,
+                          bool bTranspose = false,
+                          InferenceEngine::CNNLayerPtr permute = nullptr)
+            : input(input)
+            , needTransposeWeights(bTranspose)
+            , permute(permute) {
+        }
+    };
+    ConnectionDetails connectInput(InferenceEngine::CNNLayerPtr layer,
+                                   void *pVoid,
+                                   size_t num_data_bytes_in,
+                                   size_t offset = 0,
+                                   int idx = 0);
+
+    void ImportFrames(void *ptr_dst,
+                      const void *ptr_src,
+                      InferenceEngine::Precision input_precision,
+                      intel_dnn_orientation_t orientation,
+                      uint32_t num_frames,
+                      uint32_t num_group,
+                      uint32_t num_vector_elements,
+                      uint32_t num_vector_stride);
+
+    void ExportScores(void *ptr_dst,
+                      void *ptr_src,
+                      intel_dnn_orientation_t orientation,
+                      uint32_t num_frames,
+                      uint32_t num_group,
+                      uint32_t num_vector_elements,
+                      uint32_t num_active_elements,
+                      uint32_t num_vector_stride,
+                      uint32_t num_bytes_per_element_input,
+                      uint32_t num_bytes_per_element);
+
+    friend void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst,
+                                            const float *ptr_src,
+                                            const uint32_t num_rows,
+                                            const uint32_t num_columns,
+                                            const float scale_factor);
+    friend void GNAPluginNS::ConvertToFloat(float *ptr_dst,
+                                            int32_t *ptr_src,
+                                            const uint32_t num_rows,
+                                            const uint32_t num_columns,
+                                            const float scale_factor);
+
+    friend int16_t GNAPluginNS::ConvertFloatToInt16(float src);
+
+    template <typename T, typename U>
+    void copyInputData(T *dst,
+                       const U *src,
+                       uint32_t num_frames,
+                       uint32_t num_group,
+                       uint32_t num_vector_elements,
+                       uint32_t num_vector_stride,
+                       intel_dnn_orientation_t orientation);
+
+    template <typename T, typename U>
+    void copyInputDataWithSplit(T *const dst,
+                                const U *src,
+                                const GNASplitLayer& splitInfo,
+                                size_t precision_size);
+    /**
+     * @brief GNA affine layers always have an activation attached, while IR layers do not
+     * @param layers - layers of the copied net ready for quantisation
+     */
+    void insertIdentityLayer(std::vector<InferenceEngine::CNNLayerPtr> &layers);
+
+    /**
+     * @brief GNA convolution layers have deinterleaved orientations, while affine ones do not,
+     * so permute layers need to be inserted between convolution and affine layers,
+     * or removed if they are already present in the topology
+     * @param layers
+     */
+    void applyOrientations(std::vector<InferenceEngine::CNNLayerPtr> &layers);
+
+
+    /**
+     * @brief searches for a specific pattern in the graph (6 layers are replaced by a single one)
+     * @param layers
+     */
+    void substitutePRelu(std::vector<InferenceEngine::CNNLayerPtr> &layers);
+
+    std::vector<InferenceEngine::CNNLayerPtr> getCandidatesForIdentityInsertion(const InferenceEngine::CNNLayerPtr layer);
+
+    /**
+     * diagonal layer insertion is required in cases where an activation is followed by split layers or any other
+     * topology-changing layers
+     */
+    void insertDiagonalLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);
+
+    /**
+     * @brief MaxPool can be reordered with the activation; on GNA the strategy is to have conv->maxpool->activation,
+     * which means maxpool receives 4 bytes and produces 4 bytes
+     */
+    void reorderMaxPool(std::vector<InferenceEngine::CNNLayerPtr> & layers);
+
+    /**
+     * copy layer insertion is required in cases where the input layer does not have output memory
+     */
+    void insertCopyLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);
+
+    intel_dnn_component_t * find_first_unused_input(InferenceEngine::CNNLayerPtr current);
+
+    InferenceEngine::SizeVector inputDims;
+    InferenceEngine::InputsDataMap inputsDataMap;
+
+    InferenceEngine::SizeVector outputDims;
+    InferenceEngine::OutputsDataMap outputsDataMap;
+};
+}  // namespace GNAPluginNS
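The header above declares quantization helpers (ConvertToInt16, ConvertToFloat, ConvertFloatToInt16) that take a per-tensor scale_factor, which is how the plugin maps float data into the I16 precision used for the GNA hardware model (gnaPrecision, input_scale_factor). The snippet below is a minimal, self-contained sketch of such a scale-and-saturate conversion; the names QuantizeToInt16 and QuantizeBufferToInt16, the round-to-nearest behaviour, and the clamping limits are illustrative assumptions and are not taken from the plugin implementation in this diff.

#include <cstdint>
#include <cmath>

// Illustrative only: clamp a scaled float sample to the int16 range,
// analogous in spirit to a helper like ConvertFloatToInt16.
static int16_t QuantizeToInt16(float src) {
    float rounded = std::roundf(src);           // assumed round-to-nearest
    if (rounded > 32767.0f)  return INT16_MAX;  // saturate high
    if (rounded < -32768.0f) return INT16_MIN;  // saturate low
    return static_cast<int16_t>(rounded);
}

// Illustrative row-major bulk conversion with a single scale factor,
// matching the shape of the ConvertToInt16 declaration above.
static void QuantizeBufferToInt16(int16_t *dst, const float *src,
                                  uint32_t num_rows, uint32_t num_columns,
                                  float scale_factor) {
    for (uint32_t i = 0; i < num_rows * num_columns; ++i) {
        dst[i] = QuantizeToInt16(src[i] * scale_factor);
    }
}

Conceptually, an input blob scaled by something like input_scale_factor would pass through such a conversion before being handed to the GNA library, and integer scores would go the other way via a ConvertToFloat-style helper; the exact rounding and saturation rules used by the plugin may differ.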