author     Alexey Suhov <asuhov@users.noreply.github.com>  2019-01-21 21:31:31 +0300
committer  openvino-pushbot <44090433+openvino-pushbot@users.noreply.github.com>  2019-01-21 21:31:31 +0300
commit     9de27f16bc8b712a5b8c99d1d4b4a66c9144942d (patch)
tree       01a383efe94d92b9870d513c2c5ea5d15b07010a /inference-engine/src/mkldnn_plugin
parent     fbc7a4a710c24def8ab199926a7da90a0394b87d (diff)
Publishing R5 content (#72)
* Publishing R5 content
* Updated ade revision
* updated readme
* add possibility to build CPU plugin with Intel MKL package
Diffstat (limited to 'inference-engine/src/mkldnn_plugin')
-rw-r--r--  inference-engine/src/mkldnn_plugin/CMakeLists.txt | 42
-rw-r--r--  inference-engine/src/mkldnn_plugin/config.cpp | 50
-rw-r--r--  inference-engine/src/mkldnn_plugin/config.h | 4
-rw-r--r--  inference-engine/src/mkldnn_plugin/mean_image.cpp | 5
-rw-r--r--  inference-engine/src/mkldnn_plugin/mean_image.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp | 5
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp | 2
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp | 47
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h | 20
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp | 227
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h | 80
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h | 32
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h | 57
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_dims.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_edge.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp | 598
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_graph.h | 57
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp | 207
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h | 18
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp | 100
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp | 148
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp | 176
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_memory.h | 2
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_node.cpp | 71
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_node.h | 18
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp | 14
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_plugin.h | 55
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_primitive.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp | 372
-rw-r--r--  inference-engine/src/mkldnn_plugin/mkldnn_streams.h | 177
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp | 2
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp | 6
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp | 74
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp | 130
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h | 3
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp | 56
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp | 20
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h | 3
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp | 9
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp | 12
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp | 234
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h | 44
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp | 17
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h | 5
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp | 86
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp | 39
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp | 100
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h | 3
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp | 111
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h | 10
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp | 233
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h | 28
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp | 73
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h | 3
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp | 13
-rw-r--r--  inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/perf_count.h | 1
-rw-r--r--  inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp | 370
-rw-r--r--  inference-engine/src/mkldnn_plugin/utils/blob_dump.h | 50
97 files changed, 2794 insertions, 1568 deletions
diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
index 79551f636..5997f7d4b 100644
--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
@@ -1,6 +1,7 @@
# Copyright (C) 2018 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#
+
set(TARGET_NAME "MKLDNNPlugin")
if (UNIX AND NOT APPLE)
@@ -25,9 +26,7 @@ file(GLOB HEADERS
addVersionDefines(mkldnn_plugin.cpp CI_BUILD_NUMBER MKL_VERSION)
-if(WIN32)
- add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
-endif()
+add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
include_directories(
${IE_MAIN_SOURCE_DIR}/include
@@ -38,39 +37,30 @@ include_directories(
${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/include
)
+if (GEMM STREQUAL "MKL")
+ log_rpath_from_dir(MKL "${MKL}/lib")
+endif()
+
add_library(${TARGET_NAME} SHARED ${SOURCES} ${HEADERS})
+set_ie_threading_interface_for(${TARGET_NAME})
if (THREADING STREQUAL "TBB")
- target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_TBB -DMKLDNN_THR=MKLDNN_THR_TBB)
- target_include_directories(${TARGET_NAME} PUBLIC ${TBB_INCLUDE_DIRS})
- target_link_libraries(${TARGET_NAME} debug ${TBB_LIBRARIES_RELEASE} optimized ${TBB_LIBRARIES_RELEASE})
+ set(MKLDNN_THR MKLDNN_THR_TBB)
elseif (THREADING STREQUAL "OMP")
- target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_OMP -DMKLDNN_THR=MKLDNN_THR_OMP)
- enable_omp()
- if(ENABLE_INTEL_OMP)
- target_link_libraries(${TARGET_NAME} ${intel_omp_lib})
- endif()
+ set(MKLDNN_THR MKLDNN_THR_OMP)
else()
- target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_SEQ -DMKLDNN_THR=MKLDNN_THR_SEQ)
+ set(MKLDNN_THR MKLDNN_THR_SEQ)
endif()
-target_link_libraries(${TARGET_NAME} inference_engine ${INTEL_ITT_LIBS} mkldnn)
+target_compile_definitions(${TARGET_NAME} PUBLIC -DMKLDNN_THR=${MKLDNN_THR})
+target_link_libraries(${TARGET_NAME} PRIVATE inference_engine ${INTEL_ITT_LIBS} mkldnn)
+
set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME})
add_library(test_${TARGET_NAME} STATIC ${SOURCES} ${HEADERS})
+set_ie_threading_interface_for(test_${TARGET_NAME})
-if (THREADING STREQUAL "TBB")
- target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_TBB -DMKLDNN_THR=MKLDNN_THR_TBB)
- target_include_directories(test_${TARGET_NAME} PUBLIC ${TBB_INCLUDE_DIRS})
- target_link_libraries(test_${TARGET_NAME} debug ${TBB_LIBRARIES_RELEASE} optimized ${TBB_LIBRARIES_RELEASE})
-elseif (THREADING STREQUAL "OMP")
- target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_OMP -DMKLDNN_THR=MKLDNN_THR_OMP)
- if(ENABLE_INTEL_OMP)
- target_link_libraries(test_${TARGET_NAME} ${intel_omp_lib})
- endif()
-else()
- target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_SEQ -DMKLDNN_THR=MKLDNN_THR_SEQ)
-endif()
+target_compile_definitions(test_${TARGET_NAME} PUBLIC -DMKLDNN_THR=${MKLDNN_THR})
+target_link_libraries(test_${TARGET_NAME} PRIVATE inference_engine_s mkldnn)
-target_link_libraries(test_${TARGET_NAME} inference_engine_s mkldnn)
set_target_properties(test_${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME test_${TARGET_NAME})
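Note: the per-target IE_THREAD_* definitions and TBB/OMP link lines removed above are replaced by a single set_ie_threading_interface_for() call, which presumably supplies those settings centrally; the sources still branch on the macros the same way. A minimal sketch of the consuming side, assuming only the IE_THREAD / IE_THREAD_* macros from ie_parallel.hpp (used elsewhere in this patch):

#include "ie_parallel.hpp"

// Sketch only: IE_THREAD is defined by the build (now via
// set_ie_threading_interface_for); sources branch on it as before.
static const char* threadingBackend() {
#if IE_THREAD == IE_THREAD_TBB
    return "TBB";
#elif IE_THREAD == IE_THREAD_OMP
    return "OpenMP";
#else
    return "sequential";
#endif
}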
diff --git a/inference-engine/src/mkldnn_plugin/config.cpp b/inference-engine/src/mkldnn_plugin/config.cpp
index 57c8dc9c8..4ef10eec2 100644
--- a/inference-engine/src/mkldnn_plugin/config.cpp
+++ b/inference-engine/src/mkldnn_plugin/config.cpp
@@ -1,16 +1,23 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
+// avoiding clash of the "max" macro with std::max
+#define NOMINMAX
+
#include "config.h"
#include "ie_plugin_config.hpp"
#include "ie_common.h"
#include <string>
+#include <cstring>
#include <map>
#include <algorithm>
+#include <stdexcept>
+
#include <cpp_interfaces/exception2status.hpp>
+#include <thread>
+#include "mkldnn/omp_manager.h"
namespace MKLDNNPlugin {
@@ -44,6 +51,42 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
else
THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_EXCLUSIVE_ASYNC_REQUESTS
<< ". Expected only YES/NO";
+ } else if (key == PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS) {
+ if (val == PluginConfigParams::CPU_THROUGHPUT_NUMA) {
+ throughputStreams = MKLDNNPlugin::cpu::getNumberOfCPUSockets();
+ } else if (val == PluginConfigParams::CPU_THROUGHPUT_AUTO) {
+ // bare minimum of streams (that evenly divides available number of core)
+ const int num_cores = std::thread::hardware_concurrency();
+ if (0 == num_cores % 4)
+ throughputStreams = std::max(4, num_cores / 4);
+ else if (0 == num_cores % 5)
+ throughputStreams = std::max(5, num_cores / 5);
+ else if (0 == num_cores % 3)
+ throughputStreams = std::max(3, num_cores / 3);
+ else // if user disables some cores say in BIOS, so we got weird #cores which is not easy to divide
+ throughputStreams = 1;
+ } else {
+ int val_i;
+ try {
+ val_i = std::stoi(val);
+ } catch (const std::exception&) {
+ THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS
+ << ". Expected only positive numbers (#streams) or "
+ << "PluginConfigParams::CPU_THROUGHPUT_NUMA/CPU_THROUGHPUT_AUTO";
+ }
+ if (val_i > 0)
+ throughputStreams = val_i;
+ }
+ } else if (key == PluginConfigParams::KEY_CPU_THREADS_NUM) {
+ int val_i;
+ try {
+ val_i = std::stoi(val);
+ } catch (const std::exception&) {
+ THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_CPU_THREADS_NUM
+ << ". Expected only positive numbers (#threads)";
+ }
+ if (val_i > 0)
+ threadsNum = val_i;
} else if (key.compare(PluginConfigParams::KEY_DYN_BATCH_ENABLED) == 0) {
if (val.compare(PluginConfigParams::YES) == 0)
enableDynamicBatch = true;
@@ -52,10 +95,15 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
else
THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_DYN_BATCH_ENABLED
<< ". Expected only YES/NO";
+ } else if (key.compare(PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT) == 0) {
+ // empty string means that dumping is switched off
+ dumpToDot = val;
} else {
THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported property " << key << " by CPU plugin";
}
}
+ if (exclusiveAsyncRequests) // Exclusive request feature disables the streams
+ throughputStreams = 1;
}
} // namespace MKLDNNPlugin
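For reference, the CPU_THROUGHPUT_AUTO branch above derives the stream count from std::thread::hardware_concurrency(), preferring core counts divisible by 4, then 5, then 3, and falling back to a single stream otherwise. A standalone sketch of just that heuristic (the real logic sits inside Config::readProperties above):

#include <algorithm>
#include <thread>

// Sketch of the CPU_THROUGHPUT_AUTO heuristic: a bare minimum of streams
// that evenly divides the available number of cores.
static int autoThroughputStreams() {
    const int num_cores = std::thread::hardware_concurrency();
    if (num_cores % 4 == 0) return std::max(4, num_cores / 4);
    if (num_cores % 5 == 0) return std::max(5, num_cores / 5);
    if (num_cores % 3 == 0) return std::max(3, num_cores / 3);
    return 1;  // awkward core counts (e.g. cores disabled in BIOS) fall back to one stream
}

On a 12-core machine this yields 4 streams, on a 10-core machine 5; any count that none of 4, 5 or 3 divides stays at a single stream.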
diff --git a/inference-engine/src/mkldnn_plugin/config.h b/inference-engine/src/mkldnn_plugin/config.h
index 0bb390c51..558ac87ae 100644
--- a/inference-engine/src/mkldnn_plugin/config.h
+++ b/inference-engine/src/mkldnn_plugin/config.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -15,7 +14,10 @@ struct Config {
bool collectPerfCounters = false;
bool exclusiveAsyncRequests = false;
bool enableDynamicBatch = false;
+ std::string dumpToDot = "";
int batchLimit = 0;
+ int throughputStreams = 1;
+ int threadsNum = 0;
void readProperties(const std::map<std::string, std::string> &config);
};
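The three new fields are filled in through the existing readProperties() path. A minimal usage sketch; the property keys come from ie_plugin_config.hpp as in config.cpp above, and the value "exec_graph" is only an example dump prefix:

#include <map>
#include <string>
#include "config.h"
#include "ie_plugin_config.hpp"

// Sketch: feed the new CPU keys through Config::readProperties().
MKLDNNPlugin::Config makeStreamedConfig() {
    using namespace InferenceEngine;
    MKLDNNPlugin::Config cfg;
    cfg.readProperties({
        {PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS, PluginConfigParams::CPU_THROUGHPUT_AUTO},
        {PluginConfigParams::KEY_CPU_THREADS_NUM,        "8"},
        {PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT, "exec_graph"},
    });
    // cfg.throughputStreams, cfg.threadsNum and cfg.dumpToDot now hold the parsed values;
    // setting KEY_EXCLUSIVE_ASYNC_REQUESTS to YES would force throughputStreams back to 1.
    return cfg;
}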
diff --git a/inference-engine/src/mkldnn_plugin/mean_image.cpp b/inference-engine/src/mkldnn_plugin/mean_image.cpp
index ff87e1466..f1ac17e9a 100644
--- a/inference-engine/src/mkldnn_plugin/mean_image.cpp
+++ b/inference-engine/src/mkldnn_plugin/mean_image.cpp
@@ -1,10 +1,10 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
#include "mean_image.h"
#include "ie_parallel.hpp"
+#include "ie_memcpy.h"
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
@@ -54,7 +54,8 @@ void MeanImage::Load(const MKLDNNDims& inputDims, InputInfo::Ptr inputInfo) {
THROW_IE_EXCEPTION << "mean image size does not match expected network input, expecting " << meanWidth << " x " << meanHeight;
}
// todo: cast to TBlob and make sure it is floats
- memcpy(meanBuffer->data() + channel*meanBlob->size(), meanBlob->buffer(), meanBlob->byteSize());
+ ie_memcpy(meanBuffer->data() + channel*meanBlob->size(), meanBuffer->byteSize() - channel*meanBlob->byteSize(),
+ meanBlob->buffer(), meanBlob->byteSize());
}
}
break;
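The plain memcpy above is replaced with ie_memcpy, which additionally receives the space remaining in the destination buffer. As an illustration of that four-argument shape only (a hypothetical stand-in, not the actual ie_memcpy from ie_memcpy.h):

#include <algorithm>
#include <cstring>

// Hypothetical bounded copy with the same argument order as the call above:
// destination, destination capacity, source, bytes requested.
static int bounded_memcpy(void* dst, std::size_t dst_size, const void* src, std::size_t count) {
    if (dst == nullptr || src == nullptr) return -1;
    std::memcpy(dst, src, std::min(count, dst_size));  // never overrun dst
    return count <= dst_size ? 0 : -1;                 // signal truncation
}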
diff --git a/inference-engine/src/mkldnn_plugin/mean_image.h b/inference-engine/src/mkldnn_plugin/mean_image.h
index c27d667e2..24dc8163a 100644
--- a/inference-engine/src/mkldnn_plugin/mean_image.h
+++ b/inference-engine/src/mkldnn_plugin/mean_image.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h
index 3bfae03a0..09ec76c42 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h
index 0643d99a7..b3ad3c0c5 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h
index 9cc57f3df..616f517aa 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp b/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp
index d0b911799..57b6edc35 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -13,10 +12,6 @@
namespace mkldnn {
-template <> struct handle_traits<mkldnn_primitive_desc_iterator_t> {
- static constexpr auto destructor = &mkldnn_primitive_desc_iterator_destroy;
-};
-
struct primitive_desc_iterator : public handle<mkldnn_primitive_desc_iterator_t> {
template <typename T>
primitive_desc_iterator(const T &adesc, const mkldnn::primitive_attr &aattr, const engine &aengine) {
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp
index 834f8bd6e..ff3616a44 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -33,6 +32,7 @@ impl_desc_type MKLDNNPlugin::parse_impl_name(std::string impl_desc_name) {
res = static_cast<impl_desc_type>(res | impl_desc_type::_key);
SEARCH_WORD_2(nchw, ref);
+ SEARCH_WORD_2(ncdhw, ref);
SEARCH_WORD_2(wino, winograd);
#undef SEARCH_WORD_2
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h
index 75a618927..45cca0402 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp
new file mode 100644
index 000000000..19bc513f6
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cstdlib>
+#include <cstring>
+#include "ie_parallel.hpp"
+#include "omp_manager.h"
+
+using namespace MKLDNNPlugin;
+namespace MKLDNNPlugin {
+namespace cpu {
+
+static const char *openMpEnvVars[] = {
+ "OMP_CANCELLATION", "OMP_DISPLAY_ENV", "OMP_DEFAULT_DEVICE", "OMP_DYNAMIC",
+ "OMP_MAX_ACTIVE_LEVELS", "OMP_MAX_TASK_PRIORITY", "OMP_NESTED",
+ "OMP_NUM_THREADS", "OMP_PROC_BIND", "OMP_PLACES", "OMP_STACKSIZE",
+ "OMP_SCHEDULE", "OMP_THREAD_LIMIT", "OMP_WAIT_POLICY", "GOMP_CPU_AFFINITY",
+ "GOMP_DEBUG", "GOMP_STACKSIZE", "GOMP_SPINCOUNT", "GOMP_RTEMS_THREAD_POOLS",
+ "KMP_AFFINITY", "KMP_NUM_THREADS", "MIC_KMP_AFFINITY",
+ "MIC_OMP_NUM_THREADS", "MIC_OMP_PROC_BIND", "PHI_KMP_AFFINITY",
+ "PHI_OMP_NUM_THREADS", "PHI_KMP_PLACE_THREADS", "MKL_NUM_THREADS",
+ "MKL_DYNAMIC", "MKL_DOMAIN_NUM_THREADS"
+};
+
+static const unsigned numberOfOpenMpEnvVars =
+ sizeof(openMpEnvVars) / sizeof(openMpEnvVars[0]);
+
+bool checkOpenMpEnvVars(bool includeOMPNumThreads) {
+ for (unsigned i = 0; i < numberOfOpenMpEnvVars; i++) {
+ if (getenv(openMpEnvVars[i])) {
+ if (0 != strcmp(openMpEnvVars[i], "OMP_NUM_THREADS") || includeOMPNumThreads)
+ return true;
+ }
+ }
+ return false;
+}
+
+#if !(defined(__APPLE__) || defined(_WIN32))
+// getNumberOfCPUSockets/getNumberOfCPUCores are implemented in the lin_omp_manager.cpp
+#else
+int getNumberOfCPUSockets() {return 1;}
+int getNumberOfCPUCores() {return parallel_get_max_threads();}
+#endif
+
+} // namespace cpu
+} // namespace MKLDNNPlugin
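The new omp_manager.cpp keeps only three entry points: getNumberOfCPUSockets() backs the CPU_THROUGHPUT_NUMA option in config.cpp above, getNumberOfCPUCores() sizes thread pools, and checkOpenMpEnvVars() reports whether the user already tuned the OpenMP/MKL runtime through the environment. A small usage sketch (output is host-dependent):

#include <cstdio>
#include "mkldnn/omp_manager.h"

int main() {
    using namespace MKLDNNPlugin::cpu;
    std::printf("sockets: %d, physical cores: %d\n",
                getNumberOfCPUSockets(), getNumberOfCPUCores());
    // True when any OMP_*/KMP_*/MKL_* variable from the list above is set.
    std::printf("OpenMP env overrides present: %s\n",
                checkOpenMpEnvVars() ? "yes" : "no");
    return 0;
}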
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h
index 26cba003e..65cc216e4 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -9,10 +8,15 @@
*/
#pragma once
-#ifdef _WIN32
- #include "mkldnn/os/win/win_omp_manager.h"
-#elif defined(__APPLE__)
- #include "mkldnn/os/osx/osx_omp_manager.h"
-#else
- #include "mkldnn/os/lin/lin_omp_manager.h"
-#endif
+namespace MKLDNNPlugin {
+namespace cpu {
+
+bool checkOpenMpEnvVars(bool includeOMPNumThreads = true);
+// numbers of CPU sockets in the machine (on Linux), 1 on all other OSes
+int getNumberOfCPUSockets();
+// numbers of CPU physical cores on Linux (which is considered to be more performance friendly for servers)
+// (on other OSes it simply relies on the original parallel API of choice, which usually use the logical cores )
+int getNumberOfCPUCores();
+
+} // namespace cpu
+} // namespace MKLDNNPlugin
\ No newline at end of file
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp
index 75f2e4c6b..14c3e1d1d 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp
@@ -1,10 +1,8 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
#include "lin_omp_manager.h"
-#include "ie_parallel.hpp"
#include <fstream>
#include <set>
#include <string>
@@ -19,20 +17,13 @@ namespace cpu {
Processor::Processor() {
processor = 0;
physicalId = 0;
- siblings = 0;
- coreId = 0;
cpuCores = 0;
- speedMHz = 0;
}
CpuInfo::CpuInfo() {
loadContentFromFile("/proc/cpuinfo");
}
-CpuInfo::CpuInfo(const char *content) {
- loadContent(content);
-}
-
void CpuInfo::loadContentFromFile(const char *fileName) {
std::ifstream file(fileName);
std::string content(
@@ -98,10 +89,6 @@ Collection::Collection(CpuInfoInterface *cpuInfo) : cpuInfo(*cpuInfo) {
collectBasicCpuInformation();
}
-unsigned Collection::getProcessorSpeedMHz() {
- return processors.size() ? processors[0].speedMHz : 0;
-}
-
unsigned Collection::getTotalNumberOfSockets() {
return totalNumberOfSockets;
}
@@ -114,10 +101,6 @@ unsigned Collection::getNumberOfProcessors() {
return processors.size();
}
-const Processor &Collection::getProcessor(unsigned processorId) {
- return processors[processorId];
-}
-
void Collection::parseCpuInfo() {
const char *cpuInfoLine = cpuInfo.getFirstLine();
for (; cpuInfoLine; cpuInfoLine = cpuInfo.getNextLine()) {
@@ -148,21 +131,9 @@ void Collection::parseValue(const char *fieldName, const char *valueString) {
currentProcessor->physicalId = parseInteger(valueString);
}
- if (beginsWith(fieldName, "siblings")) {
- currentProcessor->siblings = parseInteger(valueString);
- }
-
- if (beginsWith(fieldName, "core id")) {
- currentProcessor->coreId = parseInteger(valueString);
- }
-
if (beginsWith(fieldName, "cpu cores")) {
currentProcessor->cpuCores = parseInteger(valueString);
}
-
- if (beginsWith(fieldName, "model name")) {
- currentProcessor->speedMHz = extractSpeedFromModelName(valueString);
- }
}
void Collection::appendNewProcessor() {
@@ -184,32 +155,6 @@ unsigned Collection::parseInteger(const char *text) const {
return atol(text);
}
-/* Function extracts CPU speed from model name. If unit is not set it is
- assumed that values below 100 are specified in GHz, otherwise MHz */
-unsigned Collection::extractSpeedFromModelName(const char *text) const {
- text = strstr(text, "@");
- if (!text) {
- return 0;
- }
-
- char *unit;
- double speed = strtod(&text[1], &unit);
-
- while (isspace(*unit)) {
- unit++;
- }
-
- bool isMHz = !strncmp(unit, "MHz", 3);
- bool isGHz = !strncmp(unit, "GHz", 3);
- bool isGHzPossible = (speed < 100);
-
- if (isGHz || (isGHzPossible && !isMHz)) {
- return 1000 * speed + 0.5;
- } else {
- return speed + 0.5;
- }
-}
-
void Collection::collectBasicCpuInformation() {
std::set<unsigned> uniquePhysicalId;
std::vector<Processor>::iterator processor = processors.begin();
@@ -229,120 +174,27 @@ void Collection::updateCpuInformation(const Processor &processor,
totalNumberOfCpuCores += processor.cpuCores;
}
-
-/* The OpenMpManager class is responsible for determining a set of all of
- available CPU cores and delegating each core to perform other tasks. The
- first of available cores is delegated for background threads, while other
- remaining cores are dedicated for OpenMP threads. Each OpenMP thread owns
- one core for exclusive use. The number of OpenMP threads is then limited
- to the number of available cores minus one. The amount of CPU cores may
- be limited by system eg. when numactl was used. */
#include <sched.h>
-static const char *openMpEnvVars[] = {
- "OMP_CANCELLATION", "OMP_DISPLAY_ENV", "OMP_DEFAULT_DEVICE", "OMP_DYNAMIC",
- "OMP_MAX_ACTIVE_LEVELS", "OMP_MAX_TASK_PRIORITY", "OMP_NESTED",
- "OMP_NUM_THREADS", "OMP_PROC_BIND", "OMP_PLACES", "OMP_STACKSIZE",
- "OMP_SCHEDULE", "OMP_THREAD_LIMIT", "OMP_WAIT_POLICY", "GOMP_CPU_AFFINITY",
- "GOMP_DEBUG", "GOMP_STACKSIZE", "GOMP_SPINCOUNT", "GOMP_RTEMS_THREAD_POOLS",
- "KMP_AFFINITY", "KMP_NUM_THREADS", "MIC_KMP_AFFINITY",
- "MIC_OMP_NUM_THREADS", "MIC_OMP_PROC_BIND", "PHI_KMP_AFFINITY",
- "PHI_OMP_NUM_THREADS", "PHI_KMP_PLACE_THREADS", "MKL_NUM_THREADS",
- "MKL_DYNAMIC", "MKL_DOMAIN_NUM_THREADS"
-};
-
-static const unsigned numberOfOpenMpEnvVars =
- sizeof(openMpEnvVars) / sizeof(openMpEnvVars[0]);
-
-OpenMpManager::OpenMpManager(Collection *collection) :
- collection(*collection), isGpuEnabled(false) {
- getOpenMpEnvVars();
- getCurrentCpuSet();
- getCurrentCoreSet();
-}
-
-OpenMpManager &OpenMpManager::getInstance() {
+int getNumberOfCPUSockets() {
static CpuInfo cpuInfo;
static Collection collection(&cpuInfo);
- static OpenMpManager openMpManager(&collection);
- return openMpManager;
-}
-
-void OpenMpManager::setGpuEnabled() {
- OpenMpManager &openMpManager = getInstance();
- openMpManager.isGpuEnabled = true;
-}
-
-void OpenMpManager::setGpuDisabled() {
- OpenMpManager &openMpManager = getInstance();
- openMpManager.isGpuEnabled = false;
-}
-
-// Ideally bind given thread to secondary logical core, if
-// only one thread exists then bind to primary one
-void OpenMpManager::bindCurrentThreadToNonPrimaryCoreIfPossible() {
- OpenMpManager &openMpManager = getInstance();
- if (openMpManager.isThreadsBindAllowed()) {
- int totalNumberOfAvailableCores = CPU_COUNT(&openMpManager.currentCoreSet);
- int logicalCoreToBindTo = totalNumberOfAvailableCores > 1 ? 1 : 0;
- openMpManager.bindCurrentThreadToLogicalCoreCpus(logicalCoreToBindTo);
- }
+ return collection.getTotalNumberOfSockets();
}
-void OpenMpManager::bindOpenMpThreads(int env_cores) {
- OpenMpManager &openMpManager = getInstance();
-
- if (!openMpManager.isThreadsBindAllowed())
- return;
-
- openMpManager.setOpenMpThreadNumberLimit(env_cores);
- InferenceEngine::parallel_nt(0, [&] (unsigned logicalCoreId, int nthr) {
- openMpManager.bindCurrentThreadToLogicalCoreCpu(logicalCoreId);
- });
-}
-
-int OpenMpManager::getOpenMpThreadNumber() {
- OpenMpManager &openMpManager = getInstance();
-
- return openMpManager.getCoreNumber();
-}
-
-
-void OpenMpManager::getOpenMpEnvVars() {
- isAnyOpenMpEnvVarSpecified = false;
- for (unsigned i = 0; i < numberOfOpenMpEnvVars; i++) {
- if (getenv(openMpEnvVars[i])) {
- isAnyOpenMpEnvVarSpecified = true;
- }
- }
-}
-
-void OpenMpManager::getCurrentCpuSet() {
- if (sched_getaffinity(0, sizeof(currentCpuSet), &currentCpuSet)) {
- getDefaultCpuSet(&currentCpuSet);
- }
-}
-
-void OpenMpManager::getDefaultCpuSet(cpu_set_t *defaultCpuSet) {
- CPU_ZERO(defaultCpuSet);
- unsigned numberOfProcessors = collection.getNumberOfProcessors();
- for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
- CPU_SET(processorId, defaultCpuSet);
- }
-}
-
-/* Function getCurrentCoreSet() fills currentCoreSet variable with a set of
- available CPUs, where only one CPU per core is chosen. When multiple CPUs
- of single core are used, function is selecting only first one of all
- available. */
-void OpenMpManager::getCurrentCoreSet() {
+int getNumberOfCPUCores() {
+ static CpuInfo cpuInfo;
+ static Collection collection(&cpuInfo);
unsigned numberOfProcessors = collection.getNumberOfProcessors();
unsigned totalNumberOfCpuCores = collection.getTotalNumberOfCpuCores();
- cpu_set_t usedCoreSet;
+ cpu_set_t usedCoreSet, currentCoreSet, currentCpuSet;
+ CPU_ZERO(&currentCpuSet);
CPU_ZERO(&usedCoreSet);
CPU_ZERO(&currentCoreSet);
+ sched_getaffinity(0, sizeof(currentCpuSet), &currentCpuSet);
+
for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
if (CPU_ISSET(processorId, &currentCpuSet)) {
unsigned coreId = processorId % totalNumberOfCpuCores;
@@ -352,70 +204,9 @@ void OpenMpManager::getCurrentCoreSet() {
}
}
}
-}
-
-void OpenMpManager::selectAllCoreCpus(cpu_set_t *set, unsigned physicalCoreId) {
- unsigned numberOfProcessors = collection.getNumberOfProcessors();
- unsigned totalNumberOfCpuCores = collection.getTotalNumberOfCpuCores();
-
- int processorId = physicalCoreId % totalNumberOfCpuCores;
- while (processorId < numberOfProcessors) {
- if (CPU_ISSET(processorId, &currentCpuSet)) {
- CPU_SET(processorId, set);
- }
-
- processorId += totalNumberOfCpuCores;
- }
-}
-
-unsigned OpenMpManager::getPhysicalCoreId(unsigned logicalCoreId) {
- unsigned numberOfProcessors = collection.getNumberOfProcessors();
-
- for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
- if (CPU_ISSET(processorId, &currentCoreSet)) {
- if (!logicalCoreId--) {
- return processorId;
- }
- }
- }
-
- std::cerr << "This should never happen!";
- return 0;
-}
-
-bool OpenMpManager::isThreadsBindAllowed() {
- return !isAnyOpenMpEnvVarSpecified && !isGpuEnabled;
-}
-
-// Limit of threads to number of logical cores available
-void OpenMpManager::setOpenMpThreadNumberLimit(int env_cores) {
- parallel_set_num_threads(env_cores == 0 ? CPU_COUNT(&currentCoreSet) : 0);
-}
-
-int OpenMpManager::getCoreNumber() {
return CPU_COUNT(&currentCoreSet);
}
-void OpenMpManager::bindCurrentThreadToLogicalCoreCpu(unsigned logicalCoreId) {
- unsigned physicalCoreId = getPhysicalCoreId(logicalCoreId);
-#if IE_THREAD == IE_THREAD_OMP
- cpu_set_t set;
- CPU_ZERO(&set);
- CPU_SET(physicalCoreId, &set);
- sched_setaffinity(0, sizeof(set), &set);
-#endif
-}
-
-void OpenMpManager::bindCurrentThreadToLogicalCoreCpus(unsigned logicalCoreId) {
- unsigned physicalCoreId = getPhysicalCoreId(logicalCoreId);
-#if IE_THREAD == IE_THREAD_OMP
- cpu_set_t set;
- CPU_ZERO(&set);
- selectAllCoreCpus(&set, physicalCoreId);
- sched_setaffinity(0, sizeof(set), &set);
-#endif
-}
-
#endif // #ifndef APPLE
} // namespace cpu
} // namespace MKLDNNPlugin
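The rewritten getNumberOfCPUCores() intersects the process affinity mask with one representative logical CPU per physical core, using processorId % totalNumberOfCpuCores as the core id. A toy model of that selection for a hypothetical single-socket machine with 4 physical cores and 8 logical CPUs (numbers are illustrative only):

#include <cstdio>
#include <set>

// Toy model of the selection in getNumberOfCPUCores() above.
int main() {
    const unsigned numberOfProcessors = 8;     // logical CPUs visible to the process
    const unsigned totalNumberOfCpuCores = 4;  // physical cores from /proc/cpuinfo
    std::set<unsigned> usedCores;
    unsigned counted = 0;
    for (unsigned processorId = 0; processorId < numberOfProcessors; processorId++) {
        unsigned coreId = processorId % totalNumberOfCpuCores;  // CPUs 4..7 alias onto cores 0..3
        if (usedCores.insert(coreId).second)
            counted++;  // only the first sibling of each core is counted
    }
    std::printf("counted %u of %u logical CPUs as distinct cores\n", counted, numberOfProcessors);
    return 0;  // prints: counted 4 of 8 logical CPUs as distinct cores
}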
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h
index d39329a6b..dfd69bbb4 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -20,10 +19,7 @@ namespace cpu {
struct Processor {
unsigned processor;
unsigned physicalId;
- unsigned siblings;
- unsigned coreId;
unsigned cpuCores;
- unsigned speedMHz;
Processor();
};
@@ -41,8 +37,6 @@ class CpuInfo : public CpuInfoInterface {
public:
CpuInfo();
- explicit CpuInfo(const char *content);
-
virtual ~CpuInfo();
virtual const char *getFirstLine();
@@ -64,32 +58,17 @@ private:
class CollectionInterface {
public:
virtual ~CollectionInterface() {}
-
- virtual unsigned getProcessorSpeedMHz() = 0;
-
virtual unsigned getTotalNumberOfSockets() = 0;
-
- virtual unsigned getTotalNumberOfCpuCores() = 0;
-
- virtual unsigned getNumberOfProcessors() = 0;
-
- virtual const Processor &getProcessor(unsigned processorId) = 0;
};
class Collection : public CollectionInterface {
public:
explicit Collection(CpuInfoInterface *cpuInfo);
- virtual unsigned getProcessorSpeedMHz();
-
virtual unsigned getTotalNumberOfSockets();
-
virtual unsigned getTotalNumberOfCpuCores();
-
virtual unsigned getNumberOfProcessors();
- virtual const Processor &getProcessor(unsigned processorId);
-
private:
CpuInfoInterface &cpuInfo;
unsigned totalNumberOfSockets;
@@ -113,70 +92,11 @@ private:
unsigned parseInteger(const char *text) const;
- unsigned extractSpeedFromModelName(const char *text) const;
-
void collectBasicCpuInformation();
void updateCpuInformation(const Processor &processor,
unsigned numberOfUniquePhysicalId);
};
-
-
-class OpenMpManager {
-public:
- static void setGpuEnabled();
-
- static void setGpuDisabled();
-
- static void bindCurrentThreadToNonPrimaryCoreIfPossible();
-
- static void bindOpenMpThreads(int env_cores = 0);
-
- static int getOpenMpThreadNumber();
-
- static void printVerboseInformation();
-
- static bool isMajorThread(int currentThread);
-
-private:
- Collection &collection;
-
- bool isGpuEnabled;
- bool isAnyOpenMpEnvVarSpecified;
- cpu_set_t currentCpuSet;
- cpu_set_t currentCoreSet;
-
- explicit OpenMpManager(Collection *collection);
-
- OpenMpManager(const OpenMpManager &openMpManager);
-
- OpenMpManager &operator=(const OpenMpManager &openMpManager);
-
- static OpenMpManager &getInstance();
-
- void getOpenMpEnvVars();
-
- void getCurrentCpuSet();
-
- int getCoreNumber();
-
- void getDefaultCpuSet(cpu_set_t *defaultCpuSet);
-
- void getCurrentCoreSet();
-
- void selectAllCoreCpus(cpu_set_t *set, unsigned physicalCoreId);
-
- unsigned getPhysicalCoreId(unsigned logicalCoreId);
-
- bool isThreadsBindAllowed();
-
- void setOpenMpThreadNumberLimit(int env_cores);
-
- void bindCurrentThreadToLogicalCoreCpu(unsigned logicalCoreId);
-
- void bindCurrentThreadToLogicalCoreCpus(unsigned logicalCoreId);
-};
-
#endif // #ifndef __APPLE__
} // namespace cpu
} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h
deleted file mode 100644
index 0484bb571..000000000
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-/**
-* @brief WINAPI based code
-* @file win_omp_manager.h
-*/
-
-#pragma once
-
-#include <thread>
-#include <vector>
-
-namespace MKLDNNPlugin {
-namespace cpu {
-
-class OpenMpManager {
-public:
- static int getOpenMpThreadNumber() {
- return getCoreNumber();
- }
-
- static int getCoreNumber() {
- return 4;
- }
-};
-
-} // namespace cpu
-} // namespace MKLDNNPlugin
-
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h
deleted file mode 100644
index d59891679..000000000
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-/**
-* @brief WINAPI based code
-* @file win_omp_manager.h
-*/
-
-#pragma once
-
-#include <thread>
-#include <vector>
-#include <windows.h>
-
-namespace MKLDNNPlugin {
-namespace cpu {
-
-class OpenMpManager {
-public:
- static int getOpenMpThreadNumber() {
- return getCoreNumber();
- }
-
- static int getCoreNumber() {
- int num_cores = std::thread::hardware_concurrency();
- unsigned long size = 0;
-
- if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &size)) {
- if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
- std::vector<char> buf(size);
- SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* info
- = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(&buf.front());
- SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* ptr = info;
- if (GetLogicalProcessorInformationEx(RelationProcessorCore, info, &size)) {
- if (GetLastError() == ERROR_SUCCESS) {
- int num = 0;
- unsigned long offset = 0;
- while (offset < size) {
- num++;
- offset += ptr->Size;
- ptr = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(
- reinterpret_cast<byte*>(ptr) + ptr->Size);
- }
- num_cores = num;
- }
- }
- }
- }
- return num_cores;
- }
-};
-
-} // namespace cpu
-} // namespace MKLDNNPlugin
-
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp
index e117182dd..ea463a2d5 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h
index be18a41e2..447787f88 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
index 9ea3fe38c..bcb47419e 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
index 51a29c21e..dff072089 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_dims.h b/inference-engine/src/mkldnn_plugin/mkldnn_dims.h
index f707f268c..06616a8be 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_dims.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_dims.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
index 102955fbb..92c8c5ad3 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_edge.h b/inference-engine/src/mkldnn_plugin/mkldnn_edge.h
index 91c586b4a..f5364f614 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp
index c9ca08ab2..b362433eb 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h
index bd1e0d8d5..f3abd8b4a 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
index 681061a4d..3600ee56c 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
index fb7953f17..8b2994e5f 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
index 983fc2b35..9c079efd4 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -11,6 +10,7 @@
#include <limits>
#include <fstream>
#include <unordered_map>
+#include <memory>
#include "details/caseless.hpp"
#include "mkldnn_graph.h"
@@ -24,7 +24,6 @@
#include "mkldnn_extension_utils.h"
#include "mkldnn_extension_mngr.h"
#include "mkldnn/omp_manager.h"
-#include "ie_parallel.hpp"
#include <graph_tools.hpp>
#include <cpp_interfaces/ie_executor_manager.hpp>
#include "ie_algorithm.hpp"
@@ -33,21 +32,34 @@
#include "mkldnn_async_infer_request.h"
#include <blob_factory.hpp>
#include <ie_util_internal.hpp>
+#include <net_pass.h>
+
+#include <mkldnn_graph_dumper.h>
#include <data_stats.h>
-#include "../inference_engine/cnn_network_int8_normalizer.hpp"
+#include "cnn_network_int8_normalizer.hpp"
+#include "ie_memcpy.h"
#define XBYAK_NO_OP_NAMES
#define XBYAK_UNDEF_JNL
#include "../../thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h"
#include "cnn_network_stats_impl.hpp"
-// #define DEBUG_DUMP_PATH "/temp/path/dump/"
-// #define DEBUG_DUMP_NEW_FOLDER_PER_INFER
-#ifdef DEBUG_DUMP_PATH
-#include "../../thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp"
-#include <iomanip>
-// #define DEBUG_BMP_OUTPUT 1
+
+#include "utils/blob_dump.h"
+
+/*****************************************************
+ * Dump capability
+ * Specify path to dump folder in BLOB_DUMP_PATH
+ *****************************************************/
+// #define BLOB_DUMP_PATH "dump"
+
+#ifdef BLOB_DUMP_PATH
+# define DUMP_DIR BLOB_DUMP_PATH
+# define ENABLE_DUMP(_x) { _x ;}
+#else
+# define DUMP_DIR ""
+# define ENABLE_DUMP(_x)
#endif
using namespace mkldnn;
@@ -56,37 +68,11 @@ using namespace MKLDNNPlugin::cpu;
using namespace InferenceEngine;
using namespace InferenceEngine::details;
-void BindThreads(mkldnn::engine eng) {
- static bool alreadyBind = false;
- if (!alreadyBind) {
-#if IE_THREAD == IE_THREAD_OMP
- int env_cores = 0;
- if (getenv("OMP_NUM_THREADS") != nullptr) {
- try {
- env_cores = std::stoi(std::string(getenv("OMP_NUM_THREADS")));
- } catch (...) {
- env_cores = 0;
- }
- }
-#if !(defined(__APPLE__) || defined(_WIN32))
- OpenMpManager::setGpuDisabled();
- OpenMpManager::bindOpenMpThreads(env_cores);
-#else
- int num_cores = env_cores == 0 ? OpenMpManager::getOpenMpThreadNumber() : env_cores;
- parallel_set_num_threads(num_cores);
-#endif
-#endif
- alreadyBind = true;
- }
-}
-
-void MKLDNNGraph::CreateGraph(ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
+void MKLDNNGraph::CreateGraph(const ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
if (IsReady()) {
ForgetGraphData();
}
- if (config.useThreadBinding) BindThreads(eng);
-
// go over the inputs and create input primitives
InputsDataMap inputs;
network.getInputsInfo(inputs);
@@ -273,6 +259,9 @@ void MKLDNNGraph::CreateGraph(ICNNNetwork &network, const MKLDNNExtensionManager
CreatePrimitives();
+ // Will do it before cleanup. Because it will lose original layers information
+ if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_init.dot");
+
for (auto &graphNode : graphNodes) {
graphNode->cleanup();
}
@@ -378,15 +367,31 @@ void MKLDNNGraph::ParseNode(const CNNLayerPtr& cnnLayer, MKLDNNNodePtr& parent,
if (exists)
return;
+ if (cnnLayer->blobs.find("ext-scale") != cnnLayer->blobs.end())
+ node->ext_scales = cnnLayer->blobs["ext-scale"];
+
graphNodes.push_back(node);
size_t count_out = 0;
+ std::vector<ParsedLayer> remaining;
for (const auto &layer : cnnLayer->outData) {
+ bool first = true;
for (const auto &data : layer->getInputTo()) {
- queuelayers.push_back({node, data.second, count_out});
+ if (first) {
+ queuelayers.push_back({node, data.second, count_out});
+ first = false;
+ } else {
+ // TODO: Just to hide bug with port ordering.
+ // At first step we visit only first connection
+ // at port. As second we will visit all remaining.
+ //
+ // Not first connection to the port are stored here
+ remaining.push_back({node, data.second, count_out});
+ }
}
count_out++;
}
+ queuelayers.insert(queuelayers.end(), remaining.begin(), remaining.end());
}
void MKLDNNGraph::InitNodes() {
@@ -416,58 +421,6 @@ void MKLDNNGraph::InitEdges() {
if (MKLDNNMemoryDesc(parentDesc).getFormat() != MKLDNNMemoryDesc(childDesc).getFormat()) {
inArgs += (inArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(parentDesc).getFormat());
outArgs += (outArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(childDesc).getFormat());
- } else if (inArgs.empty() && outArgs.empty()) {
- // This detailed name disabled by request from ICV team
-#if 0
- auto parentBlk = parentDesc.getBlockingDesc();
- auto childBlk = childDesc.getBlockingDesc();
- std::string order_in, order_out, stride_in, stride_out, dims_in, dims_out, off_in, off_out;
- for (size_t i = 0; i < parentBlk.getBlockDims().size(); i++) {
- if (i) {
- stride_in += ",";
- order_in += ",";
- dims_in += ",";
- off_in += ",";
- }
- stride_in += std::to_string(parentBlk.getStrides()[i]);
- order_in += std::to_string(parentBlk.getOrder()[i]);
- dims_in += std::to_string(parentBlk.getBlockDims()[i]);
- off_in += std::to_string(parentBlk.getOffsetPaddingToData()[i]);
- }
- for (size_t i = 0; i < childBlk.getBlockDims().size(); i++) {
- if (i) {
- stride_out += ",";
- order_out += ",";
- dims_out += ",";
- off_out += ",";
- }
- stride_out += std::to_string(childBlk.getStrides()[i]);
- order_out += std::to_string(childBlk.getOrder()[i]);
- dims_out += std::to_string(childBlk.getBlockDims()[i]);
- off_out += std::to_string(childBlk.getOffsetPaddingToData()[i]);
- }
-
- if (parentBlk.getOffsetPadding() != childBlk.getOffsetPadding()) {
- inArgs += (inArgs.empty() ? "" : "_") + std::string("off:") + std::to_string(parentBlk.getOffsetPadding());
- outArgs += (outArgs.empty() ? "" : "_") + std::string("off:") + std::to_string(childBlk.getOffsetPadding());
- }
- if (parentBlk.getStrides() != childBlk.getStrides()) {
- inArgs += (inArgs.empty() ? "" : "_") + std::string("str:") + stride_in;
- outArgs += (outArgs.empty() ? "" : "_") + std::string("str:") + stride_out;
- }
- if (parentBlk.getOrder() != childBlk.getOrder()) {
- inArgs += (inArgs.empty() ? "" : "_") + std::string("ord:") + order_in;
- outArgs += (outArgs.empty() ? "" : "_") + std::string("ord:") + order_out;
- }
- if (parentBlk.getBlockDims() != childBlk.getBlockDims()) {
- inArgs += (inArgs.empty() ? "" : "_") + std::string("dim:") + dims_in;
- outArgs += (outArgs.empty() ? "" : "_") + std::string("dim:") + dims_out;
- }
- if (parentBlk.getOffsetPaddingToData() != childBlk.getOffsetPaddingToData()) {
- inArgs += (inArgs.empty() ? "" : "_") + std::string("offs:") + off_in;
- outArgs += (outArgs.empty() ? "" : "_") + std::string("offs:") + off_out;
- }
-#endif
}
return inArgs + "_" + outArgs;
};
@@ -529,7 +482,7 @@ static inline bool isConstOutput(MKLDNNEdgePtr edge) {
void MKLDNNGraph::AllocateWithReuse() {
std::vector<std::vector<MKLDNNEdgePtr>> edge_clasters;
- // detect edge clasters which are view on one.
+ // detect edge clusters which are view on one.
for (auto &edge : graphEdges) {
MKLDNNEdgePtr par = (edge->getStatus() == MKLDNNEdge::Status::NotAllocated)
? edge->getSharedEdge()
@@ -606,7 +559,7 @@ void MKLDNNGraph::AllocateWithReuse() {
int e_size = block_desk.getOffsetPadding() + 1; // size in elements (from begin of data to last element)
for (int j = 0; j < block_desk.getBlockDims().size(); j++)
- e_size += (block_desk.getBlockDims()[j] - 1 ) * block_desk.getStrides()[j];
+ e_size += (block_desk.getBlockDims()[j] - 1) * block_desk.getStrides()[j];
box.start = std::min(e_start, box.start);
box.finish = std::max(e_finish, box.finish);
@@ -754,139 +707,9 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
MB_to_process = std::min<int>(config.batchLimit, MB_to_process);
size_t size_to_copy = intr_blob.GetSize() * MB_to_process / MB;
- memcpy(ext_blob_ptr, intr_blob_ptr, size_to_copy);
- }
-}
-
-#ifdef DEBUG_BMP_OUTPUT
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include "../../thirdparty/stb_lib/stb_image_write.h"
-
-#if defined(_WIN32)
-#define mkdir(dir, mode) _mkdir(dir)
-#endif
-
-void dump_as_bitmaps(const std::string name, const float* data,
- const SizeVector& cdims,
- mkldnn::impl::memory_format_t format = mkldnn::impl::memory_format::nchw) {
- std::string dir_name = name + "_bmp_dir/";
- mkdir(dir_name.c_str(), 0755);
-
- std::ofstream layer_bmp_log;
- layer_bmp_log.open(dir_name + "bmp_dump_log.txt");
- layer_bmp_log << "Format " << format << std::endl;
-
- if (cdims.size() == 1) {
- layer_bmp_log << "Only one dimension: " << cdims[0] << std::endl;
- layer_bmp_log.close();
- return;
- }
-
- SizeVector dims(cdims.rbegin(), cdims.rend());
-
- size_t x = dims[0], y = dims[1], total_images = 1;
- size_t img_sz = x*y;
-
- for (size_t k = 0; k < dims.size(); ++k)
- if (dims[k])
- total_images *= dims[k];
-
- total_images /= img_sz;
-
- // sanity checks
- if (img_sz < 100) {
- layer_bmp_log << "Image size is too small" << std::endl;
- layer_bmp_log.close();
- return;
- } else if (x < 10 || y < 10 || x > 2048 || y > 2048) {
- layer_bmp_log << "Dimensions are unapropriate to dump - " << y << "x" << x << std::endl;
- layer_bmp_log.close();
- return;
- } else {
- float ratio = static_cast<float>(x) / static_cast<float>(y);
- if (ratio < 1.0) ratio = 1.0 / ratio;
-
- if (ratio > 8.f) {
- layer_bmp_log << "Suspicious aspect ratio - " << ratio << std::endl;
- layer_bmp_log.close();
- return;
- }
- }
-
- layer_bmp_log << total_images << " images to write ..." << std::endl;
-
- const float* dataPtr = data;
- for (size_t img = 0; img < total_images; img++) {
- std::string img_name = "img" + std::to_string(img) + ".bmp";
-
- // copy image plane to separate buffer,
- // normalize and convert to 3-channel 8-bit bmp
- std::vector<float> imgbuf(img_sz);
- int stride = 1;
- switch (format) {
- case mkldnn::impl::memory_format::nChw8c:
- stride = 8;
- break;
- case mkldnn::impl::memory_format::nChw16c:
- stride = 16;
- break;
- case mkldnn::impl::memory_format::nchw:
- default:
- break;
- }
-
- float maxval = -FLT_MAX, minval = FLT_MAX;
- for (size_t i = 0; i < y; i++)
- for (size_t j = 0; j < x; j++) {
- float val = dataPtr[(i*x + j) * stride];
- if (val > maxval) maxval = val;
- if (val < minval) minval = val;
- imgbuf[i*x + j] = val;
- }
-
- if (minval >= 0.f && maxval <= 0.f) {
- layer_bmp_log << img_name << " all zero." << std::endl;
- } else {
- const float mult = 256.f / (maxval - minval);
- std::vector<unsigned char> bmpbuf(img_sz * 3);
- unsigned char* bmp_ptr = bmpbuf.data();
-
- for (int i = 0; i < imgbuf.size(); i++, bmp_ptr += 3) {
- if (imgbuf[i] >= 0.f && imgbuf[i] <= 0.f) {
- bmp_ptr[0] = 65;
- bmp_ptr[1] = bmp_ptr[2] = 0;
- } else {
- bmp_ptr[0] = bmp_ptr[1] = bmp_ptr[2] = (unsigned char)((imgbuf[i] - minval) * mult);
- }
- }
-
- // write bmp file
- std::string full_name = dir_name + img_name;
- stbi_write_bmp(full_name.c_str(), x, y, 3, (const void *)bmpbuf.data());
- }
-
- switch (format) {
- case mkldnn::impl::memory_format::nChw8c:
- if ( ( img & 7 ) < 7 ) dataPtr++;
- else dataPtr += img_sz * 8;
- break;
- case mkldnn::impl::memory_format::nChw16c:
- if ( ( img & 15 ) < 15 ) dataPtr++;
- else dataPtr += img_sz * 16;
- break;
- case mkldnn::impl::memory_format::nchw:
- default:
- dataPtr += img_sz;
- break;
- }
+ ie_memcpy(ext_blob_ptr, ext_blob->byteSize(), intr_blob_ptr, size_to_copy);
}
-
- layer_bmp_log.close();
}
-#endif
void MKLDNNGraph::Infer(int batch) {
if (!IsReady()) {
@@ -894,175 +717,20 @@ void MKLDNNGraph::Infer(int batch) {
}
mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
-#ifdef DEBUG_DUMP_NEW_FOLDER_PER_INFER
- static int folderIdx = 0;
- folderIdx++;
-#endif
for (int i = 0; i < graphNodes.size(); i++) {
PERF(graphNodes[i]);
if (batch > 0)
graphNodes[i]->setDynamicBatchLim(batch);
+ ENABLE_DUMP(do_before(DUMP_DIR, graphNodes[i]));
+
if (!graphNodes[i]->isConstant()) {
IE_PROFILING_AUTO_SCOPE_TASK(graphNodes[i]->profilingTask)
graphNodes[i]->execute(stream);
}
-#ifdef DEBUG_DUMP_PATH
- {
- auto folderName = std::string(DEBUG_DUMP_PATH) +
-#ifdef DEBUG_DUMP_NEW_FOLDER_PER_INFER
- std::to_string(folderIdx - 1) +
-#endif
- "/";
- std::cout << "Try to create logs for " << graphNodes[i]->getName() << std::endl;
- std::string nodeName = graphNodes[i]->name;
- std::replace(nodeName.begin(), nodeName.end(), '/', '_');
- std::ofstream layer_data_dump;
- for (size_t j = 0; j < graphNodes[i]->getChildEdges().size(); j++) {
- auto childEdge = graphNodes[i]->getChildEdgeAt(j);
- std::string childName = graphNodes[i]->getChildEdgeAt(j)->getChild()->getName();
- std::replace(childName.begin(), childName.end(), '/', '_');
-
- // std::string fname = DEBUG_DUMP_PATH + nodeName + "_dst_" + childName + "_" + std::to_string(j) + ".txt";
- std::string tname = folderName + nodeName + "_dst_" + childName + "_" + std::to_string(j);
- std::string fname = tname + ".txt";
- if (graphNodes[i]->getChildEdges().size() == 1) {
- fname = folderName + nodeName + "_dst.txt";
- }
- layer_data_dump.open(fname);
- if (layer_data_dump.is_open()) {
- float *data = static_cast<float *>(childEdge->getMemory().GetData());
- mkldnn::impl::memory_desc_wrapper dst_d(childEdge->getMemory().GetDescriptor().data);
- #ifdef DEBUG_BMP_OUTPUT
- dump_as_bitmaps(tname, data, childEdge->getDims().ToSizeVector(), dst_d.format());
- #endif
-
- layer_data_dump << "shape: ";
- for (size_t d = 0; d < childEdge->getDims().ndims(); d++)
- layer_data_dump << childEdge->getDims()[d] << " ";
- layer_data_dump << "(" << dst_d.nelems() << ")" << std::endl;
- if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::FP32) {
- float *data = childEdge->getBlob()->buffer();
- for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
- layer_data_dump << std::fixed << std::setprecision(3) << data[dst_d.off_l(bs)] << std::endl;
- }
- } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::I8) {
- int8_t *data = childEdge->getBlob()->buffer();
- for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
- layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
- }
- } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::U8) {
- uint8_t *data = childEdge->getBlob()->buffer();
- for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
- layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
- }
- } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::I32) {
- int32_t *data = childEdge->getBlob()->buffer();
- for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
- layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
- }
- }
-
- layer_data_dump.close();
- } else {
- std::cout << "Cannot create file " << fname << std::endl;
- }
- }
-
- for (size_t p = 0 ; p < graphNodes[i]->getParentEdges().size(); p++) {
- auto parentEdge = graphNodes[i]->getParentEdgeAt(p);
- auto parent = parentEdge->getParent();
- std::string parentName = parent->getName();
- std::replace(parentName.begin(), parentName.end(), '/', '_');
- // std::string fname = folderName + nodeName + "_src_" + parentName + "_" + std::to_string(p) + ".txt";
- std::string tname = folderName + nodeName + "_src_" + parentName + "_" + std::to_string(p);
- std::string fname = tname + ".txt";
- layer_data_dump.open(fname);
- if (layer_data_dump.is_open()) {
- size_t dataSize = graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetSize();
- mkldnn::impl::memory_desc_wrapper src_d(graphNodes[i]->getParentEdges()[p]
- .lock()->getMemory().GetDescriptor().data);
- #ifdef DEBUG_BMP_OUTPUT
- dump_as_bitmaps(tname, data, parentEdge->getDims().ToSizeVector(), src_d.format());
- #endif
- layer_data_dump << "shape: ";
- for (size_t d = 0; d < parentEdge->getDims().ndims(); d++)
- layer_data_dump << parentEdge->getDims()[d] << " ";
- layer_data_dump << "(" << src_d.nelems() << ")"<< std::endl;
- auto precision = graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision();
- if (precision == Precision::FP32) {
- float *data = static_cast<float *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
- for (size_t bs = 0; bs < dataSize; bs++) {
- layer_data_dump << std::fixed << std::setprecision(3) << data[src_d.off_l(bs)] << std::endl;
- }
- } else if (precision == Precision::I8) {
- int8_t *data = static_cast<int8_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
- for (size_t bs = 0; bs < dataSize; bs++) {
- layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
- }
- } else if (graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision() == Precision::U8) {
- uint8_t *data = static_cast<uint8_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
- for (size_t bs = 0; bs < dataSize; bs++) {
- layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
- }
- } else if (graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision() == Precision::I32) {
- int32_t *data = static_cast<int32_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
- for (size_t bs = 0; bs < dataSize; bs++) {
- layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
- }
- } else {
- layer_data_dump << "Unsupported precision: " << precision.name() << std::endl;
- }
-
- layer_data_dump.close();
- } else {
- std::cout << "Cannot create file " << fname << std::endl;
- }
- }
-
- GenericLayer* genericLayer = dynamic_cast<GenericLayer*>(graphNodes[i]->getCnnLayer().get());
- if (genericLayer != nullptr) {
- for (auto blob : genericLayer->blobs) {
- layer_data_dump.open(folderName + nodeName + "_blob-" + blob.first + ".txt");
- if (layer_data_dump.is_open()) {
- layer_data_dump << "shape: ";
- for (size_t d = 0; d < blob.second->dims().size(); d++)
- layer_data_dump << blob.second->dims()[d] << " ";
- layer_data_dump << "(" << blob.second->size() << ")"<< std::endl;
- if (blob.second->getTensorDesc().getPrecision() == Precision::FP32) {
- float *data = blob.second->buffer();
- for (size_t bs = 0; bs < blob.second->size(); bs++) {
- layer_data_dump << std::fixed << std::setprecision(3) << data[bs] << std::endl;
- }
- } else if (blob.second->getTensorDesc().getPrecision() == Precision::I8) {
- int8_t *data = blob.second->buffer();
- for (size_t bs = 0; bs < blob.second->size(); bs++) {
- layer_data_dump << static_cast<int>(data[bs]) << std::endl;
- }
- } else if (blob.second->getTensorDesc().getPrecision() == Precision::U8) {
- uint8_t *data = blob.second->buffer();
- for (size_t bs = 0; bs < blob.second->size(); bs++) {
- layer_data_dump << static_cast<int>(data[bs]) << std::endl;
- }
- } else if (blob.second->getTensorDesc().getPrecision() == Precision::I32) {
- int32_t *data = blob.second->buffer();
- for (size_t bs = 0; bs < blob.second->size(); bs++) {
- layer_data_dump << static_cast<int>(data[bs]) << std::endl;
- }
- } else {
- layer_data_dump << "Unsupported precision: " << blob.second->getTensorDesc().getPrecision().name() << std::endl;
- }
- layer_data_dump.close();
- } else {
- std::cout << "Cannot create file " << folderName << nodeName
- << "_" << blob.first << ".txt" << std::endl;
- }
- }
- }
- }
-#endif
+ ENABLE_DUMP(do_after(DUMP_DIR, graphNodes[i]));
}
}
@@ -1153,6 +821,8 @@ void MKLDNNGraph::GetPerfData(std::map<std::string, InferenceEngine::InferenceEn
for (int i = 1; i < graphNodes.size(); i++) {
getPerfMapFor(perfMap, graphNodes[i]);
}
+
+ if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_perf.dot");
}
void MKLDNNGraph::setConfig(const Config &cfg) {
@@ -1257,7 +927,56 @@ void MKLDNNGraph::RemoveDroppedEdges() {
}
}
-bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network) const {
+void MKLDNNGraph::dumpToDotFile(std::string file) const {
+ std::ofstream dot;
+ dot.open(file);
+ if (!dot.is_open()) THROW_IE_EXCEPTION << "CPU Plugin cannot create dot file " << file << ".";
+
+ dump_graph_as_dot(*this, dot);
+ dot.close();
+}
+
+void MKLDNNGraph::do_before(const std::string &dir, const MKLDNNNodePtr &node) {
+ auto exec_order = std::to_string(node->execIndex);
+ std::string nodeName = node->name;
+ std::replace(nodeName.begin(), nodeName.end(), '/', '_');
+
+ auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size();
+ for (size_t i = 0; i < num_ports; i++) {
+ auto prEdge = node->getParentEdgeAt(i);
+ auto pr = prEdge->getParent();
+
+ auto dump_file = dir + "/#" + exec_order + "_" + nodeName + "_in" + std::to_string(i) + ".ieb";
+ TensorDesc desc = prEdge->getDesc();
+ Blob::Ptr blob = make_blob_with_precision(desc, prEdge->getMemoryPtr()->GetData());
+
+ BlobDumper dumper(blob);
+ if (pr->ext_scales) dumper.withScales(pr->ext_scales);
+ dumper.dump(dump_file);
+ }
+}
+
+void MKLDNNGraph::do_after(const std::string &dir, const MKLDNNNodePtr &node) {
+ auto exec_order = std::to_string(node->execIndex);
+ auto nodeName = node->name;
+ std::replace(nodeName.begin(), nodeName.end(), '/', '_');
+
+ auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size();
+ for (size_t i = 0; i < num_ports; i++) {
+ auto childEdge = node->getChildEdgeAt(i);
+
+ auto dump_file = dir + "/#" + exec_order + "_" + nodeName + "_out" + std::to_string(i) + ".ieb";
+ TensorDesc desc = childEdge->getDesc();
+ Blob::Ptr blob = make_blob_with_precision(desc, childEdge->getMemoryPtr()->GetData());
+
+ BlobDumper dumper(blob);
+ if (node->ext_scales) dumper.withScales(node->ext_scales);
+
+ dumper.dump(dump_file);
+ }
+}
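The do_before()/do_after() helpers above implement the per-node blob dumping that Infer() now triggers through ENABLE_DUMP(...): one .ieb file per input/output port, named "#<exec_order>_<node>_in|out<port>.ieb" and written by BlobDumper. The ENABLE_DUMP macro and the DUMP_DIR path are defined elsewhere in this file and are not part of this hunk; the snippet below is only a sketch of how such a compile-time switch could look (the macro body and example path are assumptions, not the plugin's actual definitions):

    // Hypothetical compile-time switch for the dump calls used in Infer():
    // #define DUMP_DIR "/tmp/cpu_dumps"   // example path; dumping disabled when undefined
    #ifdef DUMP_DIR
    #   define ENABLE_DUMP(_x) do { _x; } while (0)
    #else
    #   define ENABLE_DUMP(_x)
    #endif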
+
+bool MKLDNNExecNetwork::CanProcessDynBatch(const InferenceEngine::ICNNNetwork &network) const {
InputsDataMap inputs;
network.getInputsInfo(inputs);
@@ -1274,6 +993,11 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network
bool check_result = true;
details::UnorderedDFS(allLayers, secondLayers.begin()->second, [&](CNNLayerPtr layer) {
auto type = TypeFromName(layer->type);
+ // This is a workaround (WA) for the Tile layer
+ auto tileLayer = dynamic_cast<TileLayer *>(layer.get());
+ if (tileLayer && tileLayer->axis)
+ return;
+
if (type != Input &&
type != Output &&
type != Convolution &&
@@ -1283,6 +1007,7 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network
type != Lrn &&
type != Pooling &&
type != FullyConnected &&
+ type != Gemm &&
type != SoftMax &&
type != Split &&
type != Concatenation &&
@@ -1301,55 +1026,87 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network
InferenceEngine::InferRequestInternal::Ptr
MKLDNNExecNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
InferenceEngine::OutputsDataMap networkOutputs) {
- return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs);
+ if (graphs.size() > 1) // streams use special requests that are not connected to graphs
+ return std::make_shared<MKLDNNGraphlessInferRequest>(networkInputs, networkOutputs);
+ else
+ return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs);
}
-MKLDNNExecNetwork::MKLDNNExecNetwork(InferenceEngine::ICNNNetwork &network,
+MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network,
const Config &cfg,
const MKLDNNExtensionManager::Ptr& extMgr) : extensionManager(extMgr) {
- graph.reset(new MKLDNNGraph());
- graph->setConfig(cfg);
+ ICNNNetworkStats* pstats = nullptr;
+ StatusCode s = network.getStats(&pstats, nullptr);
+ // We clone the network if we have statistics and can transform it;
+ // otherwise we pass the original network, in particular because LSTM
+ // networks are not cloned properly.
+ details::CNNNetworkImplPtr clonedNetwork;
+ if (s == StatusCode::OK && pstats && !pstats->isEmpty()) {
+ CNNNetworkInt8Normalizer cnnorm;
+ clonedNetwork = cloneNet(network);
+ cnnorm.NormalizeNetwork(*clonedNetwork, *pstats);
+ }
+ bool ti_proc_ok = !NetPass::CombineLSTMSeq(network) ? NetPass::UnrollTI(network) : true;
+ if (!ti_proc_ok)
+ THROW_IE_EXCEPTION << "Plugin doesn't support Tensor Iterator in pure form. "
+ "None TI optimization pattern has been applied successfully";
+
if (cfg.batchLimit > 1) {
// check topology for applicability
- if (!CanProcessDynBatch(network)) {
+ if (!CanProcessDynBatch(clonedNetwork ? *clonedNetwork : network)) {
THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!";
}
}
+ // check whether any (affinity-related) env variables are set and whether the user requested thread binding
+ const bool bPinningRequested = !check_env_variables() && cfg.useThreadBinding;
+ // general #threads logic
+ const int env_threads = parallel_get_env_threads();
+ // streams need all (logical) cores, while the single-stream case uses just the physical cores (better for servers)
+ const int hw_cores = cfg.throughputStreams > 1 ? parallel_get_max_threads() : getNumberOfCPUCores();
+ const int threads = cfg.threadsNum ? cfg.threadsNum : (env_threads ? env_threads : hw_cores);
+ const int threads_per_stream = std::max(1, threads/cfg.throughputStreams);
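The thread-count selection above resolves in a fixed precedence: an explicit cfg.threadsNum wins, then the environment-provided value, then the detected core count; the result is split evenly across the requested streams and never drops below one thread per stream. A minimal standalone sketch with illustrative numbers (the helper name and example values are not from the plugin):

    #include <algorithm>

    // Precedence: explicit config > environment > detected cores, then split per stream.
    static int threads_per_stream(int cfg_threads, int env_threads, int hw_cores, int streams) {
        const int threads = cfg_threads ? cfg_threads : (env_threads ? env_threads : hw_cores);
        return std::max(1, threads / streams);
    }
    // Example: cfg_threads = 0, env_threads = 0, hw_cores = 8, streams = 4  ->  2 threads per stream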
+
+ // graph(s) are initialized in taskExecutor threads (one per stream), in parallel when several streams are requested
+ std::vector<Task::Ptr> tasks;
+
+ for (int n = 0; n < cfg.throughputStreams; n++) {
+ MKLDNNGraph::Ptr _graph = std::make_shared<MKLDNNGraph>();
+ graphs.push_back(_graph);
+ auto task = std::make_shared<InferenceEngine::Task>([=, &cfg, &network]() {
+ _graph->CreateArena(threads_per_stream);
+
+ if (bPinningRequested) {
+ _graph->CreateObserver(n, threads_per_stream);
+ }
- if (graph->getProperty().exclusiveAsyncRequests) {
- ExecutorManager *executorManager = ExecutorManager::getInstance();
- _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eCPU));
+ _graph->setConfig(cfg);
+ _graph->CreateGraph(clonedNetwork ? *clonedNetwork : network, extensionManager);
+ if (cfg.throughputStreams > 1) // for streams, each worker thread has its own graph
+ MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph = _graph;
+ });
+ tasks.push_back(task);
}
- // initialization in taskExecutor thread
- auto task = std::make_shared<InferenceEngine::Task>([&]() {
- // we are cloning network if we have statistics and we can transform network
- // in other case we pass original network. Especially because LSTM networks
- // are not cloned properly
- ICNNNetworkStats* pstats = nullptr;
- StatusCode s = network.getStats(&pstats, nullptr);
- Xbyak::util::Cpu cpu;
- // Enable int8 only for avx512
- if (s == StatusCode::OK && pstats && !pstats->isEmpty() && cpu.has(Xbyak::util::Cpu::tAVX512F)) {
- details::CNNNetworkImplPtr clonnedNetwork = cloneNet(network);
- CNNNetworkInt8Normalizer cnnorm;
- cnnorm.NormalizeNetwork(*clonnedNetwork, *pstats);
- graph->CreateGraph(*clonnedNetwork, extensionManager);
- } else {
- graph->CreateGraph(network, extensionManager);
+ if (cfg.throughputStreams > 1) {
+ // special executor with as many threads as the requested #streams, each with its own initialization task
+ _taskExecutor = std::make_shared<MultiWorkerTaskExecutor>(tasks);
+ } else {
+ if (cfg.exclusiveAsyncRequests) {
+ // special case when all InferRequests are muxed into a single queue
+ ExecutorManager *executorManager = ExecutorManager::getInstance();
+ _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eCPU));
}
- });
-
- _taskExecutor->startTask(task);
- Task::Status sts = task->wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
-
- if (sts == Task::TS_ERROR) task->checkException();
+ _taskExecutor->startTask(tasks[0]);
+ Task::Status sts = tasks[0]->wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
+ }
+ for (auto t : tasks)
+ t->checkException();
}
void MKLDNNExecNetwork::setProperty(const std::map<std::string, std::string> &properties) {
- if (graph) // TODO: graph field cannot be empty
- graph->setProperty(properties);
+ for (auto g : graphs)
+ g->setProperty(properties);
}
void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) {
@@ -1362,13 +1119,10 @@ void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &
asyncRequestImpl->SetPointerToPublicInterface(asyncRequest);
- auto mkldnnSyncRequest = dynamic_cast<MKLDNNInferRequest *>(syncRequestImpl.get());
- if (!mkldnnSyncRequest)
- THROW_IE_EXCEPTION << " Cannot get mkldnn sync request.";
- mkldnnSyncRequest->SetGraph(graph);
-}
-
-MKLDNNExecNetwork::~MKLDNNExecNetwork() {
- graph.reset();
- extensionManager.reset();
+ if (graphs.size() == 1) { // single-stream (legacy/hetero) case - single graph for all requests
+ auto mkldnnSyncRequest = dynamic_cast<MKLDNNInferRequest *>(syncRequestImpl.get());
+ if (!mkldnnSyncRequest)
+ THROW_IE_EXCEPTION << " Cannot get mkldnn sync request.";
+ mkldnnSyncRequest->SetGraph(graphs[0]);
+ }
}
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h
index d1fdb0fe9..de026b5ad 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -11,6 +10,7 @@
#include <memory>
#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
+#include "ie_parallel.hpp"
#include "mkldnn_memory.h"
#include "config.h"
#include "perf_count.h"
@@ -19,6 +19,7 @@
#include "mkldnn_node.h"
#include "mkldnn_edge.h"
#include "mkldnn_extension_utils.h"
+#include "mkldnn_streams.h"
namespace MKLDNNPlugin {
@@ -48,7 +49,7 @@ public:
void getInputBlobs(InferenceEngine::BlobMap &in_map);
void getOutputBlobs(InferenceEngine::BlobMap &out_map);
- void CreateGraph(InferenceEngine::ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr);
+ void CreateGraph(const InferenceEngine::ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr);
bool hasMeanImageFor(const std::string& name) {
return _meanImages.find(name) != _meanImages.end();
@@ -81,6 +82,35 @@ public:
void RemoveDroppedEdges();
void DropNode(const MKLDNNNodePtr& node);
+ void CreateArena(int threads_per_stream) {
+ #if IE_THREAD == IE_THREAD_OMP
+ omp_set_num_threads(threads_per_stream);
+ #elif IE_THREAD == IE_THREAD_TBB
+ ptrArena = std::unique_ptr<tbb::task_arena>(new tbb::task_arena(threads_per_stream));
+ #endif
+ }
+
+ void CreateObserver(int _stream_id, int _threads_per_stream, int _pinning_step = 1) {
+ #if IE_THREAD == IE_THREAD_TBB
+ ptrObserver
+ = std::unique_ptr<tbb::task_scheduler_observer>(
+ new pinning_observer(*ptrArena.get(), _stream_id, _threads_per_stream, _pinning_step));
+ #else
+ cpu_set_t *process_mask = nullptr;
+ int ncpus = 0;
+ get_process_mask(ncpus, process_mask);
+ #if IE_THREAD == IE_THREAD_OMP
+ #pragma omp parallel for
+ for (int thread_index = 0; thread_index < _threads_per_stream; thread_index++) {
+ pin_thread_to_vacant_core(_stream_id * _threads_per_stream + thread_index, 1, ncpus, process_mask);
+ }
+ #elif IE_THREAD == IE_THREAD_SEQ
+ pin_thread_to_vacant_core(_stream_id * _threads_per_stream, 1, ncpus, process_mask);
+ #endif
+ CPU_FREE(process_mask);
+ #endif
+ }
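CreateObserver() delegates the actual core binding to the pin_thread_to_vacant_core()/get_process_mask() helpers from mkldnn_streams, which are not shown in this diff. For orientation only, thread pinning on Linux ultimately reduces to the standard affinity call sketched below; this is a generic illustration, not the helper's implementation:

    // Linux-only sketch; compile as C++ with g++ (or add -D_GNU_SOURCE).
    #include <pthread.h>
    #include <sched.h>

    // Pin the calling thread to one logical core; returns 0 on success.
    static int pin_current_thread_to_core(int core_id) {
        cpu_set_t set;
        CPU_ZERO(&set);
        CPU_SET(core_id, &set);
        return pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
    }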
+
protected:
MKLDNNNodePtr FindNodeWithName(const std::string& name) const;
void VisitNode(MKLDNNNodePtr node, std::vector<MKLDNNNodePtr>& sortedNodes);
@@ -108,6 +138,10 @@ protected:
std::map<std::string, MeanImage> _meanImages;
+ #if IE_THREAD == IE_THREAD_TBB
+ std::unique_ptr<tbb::task_arena> ptrArena;
+ std::unique_ptr<tbb::task_scheduler_observer> ptrObserver;
+ #endif
mkldnn::engine eng;
void InitNodes();
@@ -116,13 +150,15 @@ protected:
void AllocateWithReuse();
void CreatePrimitives();
- void BreakEdgeInsertScaleShift(MKLDNNPlugin::MKLDNNEdgePtr edgeToBreak,
- InferenceEngine::CNNLayerPtr ssCnnLayer);
- void AddScaleShiftBeforeAndAfterInt8(InferenceEngine::CNNNetwork& net);
+ void do_before(const std::string &dir, const MKLDNNNodePtr &node);
+ void do_after(const std::string &dir, const MKLDNNNodePtr &node);
friend class MKLDNNInferRequest;
+ friend class MKLDNNGraphlessInferRequest;
+ friend std::shared_ptr<InferenceEngine::ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph);
private:
+ void dumpToDotFile(std::string file) const;
struct ParsedLayer {
MKLDNNNodePtr parent;
InferenceEngine::CNNLayerPtr cnnLayer;
@@ -142,18 +178,21 @@ public:
void CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) override;
- MKLDNNExecNetwork(InferenceEngine::ICNNNetwork &network, const Config &cfg,
+ MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network, const Config &cfg,
const MKLDNNExtensionManager::Ptr& extMgr);
- ~MKLDNNExecNetwork() override;
+ ~MKLDNNExecNetwork() {
+ graphs.clear();
+ extensionManager.reset();
+ }
void setProperty(const std::map<std::string, std::string> &properties);
protected:
- MKLDNNGraph::Ptr graph;
+ std::vector<MKLDNNGraph::Ptr> graphs;
MKLDNNExtensionManager::Ptr extensionManager;
- bool CanProcessDynBatch(InferenceEngine::ICNNNetwork &network) const;
+ bool CanProcessDynBatch(const InferenceEngine::ICNNNetwork &network) const;
};
} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp
new file mode 100644
index 000000000..ae24579f6
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp
@@ -0,0 +1,207 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "mkldnn_graph_dumper.h"
+#include "cnn_network_impl.hpp"
+#include "ie_util_internal.hpp"
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <map>
+
+using namespace InferenceEngine;
+
+namespace MKLDNNPlugin {
+
+static void copy_node_metadata(const MKLDNNNodePtr &, CNNLayer::Ptr &);
+static void drawer_callback(const InferenceEngine::CNNLayerPtr, ordered_properties &, ordered_properties &);
+
+CNNLayer::Ptr convert_node(const MKLDNNNodePtr &node) {
+ CNNLayer::Ptr layer(new CNNLayer({"name", "type", Precision::FP32}));
+ copy_node_metadata(node, layer);
+
+ auto &cfg = node->getSelectedPrimitiveDescriptor()->getConfig();
+ layer->insData.resize(cfg.inConfs.size());
+ layer->outData.resize(cfg.outConfs.size());
+
+ return layer;
+}
+
+std::shared_ptr<ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph) {
+ auto net = std::make_shared<details::CNNNetworkImpl>();
+
+ net->setPrecision(Precision::FP32);
+ net->setName("internal_cpu_graph");
+ std::map<MKLDNNNodePtr, CNNLayerPtr> node2layer;
+
+ // Copy all nodes to network
+ for (auto &node : graph.graphNodes) {
+ auto layer = convert_node(node);
+ node2layer[node] = layer;
+ net->addLayer(layer);
+ }
+
+ // Copy all edges to network
+ for (auto &node : graph.graphNodes) {
+ auto pr = node2layer[node];
+ auto ch_edges = node->getChildEdges();
+
+ for (int i = 0; i < ch_edges.size(); i++) {
+ auto edge = node->getChildEdgeAt(i);
+ int out_port = edge->getInputNum();
+ int in_port = edge->getOutputNum();
+ auto ch_node = edge->getChild();
+ auto ch = node2layer[ch_node];
+
+ DataPtr data;
+ if (i < pr->outData.size()) {
+ std::string data_name = node->getName() + "_out" + std::to_string(i);
+ pr->outData[i] = std::make_shared<Data>(data_name, edge->getDesc());
+ data = pr->outData[i];
+ data->creatorLayer = pr;
+ } else {
+ data = pr->outData[0];
+ }
+
+ data->inputTo[ch->name] = ch;
+ ch->insData[in_port] = data;
+ }
+ }
+
+ // Specify input data
+ for (auto kvp : graph.inputNodes) {
+ auto in_node = kvp.second;
+ auto in_layer = node2layer[in_node];
+
+ auto in_info = std::make_shared<InputInfo>();
+ in_info->setInputData(in_layer->outData[0]);
+ net->setInputInfo(in_info);
+ }
+
+ return net;
+}
+
+void dump_graph_as_dot(const MKLDNNGraph &graph, std::ostream &out) {
+ auto dump_net = dump_graph_as_ie_net(graph);
+ InferenceEngine::saveGraphToDot(*dump_net, out, drawer_callback);
+}
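dump_graph_as_dot() is the public entry point of the new dumper; a possible way to call it from plugin-internal debugging code is sketched below (the wrapper function and output path are illustrative only). The resulting file can then be rendered with Graphviz, e.g. "dot -Tpng".

    #include <fstream>
    #include "mkldnn_graph_dumper.h"

    void dump_exec_graph(const MKLDNNPlugin::MKLDNNGraph &graph) {
        std::ofstream out("cpu_exec_graph.dot");      // example output path
        MKLDNNPlugin::dump_graph_as_dot(graph, out);  // serialize the internal exec graph
    }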
+
+//**********************************
+// Special converters of metadata
+//**********************************
+
+static std::map<Type, std::string> type_n2l {
+ {Unknown, "Unknown"},
+ {Generic, "Unknown"},
+ {Reorder, "Reorder"},
+ {Copy, "Reorder"},
+ {Input, "Input"},
+ {Output, "Output"},
+ {Convolution, "Conv"},
+ {Deconvolution, "Deconv"},
+ {Convolution_Sum, "Conv_Eltw"},
+ {Convolution_Activation, "Conv_Activ"},
+ {Convolution_Sum_Activation, "Conv_Eltw_Activ"},
+ {Activation, "Activation"},
+ {Depthwise, "Depthwise"},
+ {Lrn, "Lrn"},
+ {Pooling, "Pool"},
+ {FullyConnected, "FC"},
+ {SoftMax, "SoftMax"},
+ {Split, "Split"},
+ {Concatenation, "Concat"},
+ {Power, "Power"},
+ {Eltwise, "Eltwise"},
+ {Crop, "Crop"},
+ {Reshape, "Reshape"},
+ {Tile, "Tile"},
+ {SimplerNMS, "Proposal"},
+ {ROIPooling, "ROIPooling"},
+ {BatchNormalization, "BatchNorm"},
+ {Flatten, "Flatten"},
+ {Permute, "Permute"},
+ {MemoryOutput, "MemoryIn"},
+ {MemoryInput, "MemoryOut"}
+};
+
+static const std::string ORIGIN_NAMES = "origin";
+static const std::string IMPL_TYPE = "impl";
+static const std::string PRECISION = "prec";
+static const std::string PERF_COUNTER = "perf";
+
+static const std::string BLUE = "#D8D9F1";
+static const std::string GREEN = "#D9EAD3";
+
+void copy_node_metadata(const MKLDNNNodePtr &node, CNNLayer::Ptr &layer) {
+ layer->type = type_n2l[node->getType()];
+ layer->name = node->getName(); // used as a unique ID
+
+ if (node->getCnnLayer()) {
+ // Original layer names
+ std::vector<MKLDNNNodePtr> internal = node->getFusedWith();
+ auto &merged = node->getMergeWith();
+ internal.insert(internal.end(), merged.begin(), merged.end());
+
+ std::string orig_names = node->getCnnLayer()->name;
+ for (auto &sub_node : internal)
+ orig_names += " " + sub_node->getCnnLayer()->name;
+
+ layer->params[ORIGIN_NAMES] = orig_names;
+ }
+
+ // Implementation type name
+ layer->params[IMPL_TYPE] = node->getPrimitiveDescriptorType();
+
+ // Precision
+ // TODO: This mapping of implementation type to precision is not fully correct.
+ std::string precision = "FP32";
+ auto desc = node->getSelectedPrimitiveDescriptor();
+ if (desc == nullptr) {
+ THROW_IE_EXCEPTION << "Internal error - descriptor is empty";
+ }
+ impl_desc_type impl_type = desc->getImplementationType();
+
+ if (impl_type == gemm_blas &&
+ node->getParentEdgeAt(0)->getDesc().getPrecision() == Precision::U8) precision = "INT8";
+
+ if (impl_type & jit && impl_type & avx512 &&
+ node->getParentEdgeAt(0)->getDesc().getPrecision() == Precision::U8) precision = "INT8";
+
+ layer->params[PRECISION] = precision;
+
+ // Performance
+ if (node->PerfCounter().avg() != 0) {
+ layer->params[PERF_COUNTER] = std::to_string(node->PerfCounter().avg())+ " mcs";
+ }
+}
+
+void drawer_callback(const InferenceEngine::CNNLayerPtr layer,
+ ordered_properties &printed_properties,
+ ordered_properties &node_properties) {
+ const auto &params = layer->params;
+
+ // Implementation
+ auto impl = params.find(IMPL_TYPE);
+ if (impl != params.end()) {
+ printed_properties.push_back({"impl", impl->second});
+ }
+
+ // Original names
+ auto orig = params.find(ORIGIN_NAMES);
+ if (orig != params.end()) {
+ printed_properties.push_back({"originals", orig->second});
+ }
+
+ // Precision
+ auto prec = params.find(PRECISION);
+ if (prec != params.end()) {
+ printed_properties.push_back({"precision", prec->second});
+ }
+
+ // Set color (fall back to BLUE when the precision attribute is missing)
+ bool is_fp32 = prec != params.end() && prec->second == "FP32";
+ node_properties.push_back({"fillcolor", is_fp32 ? GREEN : BLUE});
+}
+
+} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h
new file mode 100644
index 000000000..6ec5ffc45
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h
@@ -0,0 +1,18 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_icnn_network.hpp"
+#include "mkldnn_graph.h"
+
+#include <memory>
+
+namespace MKLDNNPlugin {
+
+ void dump_graph_as_dot(const MKLDNNGraph &graph, std::ostream &out);
+
+ std::shared_ptr<InferenceEngine::ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph);
+
+} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
index 3be1fbf23..6c88ebd6f 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -144,20 +143,27 @@ void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) {
}
void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
+ auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
+ for (auto a : algs) {
+ if (alg == a) {
+ return true;
+ }
+ }
+ return false;
+ };
+
auto& graphNodes = graph.GetNodes();
- auto isFusingSupported = [&](MKLDNNNodePtr node) {
- if (!node->getCnnLayer())
+ auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
+ if (!activation->getCnnLayer())
return false;
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
+ auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get());
return activationNode &&
- (activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_relu ||
- activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_elu ||
- activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_logistic ||
- activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_bounded_relu ||
- activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_clamp);
+ (activationNode->getAlgorithm() == eltwise_relu ||
+ (conv->getCnnLayer()->precision == Precision::FP32 &&
+ isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp})));
};
for (int i = 0; i < graphNodes.size(); i++) {
@@ -172,13 +178,13 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
if (conv->getChildEdges().size() == 1) {
auto ch1 = conv->getChildEdgeAt(0)->getChild();
- if (isFusingSupported(ch1)) {
+ if (isFusingSupported(conv, ch1)) {
fuse(ch1);
if (ch1->getChildEdges().size() == 1) {
auto ch2 = ch1->getChildEdgeAt(0)->getChild();
- if (isFusingSupported(ch2)) {
+ if (isFusingSupported(conv, ch2)) {
fuse(ch2);
graph.DropNode(ch2);
}
@@ -193,7 +199,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
if (is_max_pool && pool->getChildEdges().size() == 1) {
auto ch2 = pool->getChildEdgeAt(0)->getChild();
- if (isFusingSupported(ch2)) {
+ if (isFusingSupported(conv, ch2)) {
fuse(ch2);
graph.DropNode(ch2);
}
@@ -274,8 +280,12 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
auto isSutableChildConvolution = [](MKLDNNNodePtr node) {
auto* layer = dynamic_cast<ConvolutionLayer*>(node->getCnnLayer().get());
- auto allPads = getConvPaddings(*layer);
+ auto allPads = getPaddings(*layer);
bool isSupportedParams = layer->_out_depth == layer->_group &&
+ layer->_out_depth != 1 &&
+ // Depthwise convolution output should be a multiple of 8
layer->_kernel[X_AXIS] == 3 && layer->_kernel[Y_AXIS] == 3 &&
allPads.begin[X_AXIS] == 1 && allPads.begin[Y_AXIS] == 1 &&
layer->_dilation[X_AXIS] == 1 && layer->_dilation[Y_AXIS] == 1 &&
@@ -379,18 +389,25 @@ static bool is_data_dependency(const std::shared_ptr<MKLDNNNode> &parent,
void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph) {
std::vector<MKLDNNNodePtr> &graphNodes = graph.GetNodes();
- auto isFusingSupported = [&](MKLDNNNodePtr node) {
- if (!node->getCnnLayer())
+ auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
+ for (auto a : algs) {
+ if (alg == a) {
+ return true;
+ }
+ }
+ return false;
+ };
+
+ auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
+ if (!activation->getCnnLayer())
return false;
- auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
+ auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get());
return activationNode &&
- (activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_relu ||
- activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_elu ||
- activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_logistic ||
- activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_bounded_relu ||
- activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_clamp);
+ (activationNode->getAlgorithm() == eltwise_relu ||
+ (conv->getCnnLayer()->precision == Precision::FP32 &&
+ isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp})));
};
for (auto &graphNode : graphNodes) {
@@ -411,6 +428,10 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
auto mergedConv = (parent1->getType() == Convolution) ? parent1 : parent2;
auto peerNode = (parent1->getType() == Convolution) ? parent2 : parent1;
+ if (peerNode->getType() == Convolution && mergedConv->getChildEdges().size() != 1) {
+ mergedConv = parent2;
+ peerNode = parent1;
+ }
auto sum = graphNode;
auto lastNode = sum;
@@ -431,7 +452,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
if (!fuse_allowed) continue;
if (graphNode->getChildEdges().size() == 1 &&
- isFusingSupported(graphNode->getChildEdgeAt(0)->getChild())) {
+ isFusingSupported(graphNode, graphNode->getChildEdgeAt(0)->getChild())) {
auto relu_shared = graphNode->getChildEdgeAt(0)->getChild();
lastNode = relu_shared;
mergedConv->setType(Convolution_Sum_Activation);
@@ -472,29 +493,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
}
}
-/**
- * Convert LSTM layer format with combined state blob
- */
-void MKLDNNGraphOptimizer::SLTMTransform(MKLDNNGraph& graph) {
- auto &all_nodes = graph.GetNodes();
-
- for (auto &lstm : all_nodes) {
- if (lstm->getType() != RNN)
- continue;
-
- auto layer = lstm->getCnnLayer();
- auto in_datas = layer->insData;
- auto out_datas = layer->outData;
-
- if (in_datas.size() == 3) {
- assert(lstm->getParentEdges().size() == 3);
- // Concatenate 2 states into one blob
- // TODO: TBD
- } else if ((in_datas.size() != 1)) {
- THROW_IE_EXCEPTION << "Unsupported mode for LSTM cell. Expected two state blobs";
- }
- }
-}
void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) {
for (MKLDNNNodePtr& node : graph.GetNodes()) {
@@ -520,8 +518,11 @@ void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) {
void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) {
std::set<MKLDNNNodePtr> processed;
+ std::vector<MKLDNNNodePtr> newNodes;
for (MKLDNNNodePtr& node : graph.GetNodes()) {
- if (processed.find(node) == processed.end() && node->getType() == Reorder && node->getChildEdgeAt(0)->getChild()->getType() == Reorder) {
+ if (processed.find(node) == processed.end() && node->getType() == Reorder
+ && node->getChildEdges().size() == 1
+ && node->getChildEdgeAt(0)->getChild()->getType() == Reorder ) {
auto nextNode = node->getChildEdgeAt(0)->getChild();
MKLDNNReorderNode* n = dynamic_cast<MKLDNNReorderNode*>(node.get());
MKLDNNReorderNode* nn = dynamic_cast<MKLDNNReorderNode*>(nextNode.get());
@@ -590,10 +591,13 @@ void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) {
afterNode->getDesc();
graph.GetEdges().push_back(afterNode);
- graph.GetNodes().push_back(newReorder);
+ newNodes.push_back(newReorder);
graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), edge), graph.GetEdges().end());
}
}
+ for (MKLDNNNodePtr& node : newNodes) {
+ graph.GetNodes().push_back(node);
+ }
}
void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
@@ -603,7 +607,7 @@ void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
auto cur = l->insData[0].lock();
if (cur == nullptr) {
- THROW_IE_EXCEPTION << "[MKLDNN] shared_ptr l->insData[0].lock() returned nullptr";
+ THROW_IE_EXCEPTION << "[MKLDNN] error - invalid input data";
}
if (cur->precision != l->outData[0]->precision) {
if (node->name.find("_iScaleShift_") != std::string::npos) {
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h
index d6fa323da..6818cc9ae 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp
index 338ed7274..95e803925 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp
@@ -1,10 +1,10 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
#include "mkldnn_infer_request.h"
#include "mkldnn_extension_utils.h"
+#include "mkldnn_streams.h"
#include <vector>
#include <string>
#include <map>
@@ -36,83 +36,97 @@ void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() {
if (!graph || !graph->IsReady()) {
THROW_IE_EXCEPTION << "Network not loaded.";
}
-
- // execute input pre-processing.
- execDataPreprocessing(_inputs);
-
- changeDefaultPtr();
- // need to retain converted blobs until infer finish
- std::vector<InferenceEngine::Blob::Ptr> convertedInputs;
- for (auto input : _inputs) {
- if (!_networkInputs[input.first]) {
- THROW_IE_EXCEPTION <<
- "input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name "
- << input.first;
- }
- /*if (_networkInputs[input.first]->getInputPrecision() != input.second->precision()) {
- THROW_IE_EXCEPTION << "Different input precision for input " << input.first
- << " registered in IInferencePlugin::LoadNetwork network and IInferencePlugin::Infer. "
- << _networkInputs[input.first]->getInputPrecision() << " vs "
- << input.second->precision();
- }*/
+ auto infer = [this] {
+ // execute input pre-processing.
+ execDataPreprocessing(_inputs);
+
+ changeDefaultPtr();
+ // need to retain converted blobs until infer finish
+ std::vector<InferenceEngine::Blob::Ptr> convertedInputs;
+ for (auto input : _inputs) {
+ if (!_networkInputs[input.first]) {
+ THROW_IE_EXCEPTION <<
+ "input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name "
+ << input.first;
+ }
+ /*if (_networkInputs[input.first]->getInputPrecision() != input.second->precision()) {
+ THROW_IE_EXCEPTION << "Different input precision for input " << input.first
+ << " registered in IInferencePlugin::LoadNetwork network and IInferencePlugin::Infer. "
+ << _networkInputs[input.first]->getInputPrecision() << " vs "
+ << input.second->precision();
+ }*/
- InferenceEngine::Blob::Ptr iconv;
- InferenceEngine::TBlob<float> *in_f = nullptr;
- switch (input.second->precision()) {
- case InferenceEngine::Precision::FP32:
- pushInput<float>(input.first, input.second);
- break;
- case InferenceEngine::Precision::U16:
- // U16 is unsupported by mkldnn, so here we convert the blob and send FP32
- iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
- InferenceEngine::Precision::FP32,
- input.second->getTensorDesc().getLayout(), input.second->dims());
- convertedInputs.push_back(iconv);
- iconv->allocate();
- in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
- InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get());
- pushInput<float>(input.first, iconv);
- break;
- case InferenceEngine::Precision::I16:
- if (graph->hasMeanImageFor(input.first)) {
- // If a mean image exists, we convert the blob and send FP32
- iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
- InferenceEngine::Precision::FP32,
- input.second->getTensorDesc().getLayout(), input.second->dims());
- convertedInputs.push_back(iconv);
- iconv->allocate();
- in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
- InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get());
- pushInput<float>(input.first, iconv);
- } else {
- // Instead we can send I16 directly
- pushInput<int16_t>(input.first, input.second);
- }
- break;
- case InferenceEngine::Precision::U8:
- if (graph->hasMeanImageFor(input.first)) {
- // If a mean image exists, we convert the blob and send FP32
+ InferenceEngine::Blob::Ptr iconv;
+ InferenceEngine::TBlob<float> *in_f = nullptr;
+ switch (input.second->precision()) {
+ case InferenceEngine::Precision::FP32:
+ pushInput<float>(input.first, input.second);
+ break;
+ case InferenceEngine::Precision::I32:
+ pushInput<int32_t>(input.first, input.second);
+ break;
+ case InferenceEngine::Precision::I8:
+ pushInput<int8_t>(input.first, input.second);
+ break;
+ case InferenceEngine::Precision::U16:
+ // U16 is unsupported by mkldnn, so here we convert the blob and send FP32
iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
InferenceEngine::Precision::FP32,
input.second->getTensorDesc().getLayout(), input.second->dims());
convertedInputs.push_back(iconv);
iconv->allocate();
in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
- InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get());
+ InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get());
pushInput<float>(input.first, iconv);
- } else {
- // Instead we can send I8 directly
- pushInput<uint8_t>(input.first, input.second);
- }
- break;
- default:
- THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision();
+ break;
+ case InferenceEngine::Precision::I16:
+ if (graph->hasMeanImageFor(input.first)) {
+ // If a mean image exists, we convert the blob and send FP32
+ iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
+ InferenceEngine::Precision::FP32,
+ input.second->getTensorDesc().getLayout(), input.second->dims());
+ convertedInputs.push_back(iconv);
+ iconv->allocate();
+ in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+ InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get());
+ pushInput<float>(input.first, iconv);
+ } else {
+ // Instead we can send I16 directly
+ pushInput<int16_t>(input.first, input.second);
+ }
+ break;
+ case InferenceEngine::Precision::U8:
+ if (graph->hasMeanImageFor(input.first)) {
+ // If a mean image exists, we convert the blob and send FP32
+ iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
+ InferenceEngine::Precision::FP32,
+ input.second->getTensorDesc().getLayout(), input.second->dims());
+ convertedInputs.push_back(iconv);
+ iconv->allocate();
+ in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+ InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get());
+ pushInput<float>(input.first, iconv);
+ } else {
+ // Instead we can send I8 directly
+ pushInput<uint8_t>(input.first, input.second);
+ }
+ break;
+ default:
+ THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision();
+ }
}
- }
- graph->Infer(m_curBatch);
- graph->PullOutputData(_outputs);
+ graph->Infer(m_curBatch);
+ graph->PullOutputData(_outputs);
+ };
+#if IE_THREAD == IE_THREAD_TBB
+ auto_scope_observing observer(graph->ptrObserver);
+ // the Infer call is executed inside the graph's TBB arena by running the lambda via arena->execute()
+ graph->ptrArena->execute([&] { infer(); });
+#else
+ infer();
+#endif
}
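Wrapping the whole inference into graph->ptrArena->execute(...) is what keeps nested parallel work on the stream's own TBB threads. A minimal generic illustration of this arena-scoping pattern follows (thread count and workload are arbitrary, unrelated to the plugin's configuration):

    #include <tbb/task_arena.h>
    #include <tbb/parallel_for.h>

    void run_in_arena() {
        tbb::task_arena arena(4);   // confine work to at most 4 worker threads
        arena.execute([] {
            // parallel work issued here is served only by the arena's threads
            tbb::parallel_for(0, 100, [](int) { /* per-item work */ });
        });
    }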
void MKLDNNPlugin::MKLDNNInferRequest::GetPerformanceCounts(
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h
index 313c4e045..6d88bc8d2 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp
index ebbd86422..1821b88f8 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -169,8 +168,22 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) {
case f::OhIw16o4i:
case f::OIhw4i16o4i:
ndims = 4; break;
- case f::goihw:
+ // DHW
+ case f::ncdhw:
+ case f::ndhwc:
+ case f::nCdhw8c:
+ case f::nCdhw16c:
+ case f::oidhw:
+ case f::OIdhw8i8o:
+ case f::OIdhw16i16o:
+ case f::OIdhw8o8i:
+ case f::OIdhw16o16i:
+ case f::OIdhw8i16o2i:
+ case f::Odhwi8o:
+ case f::Odhwi16o:
+ // Group HW
case f::hwigo:
+ case f::goihw:
case f::gOIhw8i8o:
case f::gOIhw16i16o:
case f::gOIhw8i16o2i:
@@ -183,6 +196,15 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) {
case f::Goihw8g:
case f::Goihw16g:
ndims = 5; break;
+ case f::goidhw:
+ case f::gOIdhw8i8o:
+ case f::gOIdhw16i16o:
+ case f::gOIdhw8i16o2i:
+ case f::gOdhwi8o:
+ case f::gOdhwi16o:
+ case f::gOIdhw8o8i:
+ case f::gOIdhw16o16i:
+ ndims = 6; break;
case f::format_undef:
ndims = 0; break;
case f::any:
@@ -197,8 +219,8 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) {
}
bool MKLDNNMemory::IsPlainFormat(memory::format format) {
- std::vector<memory::format> plains = {memory::nc, memory::nchw, memory::nhwc, memory::chwn,
- memory::oi, memory::io, memory::oihw, memory::ihwo,
+ std::vector<memory::format> plains = {memory::nc, memory::nchw, memory::ncdhw, memory::nhwc, memory::ndhwc, memory::chwn,
+ memory::oi, memory::io, memory::oihw, memory::oidhw, memory::ihwo,
memory::goihw,
memory::blocked};
@@ -217,13 +239,28 @@ memory::format MKLDNNMemory::GetPlainFormat(memory::dims dims) {
return memory::x;
case 2:
return memory::nc;
+ case 3:
+ return memory::tnc;
case 4:
return memory::nchw;
+ case 5:
+ return memory::ncdhw;
default:
return memory::blocked;
}
}
+InferenceEngine::Layout MKLDNNMemory::GetPlainLayout(memory::dims dims) {
+ switch (dims.size()) {
+ case 1: return Layout::C;
+ case 2: return Layout::NC;
+ case 3: return Layout::CHW;
+ case 4: return Layout::NCHW;
+ default:
+ return Layout::BLOCKED;
+ }
+}
+
void MKLDNNMemory::CreateBlockingDesc(memory::desc &desc) {
auto dims = desc.data.dims;
int ndims = desc.data.ndims;
@@ -262,6 +299,10 @@ memory::format MKLDNNMemory::Convert(const InferenceEngine::Layout layout) {
return memory::nchw;
case NHWC:
return memory::nhwc;
+ case NCDHW:
+ return memory::ncdhw;
+ case NDHWC:
+ return memory::ndhwc;
case CHW:
return memory::tnc;
case NC:
@@ -294,6 +335,11 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) {
case memory::nChw8c: return "nChw8c";
case memory::nChw16c: return "nChw16c";
+ case memory::ncdhw: return "ncdhw";
+ case memory::ndhwc: return "ndhwc";
+ case memory::nCdhw8c: return "nCdhw8c";
+ case memory::nCdhw16c: return "nCdhw16c";
+
case memory::oihw: return "oihw";
case memory::ihwo: return "ihwo";
case memory::OIhw8i8o: return "OIhw8i8o";
@@ -306,8 +352,18 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) {
case memory::Ohwi16o: return "Ohwi16o";
case memory::OhIw16o4i: return "OhIw16o4i";
+ case memory::oidhw: return "oidhw";
+ case memory::OIdhw8i8o: return "OIdhw8i8o";
+ case memory::OIdhw16i16o: return "OIdhw16i16o";
+ case memory::OIdhw8o8i: return "OIdhw8o8i";
+ case memory::OIdhw16o16i: return "OIdhw16o16i";
+ case memory::OIdhw8i16o2i: return "OIdhw8i16o2i";
+ case memory::Odhwi8o: return "Odhwi8o";
+ case memory::Odhwi16o: return "Odhwi16o";
+
case memory::goihw: return "goihw";
case memory::hwigo: return "hwigo";
+ case memory::hwio: return "hwio";
case memory::gOIhw8i8o: return "gOIhw8i8o";
case memory::gOIhw16i16o: return "gOIhw16i16o";
case memory::gOIhw8i16o2i: return "gOIhw8i16o2i";
@@ -317,6 +373,16 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) {
case memory::gOIhw8o8i: return "gOIhw8o8i";
case memory::gOIhw16o16i: return "gOIhw16o16i";
case memory::gOhIw16o4i: return "gOhIw16o4i";
+
+ case memory::goidhw: return "goidhw";
+ case memory::gOIdhw8i8o: return "gOIdhw8i8o";
+ case memory::gOIdhw16i16o: return "gOIdhw16i16o";
+ case memory::gOIdhw8i16o2i: return "gOIdhw8i16o2i";
+ case memory::gOdhwi8o: return "gOdhwi8o";
+ case memory::gOdhwi16o: return "gOdhwi16o";
+ case memory::gOIdhw8o8i: return "gOIdhw8o8i";
+ case memory::gOIdhw16o16i: return "gOIdhw16o16i";
+
default: {
THROW_IE_EXCEPTION << "Unknown data format.";
}
@@ -400,66 +466,96 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const {
auto blkInfo = desc.data.layout_desc.blocking;
auto offset = static_cast<size_t>(blkInfo.offset_padding);
SizeVector offsetsForDims;
+ SizeVector dims = getDims().ToSizeVector();
switch (getFormat()) {
case memory::format_undef:
THROW_IE_EXCEPTION << "Cannot cast to tensor desc. Format is undefined!";
case memory::any:
layout = Layout::ANY;
- return TensorDesc(precision, getDims().ToSizeVector(), layout);
+ return TensorDesc(precision, dims, layout);
case memory::x:
layout = Layout::C;
order = {0};
- blkDims = getDims().ToSizeVector();
+ blkDims = dims;
break;
case memory::oi:
case memory::nc:
layout = Layout::NC;
order = {0, 1};
- blkDims = getDims().ToSizeVector();
+ blkDims = dims;
break;
case memory::tnc:
layout = Layout::CHW;
order = {0, 1, 2};
- blkDims = getDims().ToSizeVector();
+ blkDims = dims;
break;
case memory::ntc:
layout = Layout::CHW;
order = {1, 0, 2};
- blkDims = {static_cast<size_t>(getDims()[1]),
- static_cast<size_t>(getDims()[0]),
- static_cast<size_t>(getDims()[2])};
+ blkDims = {static_cast<size_t>(dims[1]),
+ static_cast<size_t>(dims[0]),
+ static_cast<size_t>(dims[2])};
break;
case memory::oihw:
case memory::nchw:
layout = Layout::NCHW;
order = {0, 1, 2, 3};
- blkDims = getDims().ToSizeVector();
+ blkDims = dims;
+ break;
+ case memory::ncdhw:
+ layout = Layout::NCDHW;
+ order = {0, 1, 2, 3, 4};
+ blkDims = dims;
break;
case memory::nhwc:
layout = Layout::NHWC;
order = {0, 2, 3, 1};
- blkDims = {static_cast<size_t>(getDims()[0]),
- static_cast<size_t>(getDims()[2]),
- static_cast<size_t>(getDims()[3]),
- static_cast<size_t>(getDims()[1])};
+ blkDims = {static_cast<size_t>(dims[0]),
+ static_cast<size_t>(dims[2]),
+ static_cast<size_t>(dims[3]),
+ static_cast<size_t>(dims[1])};
break;
+ case memory::ndhwc:
+ layout = Layout::NDHWC;
+ order = {0, 2, 3, 4, 1};
+ blkDims = {static_cast<size_t>(dims[0]),
+ static_cast<size_t>(dims[2]),
+ static_cast<size_t>(dims[3]),
+ static_cast<size_t>(dims[4]),
+ static_cast<size_t>(dims[1])};
+ break;
+ case memory::oIhw8i:
case memory::nChw8c:
order = {0, 1, 2, 3, 1};
- blkDims = getDims().ToSizeVector();
+ blkDims = dims;
+ blkDims[1] = blkDims[1] / 8 + (blkDims[1] % 8 ? 1 : 0);
+ blkDims.push_back(8);
+ layout = Layout::BLOCKED;
+ break;
+ case memory::nCdhw8c:
+ order = {0, 1, 2, 3, 4, 1};
+ blkDims = dims;
blkDims[1] = blkDims[1] / 8 + (blkDims[1] % 8 ? 1 : 0);
blkDims.push_back(8);
layout = Layout::BLOCKED;
break;
case memory::nChw16c:
order = {0, 1, 2, 3, 1};
- blkDims = getDims().ToSizeVector();
+ blkDims = dims;
+ blkDims[1] = blkDims[1] / 16 + (blkDims[1] % 16 ? 1 : 0);
+ blkDims.push_back(16);
+ layout = Layout::BLOCKED;
+ break;
+ case memory::nCdhw16c:
+ order = {0, 1, 2, 3, 4, 1};
+ blkDims = dims;
blkDims[1] = blkDims[1] / 16 + (blkDims[1] % 16 ? 1 : 0);
blkDims.push_back(16);
layout = Layout::BLOCKED;
break;
case memory::blocked:
order.clear();
- blkDims = getDims().ToSizeVector();
+ blkDims = dims;
for (size_t i = 0; i < blkDims.size(); i++) {
order.push_back(i);
if ((i && blkInfo.strides[0][i - 1] < blkInfo.strides[0][i]) || blkInfo.block_dims[i] != 1) {
@@ -478,14 +574,14 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const {
SizeVector strides(blkDims.size());
- if (layout == Layout::NHWC || layout == Layout::CHW) {
+ if (layout == Layout::NHWC || layout == Layout::NDHWC || layout == Layout::CHW) {
for (size_t i = 0; i < order.size(); i++) {
strides[i] = static_cast<size_t>(blkInfo.strides[0][order[i]]);
}
} else {
strides[blkDims.size() - 1] = 1;
for (size_t i = 2; i <= order.size(); i++) {
- if (blkDims.size() - i < getDims().ndims()) {
+ if (blkDims.size() - i < dims.size()) {
strides[blkDims.size() - i] = static_cast<size_t>(blkInfo.strides[0][order[blkDims.size() - i]]);
} else {
strides[blkDims.size() - i] = strides[blkDims.size() - i + 1] * blkDims[blkDims.size() - i + 1];
@@ -494,13 +590,13 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const {
}
for (size_t i = 0; i < blkDims.size() && i < TENSOR_MAX_DIMS; i++) {
- if (i < getDims().ndims())
+ if (i < dims.size())
offsetsForDims.push_back(blkInfo.offset_padding_to_data[i]);
else
offsetsForDims.push_back(0);
}
- TensorDesc tensorDesc(precision, getDims().ToSizeVector(), {blkDims, order, offset, offsetsForDims, strides});
+ TensorDesc tensorDesc(precision, dims, {blkDims, order, offset, offsetsForDims, strides});
tensorDesc.setLayout(layout);
return tensorDesc;
@@ -543,9 +639,15 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc):
case NCHW:
mkldnnFormat = memory::format::nchw;
break;
+ case NCDHW:
+ mkldnnFormat = memory::format::ncdhw;
+ break;
case NHWC:
mkldnnFormat = memory::format::nhwc;
break;
+ case NDHWC:
+ mkldnnFormat = memory::format::ndhwc;
+ break;
case OIHW:
mkldnnFormat = memory::format::oihw;
break;
@@ -553,6 +655,11 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc):
mkldnnFormat = memory::format::x;
break;
case CHW:
+ if (order == SizeVector{0, 1, 2})
+ mkldnnFormat = memory::format::tnc;
+ else if (order == SizeVector{1, 0, 2})
+ mkldnnFormat = memory::format::ntc;
+ else
mkldnnFormat = memory::format::blocked;
break;
case HW:
@@ -560,32 +667,41 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc):
mkldnnFormat = memory::format::nc;
break;
case BLOCKED:
+ mkldnnFormat = memory::format::blocked;
if (realDims.ndims() == 1) {
mkldnnFormat = memory::format::x;
- break;
} else if (realDims.ndims() == 2) {
mkldnnFormat = memory::format::nc;
- break;
} else if (realDims.ndims() == 4) {
if (order.size() == 5 && order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 1) {
if (blkdDims[4] == 8) {
mkldnnFormat = memory::format::nChw8c;
- break;
} else if (blkdDims[4] == 16) {
mkldnnFormat = memory::format::nChw16c;
- break;
}
} else if (order.size() == 4) {
if (order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3) {
mkldnnFormat = memory::format::nchw;
- break;
} else if (order[0] == 0 && order[1] == 2 && order[2] == 3 && order[3] == 1) {
mkldnnFormat = memory::format::nhwc;
- break;
+ }
+ }
+ } else if (realDims.ndims() == 5) {
+ if (order.size() == 6 &&
+ order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 4 && order[5] == 1) {
+ if (blkdDims[5] == 8) {
+ mkldnnFormat = memory::format::nCdhw8c;
+ } else if (blkdDims[5] == 16) {
+ mkldnnFormat = memory::format::nCdhw16c;
+ }
+ } else if (order.size() == 5) {
+ if (order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 4) {
+ mkldnnFormat = memory::format::ncdhw;
+ } else if (order[0] == 0 && order[1] == 2 && order[2] == 3 && order[3] == 4 && order[4] == 1) {
+ mkldnnFormat = memory::format::ndhwc;
}
}
}
- mkldnnFormat = memory::format::blocked;
break;
case CN:
mkldnnFormat = memory::format::blocked;
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_memory.h b/inference-engine/src/mkldnn_plugin/mkldnn_memory.h
index a5329ee5f..37578e5ff 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_memory.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_memory.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -108,6 +107,7 @@ public:
static bool IsPlainFormat(mkldnn::memory::format format);
static mkldnn::memory::format GetPlainFormat(mkldnn::memory::dims dims);
+ static InferenceEngine::Layout GetPlainLayout(mkldnn::memory::dims dims);
static bool isConsistant(mkldnn::memory::dims dims, mkldnn::memory::format format);
static mkldnn::memory::format Convert(const InferenceEngine::Layout layout);
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
index 7bda59d0d..73975b71e 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -10,6 +9,8 @@
#include <vector>
#include <string>
#include <limits>
+#include <cstdint>
+#include <unordered_map>
#include <nodes/mkldnn_batchnorm_node.h>
#include <nodes/mkldnn_concat_node.h>
@@ -17,6 +18,7 @@
#include <nodes/mkldnn_crop_node.h>
#include <nodes/mkldnn_deconv_node.h>
#include <nodes/mkldnn_eltwise_node.h>
+#include <nodes/mkldnn_gemm_node.h>
#include <nodes/mkldnn_fullyconnected_node.h>
#include <nodes/mkldnn_generic_node.h>
#include <nodes/mkldnn_input_node.h>
@@ -35,8 +37,9 @@
#include <nodes/mkldnn_memory_node.hpp>
#include <nodes/mkldnn_rnn.h>
#include <mkldnn_types.h>
-
#include "mkldnn_extension_utils.h"
+#include "mkldnn_plugin.h"
+#include "ie_memcpy.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
@@ -52,6 +55,7 @@ MKLDNNNode::Register<MKLDNNConvolutionNode> MKLDNNConvolutionNode::reg;
MKLDNNNode::Register<MKLDNNCropNode> MKLDNNCropNode::reg;
MKLDNNNode::Register<MKLDNNDeconvolutionNode> MKLDNNDeconvolutionNode::reg;
MKLDNNNode::Register<MKLDNNEltwiseNode> MKLDNNEltwiseNode::reg;
+MKLDNNNode::Register<MKLDNNGemmNode> MKLDNNGemmNode::reg;
MKLDNNNode::Register<MKLDNNFullyConnectedNode> MKLDNNFullyConnectedNode::reg;
MKLDNNNode::Register<MKLDNNInputNode> MKLDNNInputNode::reg;
MKLDNNNode::Register<MKLDNNLrnNode> MKLDNNLrnNode::reg;
@@ -358,6 +362,8 @@ std::vector<memory::format> MKLDNNNode::getAvailableFormatsForDims(const MKLDNND
return {memory::format::nc};
else if (dims.ndims() == 4)
return {memory::format::nchw, memory::format::nChw8c, memory::format::nChw16c};
+ else if (dims.ndims() == 5)
+ return {memory::format::ncdhw, memory::format::nCdhw8c, memory::format::nCdhw16c};
return {memory::format::any};
}
@@ -506,7 +512,7 @@ InferenceEngine::Blob::Ptr MKLDNNNode::createInternalBlob(InferenceEngine::SizeV
size_t offset = blb->byteSize();
checkSize(intBuffSize, offset);
- memcpy(data, blb->buffer(), blb->byteSize());
+ ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), blb->byteSize());
data += blb->byteSize();
for (const auto &merged : getMergeWith()) {
wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(merged->getCnnLayer().get());
@@ -519,7 +525,7 @@ InferenceEngine::Blob::Ptr MKLDNNNode::createInternalBlob(InferenceEngine::SizeV
THROW_IE_EXCEPTION << "Cannot get internal blob layer for node " << getName() << ".";
offset += blb->byteSize();
checkSize(intBuffSize, offset);
- memcpy(data, blb->buffer(), blb->byteSize());
+ ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), blb->byteSize());
data += blb->byteSize();
}
@@ -545,13 +551,32 @@ void MKLDNNNode::prepareMemory(const PrimitiveDescInfo *selected_pd, mkldnn::pri
internalBlobMemory.clear();
for (size_t i = 0; i < internalBlobs.size(); i++) {
- auto& internalBlob = internalBlobs[i];
- internalBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(engine)));
-
- internalBlobMemory[i]->Create(intDescs[i]);
- MKLDNNMemory memory(engine);
- memory.Create(MKLDNNMemoryDesc(internalBlob->getTensorDesc()), internalBlob->buffer());
- internalBlobMemory[i]->SetData(memory);
+ const auto &internalBlob = internalBlobs[i];
+
+ const uint64_t data_hash = Engine::GetWeightsSharing().GetHashFunc().hash(internalBlob->buffer(), internalBlob->byteSize());
+ const std::string string_hash = name + "_" + std::to_string(i)
+ + "_" + std::to_string(internalBlob->byteSize())
+ + "_" + std::to_string(data_hash);
+ MKLDNNMemoryPtr ptr =
+ Engine::GetWeightsSharing().findOrCreate(string_hash, [&] () {
+ MKLDNNMemoryPtr _ptr = MKLDNNMemoryPtr(new MKLDNNMemory(engine));
+ _ptr->Create(intDescs[i]);
+ MKLDNNMemory memory(engine);
+
+ auto newDesc = MKLDNNMemoryDesc(internalBlob->getTensorDesc());
+ auto newFormat = newDesc.getFormat();
+ if (newFormat == mkldnn::memory::ncdhw) {
+ newFormat = mkldnn::memory::goihw;
+ }
+ if (newFormat == mkldnn::memory::nchw) {
+ newFormat = mkldnn::memory::oihw;
+ }
+ memory.Create(MKLDNNMemoryDesc(newDesc.getDims(), newDesc.getDataType(), newFormat), internalBlob->buffer());
+ auto aformat = memory.GetFormat();
+ _ptr->SetData(memory);
+ return _ptr;
+ });
+ internalBlobMemory.push_back(ptr);
}
}
@@ -648,6 +673,8 @@ std::string MKLDNNNode::typeToStr(Type type) {
return "Pooling";
case FullyConnected:
return "FullyConnected";
+ case Gemm:
+ return "Gemm";
case SoftMax:
return "SoftMax";
case Split:
@@ -682,6 +709,9 @@ std::string MKLDNNNode::typeToStr(Type type) {
return "MemoryInput";
case RNN:
return "RNN";
+ case LSTMCell:
+ return "LSTMCell";
+
default:
return "Unknown";
}
@@ -838,17 +868,18 @@ InferenceEngine::TensorDesc MKLDNNNode::getConfiguredOutputDesc(const InferenceE
void MKLDNNNode::initOptimalPrimitiveDescriptor() {
auto config = getSelectedPrimitiveDescriptor()->getConfig();
- if (isInitConfig(config))
- return;
-
- for (size_t i = 0; i < config.inConfs.size(); i++) {
- config.inConfs[i].desc = getConfiguredInputDesc(config, i);
- }
+ if (!isInitConfig(config)) {
+ for (size_t i = 0; i < config.inConfs.size(); i++) {
+ config.inConfs[i].desc = getConfiguredInputDesc(config, i);
+ }
- for (size_t i = 0; i < config.outConfs.size(); i++) {
- config.outConfs[i].desc = getConfiguredOutputDesc(config, i);
+ for (size_t i = 0; i < config.outConfs.size(); i++) {
+ config.outConfs[i].desc = getConfiguredOutputDesc(config, i);
+ }
+ initDescriptor(config);
+ } else if (getType() != RNN && getType() != LSTMCell) {
+ initDescriptor(config);
}
- initDescriptor(config);
}
bool MKLDNNNode::isInitConfig(const InferenceEngine::LayerConfig& config) const {
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h
index acfe8e167..fe71c665f 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -49,6 +48,7 @@ enum Type {
Concatenation,
Power,
Eltwise,
+ Gemm,
Crop,
Reshape,
Tile,
@@ -60,6 +60,7 @@ enum Type {
Copy,
MemoryOutput,
MemoryInput,
+ LSTMCell,
RNN
};
@@ -86,6 +87,7 @@ static Type TypeFromName(const std::string type) {
{ "Pooling", Pooling },
{ "FullyConnected", FullyConnected },
{ "InnerProduct", FullyConnected },
+ { "Gemm", Gemm },
{ "Softmax", SoftMax },
{ "SoftMax", SoftMax },
{ "Split", Split },
@@ -103,6 +105,7 @@ static Type TypeFromName(const std::string type) {
{ "Flatten", Flatten },
{ "Permute", Permute },
{ "Copy", Copy },
+ { "LSTMCell", LSTMCell },
{ "RNN", RNN },
{ "MemoryInput", MemoryInput}, // for construction from name ctor, arbitrary name is used
{ "Memory", MemoryOutput }, // for construction from layer ctor
@@ -191,6 +194,10 @@ public:
return mergedWith;
}
+ const std::vector <MKLDNNNodePtr> &getFusedWith() {
+ return fusedWith;
+ }
+
const std::string getName() const {
return name;
}
@@ -317,7 +324,7 @@ protected:
this->type = type;
}
- int getMaxBatch();
+ virtual int getMaxBatch();
virtual InferenceEngine::TensorDesc getConfiguredInputDesc(const InferenceEngine::LayerConfig& config, size_t idx) const;
virtual InferenceEngine::TensorDesc getConfiguredOutputDesc(const InferenceEngine::LayerConfig& config, size_t idx) const;
@@ -350,6 +357,8 @@ protected:
MKLDNNPrimitive prim;
std::vector<MKLDNNDescriptor> descs;
+ InferenceEngine::Blob::Ptr ext_scales;
+
friend class MKLDNNEdge;
friend class MKLDNNGraph;
friend class MKLDNNGraphOptimizer;
@@ -371,8 +380,9 @@ protected:
public:
Register() {
Registry::RegisterNode(
- Registry::CreatorByLayerFunction([](const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) -> MKLDNNNode * {
- return new To(layer, eng); } ) );
+ Registry::CreatorByLayerFunction(
+ [](const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng)
+ -> MKLDNNNode* { return new To(layer, eng); } ) );
}
};
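
The prepareMemory() change earlier in this file keys every internal weights blob by node name, blob index, byte size and a content hash, so that several graph copies (one per stream) can point at the same physical memory. Below is a minimal, self-contained sketch of that lookup pattern; Buffer, WeightsCache and the std::hash-based content hash are illustrative stand-ins for this note only, not the plugin's actual types.

    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <memory>
    #include <mutex>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct Buffer { std::vector<float> data; };   // stand-in for MKLDNNMemory

    class WeightsCache {                          // same idea as MKLDNNWeightsSharing
    public:
        std::shared_ptr<Buffer> findOrCreate(const std::string& key,
                                             std::function<std::shared_ptr<Buffer>()> create) {
            std::lock_guard<std::mutex> lock(guard);
            auto found = cache.find(key);
            std::shared_ptr<Buffer> ptr;
            if (found == cache.end() || !(ptr = found->second.lock())) {
                ptr = create();                   // miss (or expired entry): allocate once
                cache[key] = ptr;
            }
            return ptr;                           // hit: the same memory is reused
        }
    private:
        std::unordered_map<std::string, std::weak_ptr<Buffer>> cache;
        std::mutex guard;
    };

    int main() {
        WeightsCache cache;
        std::vector<float> weights(16, 1.0f);     // pretend these are layer weights
        const size_t byteSize = weights.size() * sizeof(float);
        // Placeholder for the plugin's CRC-64 (SimpleDataHash) over the raw bytes.
        const uint64_t dataHash = std::hash<std::string>{}(
            std::string(reinterpret_cast<const char*>(weights.data()), byteSize));

        // Key layout mirrors prepareMemory: name + "_" + index + "_" + size + "_" + hash.
        const std::string key = std::string("conv1") + "_" + std::to_string(0)
                              + "_" + std::to_string(byteSize)
                              + "_" + std::to_string(dataHash);

        auto a = cache.findOrCreate(key, [&] { return std::make_shared<Buffer>(Buffer{weights}); });
        auto b = cache.findOrCreate(key, [&] { return std::make_shared<Buffer>(Buffer{weights}); });
        std::printf("second lookup reuses the blob: %s\n", a == b ? "yes" : "no");
        return 0;
    }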
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp
index 3b51c974f..35a965afa 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -11,6 +10,9 @@
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
+MKLDNNWeightsSharing Engine::weightsSharing;
+const SimpleDataHash MKLDNNWeightsSharing::simpleCRC;
+
InferenceEngine::ExecutableNetworkInternal::Ptr
Engine::LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, const std::map<std::string, std::string> &config) {
auto specifiedDevice = network.getTargetDevice();
@@ -25,8 +27,12 @@ Engine::LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, const std::map
network.getInputsInfo(_networkInputs);
for (auto ii : _networkInputs) {
auto input_precision = ii.second->getInputPrecision();
- if (input_precision != InferenceEngine::Precision::U16 && input_precision != InferenceEngine::Precision::I16
- && input_precision != InferenceEngine::Precision::FP32 && input_precision != InferenceEngine::Precision::U8) {
+ if (input_precision != InferenceEngine::Precision::FP32 &&
+ input_precision != InferenceEngine::Precision::I32 &&
+ input_precision != InferenceEngine::Precision::U16 &&
+ input_precision != InferenceEngine::Precision::I16 &&
+ input_precision != InferenceEngine::Precision::I8 &&
+ input_precision != InferenceEngine::Precision::U8) {
THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str
<< "Input image format " << input_precision << " is not supported yet...";
}
@@ -86,7 +92,7 @@ void Engine::QueryNetwork(const ICNNNetwork& network, const std::map<std::string
INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(IInferencePlugin*& plugin, ResponseDesc *resp) noexcept {
try {
plugin = make_ie_compatible_plugin(
- {{1, 4},
+ {{1, 5},
#ifdef MKL_VERSION
MKL_VERSION,
#else
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h
index 482405a16..383feaa21 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -8,11 +7,59 @@
#include "mkldnn_graph.h"
#include <string>
#include <map>
+#include <unordered_map>
#include <memory>
+#include <functional>
#include <cpp_interfaces/impl/ie_plugin_internal.hpp>
namespace MKLDNNPlugin {
+class SimpleDataHash {
+public:
+ SimpleDataHash() {
+ for (int i = 0; i < kTableSize; i++) {
+ uint64_t c = i;
+ for (int j = 0; j < 8; j++)
+ c = ((c & 1) ? 0xc96c5795d7870f42 : 0) ^ (c >> 1);
+ table[i] = c;
+ }
+ }
+ // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
+ uint64_t hash(const unsigned char* data, size_t size) const {
+ uint64_t crc = 0;
+ for (size_t idx = 0; idx < size; idx++)
+ crc = table[(unsigned char)crc ^ data[idx]] ^ (crc >> 8);
+
+ return ~crc;
+ }
+
+protected:
+ static const int kTableSize = 256;
+ uint64_t table[kTableSize];
+};
+
+class MKLDNNWeightsSharing {
+public:
+ MKLDNNMemoryPtr findOrCreate(const std::string& name_hash,
+ std::function<MKLDNNMemoryPtr(void)> create) {
+ std::unique_lock<std::mutex> lock(guard);
+ auto found = sharedWeights.find(name_hash);
+
+ MKLDNNMemoryPtr ptr;
+ if (found == sharedWeights.end() || !(ptr = found->second.lock())) {
+ ptr = create();
+ sharedWeights[name_hash] = ptr;
+ }
+ return ptr;
+ }
+ static const SimpleDataHash& GetHashFunc () { return simpleCRC; }
+
+protected:
+ std::unordered_map<std::string, std::weak_ptr<MKLDNNMemory>> sharedWeights;
+ std::mutex guard;
+ static const SimpleDataHash simpleCRC;
+};
+
class Engine : public InferenceEngine::InferencePluginInternal {
public:
Engine() = default;
@@ -30,16 +77,20 @@ public:
void SetConfig(const std::map<std::string, std::string> &config) override;
/**
- * @depricated Use the version with config parameter
+ * @deprecated Use the version with config parameter
*/
void QueryNetwork(const InferenceEngine::ICNNNetwork& network, InferenceEngine::QueryNetworkResult& res) const override;
void QueryNetwork(const InferenceEngine::ICNNNetwork& network,
const std::map<std::string, std::string>& config, InferenceEngine::QueryNetworkResult& res) const override;
+ static MKLDNNWeightsSharing& GetWeightsSharing() { return weightsSharing; }
private:
Config engConfig;
MKLDNNExtensionManager::Ptr extensionManager = std::make_shared<MKLDNNExtensionManager>();
+
+protected:
+ static MKLDNNWeightsSharing weightsSharing;
};
} // namespace MKLDNNPlugin
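
SimpleDataHash above precomputes a 256-entry table and folds the blob bytes through it with a final inversion. The sketch below places that same table construction next to a plain bitwise form of the reflected CRC-64 with polynomial 0xc96c5795d7870f42 (zero initial value, final inversion); the helper names are hypothetical and exist only to compare the two results.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Table-driven CRC-64, built the same way as SimpleDataHash's constructor.
    static uint64_t crc64_table(const unsigned char* data, size_t size) {
        static uint64_t table[256];
        static bool init = false;
        if (!init) {
            for (int i = 0; i < 256; ++i) {
                uint64_t c = static_cast<uint64_t>(i);
                for (int j = 0; j < 8; ++j)
                    c = ((c & 1) ? 0xc96c5795d7870f42ULL : 0) ^ (c >> 1);
                table[i] = c;
            }
            init = true;
        }
        uint64_t crc = 0;
        for (size_t idx = 0; idx < size; ++idx)
            crc = table[(unsigned char)crc ^ data[idx]] ^ (crc >> 8);
        return ~crc;
    }

    // Plain bitwise form of the same reflected CRC-64 (zero init, final inversion).
    static uint64_t crc64_bitwise(const unsigned char* data, size_t size) {
        uint64_t crc = 0;
        for (size_t idx = 0; idx < size; ++idx) {
            crc ^= data[idx];
            for (int b = 0; b < 8; ++b)
                crc = (crc & 1) ? (crc >> 1) ^ 0xc96c5795d7870f42ULL : (crc >> 1);
        }
        return ~crc;
    }

    int main() {
        const char* msg = "weights blob bytes";
        const auto* p = reinterpret_cast<const unsigned char*>(msg);
        std::printf("table:   %016llx\nbitwise: %016llx\n",
                    static_cast<unsigned long long>(crc64_table(p, std::strlen(msg))),
                    static_cast<unsigned long long>(crc64_bitwise(p, std::strlen(msg))));
        return 0;   // both lines print the same checksum
    }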
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp
index 6fa73c52f..f9e59f2cc 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h
index 5bf983496..075afff9e 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp
new file mode 100644
index 000000000..a5198377c
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp
@@ -0,0 +1,372 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <string>
+#include <map>
+#include <vector>
+#include <limits>
+#include <chrono>
+#include <climits>
+#include <memory>
+
+#include "mkldnn_graph.h"
+#include "ie_parallel.hpp"
+#include "mkldnn_streams.h"
+
+using namespace mkldnn;
+using namespace MKLDNNPlugin;
+using namespace InferenceEngine;
+using namespace InferenceEngine::details;
+
+namespace MKLDNNPlugin {
+
+thread_local MultiWorkerTaskContext MultiWorkerTaskExecutor::ptrContext;
+
+bool check_env_variables() {
+#if IE_THREAD == IE_THREAD_OMP
+ return MKLDNNPlugin::cpu::checkOpenMpEnvVars(false);
+#else
+ return false;
+#endif
+}
+
+#if !(defined(__APPLE__) || defined(_WIN32))
+/* Get the cores affinity mask for the current process */
+bool get_process_mask(int& ncpus, cpu_set_t*& mask) {
+ for (ncpus = sizeof(cpu_set_t) / CHAR_BIT; ncpus < 1024 /* reasonable limit of #cores*/; ncpus <<= 1) {
+ mask = CPU_ALLOC(ncpus);
+ if (!mask) return false;
+
+ const size_t size = CPU_ALLOC_SIZE(ncpus);
+ CPU_ZERO_S(size, mask);
+ const int err = sched_getaffinity(getpid(), size, mask);
+ // the result fits the mask
+ if (!err) break;
+ // mask size is not enough
+ CPU_FREE(mask);
+ mask = NULL;
+ // other error
+ if (errno != EINVAL) break;
+ }
+ if (!mask) {
+ return false;
+ }
+ return true;
+}
+/* Pin current thread to a set of cores determined by the mask. */
+bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask) {
+ return 0 == sched_setaffinity(0, ncores, proc_mask);
+}
+/* Pin thread to a spare core in the round-robin scheme, while respecting the given process mask.
+ * The function can also handle the hyper-threading (by populating the physical cores first) */
+bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask) {
+ const size_t size = CPU_ALLOC_SIZE(ncores);
+ const int num_cpus = CPU_COUNT_S(size, proc_mask);
+    thr_idx %= num_cpus;  // To limit unique number in [0; num_cpus-1] range
+
+ // Place threads with specified step
+ int cpu_idx = 0;
+ for (int i = 0, offset = 0; i < thr_idx; ++i) {
+ cpu_idx += hyperthreads;
+ if (cpu_idx >= num_cpus)
+ cpu_idx = ++offset;
+ }
+
+ // Find index of 'cpu_idx'-th bit that equals to 1
+ int mapped_idx = -1;
+ while (cpu_idx >= 0) {
+ if (CPU_ISSET_S(++mapped_idx, size, proc_mask))
+ --cpu_idx;
+ }
+
+ cpu_set_t *target_mask = CPU_ALLOC(ncores);
+ CPU_ZERO_S(size, target_mask);
+ CPU_SET_S(mapped_idx, size, target_mask);
+ bool res = pin_current_thread_by_mask(size, target_mask);
+ CPU_FREE(target_mask);
+ return res;
+}
+#else // no threads pinning/binding on Win/MacOS
+bool get_process_mask(int& ncpus, cpu_set_t*& mask) {
+ ncpus = 0;
+ mask = nullptr;
+ return false;
+}
+bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask) {
+ return false;
+}
+bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask) {
+ return false;
+}
+#endif // !(defined(__APPLE__) || defined(_WIN32))
+
+MultiWorkerTaskExecutor::MultiWorkerTaskExecutor(const std::vector<Task::Ptr>& init_tasks, std::string name) :
+ _isStopped(false), _name(name), _initCount(0) {
+ for (auto t : init_tasks) {
+ _threads.push_back(std::thread([&, t] {
+            // initialization (no contention, every worker thread is doing its own task)
+ t->runNoThrowNoBusyCheck();
+ _initCount++;
+
+ while (!_isStopped) {
+ bool isQueueEmpty;
+ Task::Ptr currentTask = nullptr;
+ { // waiting for the new task or for stop signal
+ std::unique_lock<std::mutex> lock(_queueMutex);
+ _queueCondVar.wait(lock, [&]() { return !_taskQueue.empty() || _isStopped; });
+ isQueueEmpty = _taskQueue.empty();
+ if (!isQueueEmpty) {
+ currentTask = _taskQueue.front();
+ _taskQueue.pop();
+ isQueueEmpty = _taskQueue.empty();
+ }
+ }
+ if (currentTask)
+ currentTask->runNoThrowNoBusyCheck();
+ if (_isStopped)
+ break;
+ if (isQueueEmpty) // notify dtor, that all tasks were completed
+ _queueCondVar.notify_all();
+ }
+ }));
+ }
+ while (_initCount != init_tasks.size()) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(10));
+ }
+}
+
+MultiWorkerTaskExecutor::~MultiWorkerTaskExecutor() {
+ {
+ std::unique_lock<std::mutex> lock(_queueMutex);
+ if (!_taskQueue.empty()) {
+ _queueCondVar.wait(lock, [this]() { return _taskQueue.empty(); });
+ }
+ _isStopped = true;
+ _queueCondVar.notify_all();
+ }
+ for (auto& thread : _threads) {
+ if (thread.joinable()) {
+ thread.join();
+ }
+ }
+}
+
+bool MultiWorkerTaskExecutor::startTask(Task::Ptr task) {
+ if (!task->occupy()) return false;
+ std::unique_lock<std::mutex> lock(_queueMutex);
+ _taskQueue.push(task);
+ _queueCondVar.notify_one();
+ return true;
+}
+
+MKLDNNPlugin::MKLDNNGraphlessInferRequest::MKLDNNGraphlessInferRequest(InferenceEngine::InputsDataMap networkInputs,
+ InferenceEngine::OutputsDataMap networkOutputs)
+ : InferRequestInternal(networkInputs, networkOutputs), m_curBatch(-1) {
+ // Allocate all input blobs
+ for (const auto& it : networkInputs) {
+ InferenceEngine::Blob::Ptr blob;
+ GetBlob(it.first.c_str(), blob);
+ }
+ // Allocate all output blobs
+ for (const auto& it : networkOutputs) {
+ InferenceEngine::Blob::Ptr blob;
+ GetBlob(it.first.c_str(), blob);
+ }
+}
+
+
+void MKLDNNPlugin::MKLDNNGraphlessInferRequest::InferImpl() {
+ IE_PROFILING_AUTO_SCOPE(MKLDNN_INFER)
+
+ auto infer = [this] {
+ IE_ASSERT(MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph != nullptr);
+ MKLDNNGraph::Ptr graph = MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph;
+ if (!graph->IsReady())
+ THROW_IE_EXCEPTION << "Network not loaded.";
+ if (m_curBatch > 0 && !graph->getProperty().enableDynamicBatch)
+ THROW_IE_EXCEPTION << "Dynamic batch is not enabled.";
+
+ if (m_curBatch > graph->getProperty().batchLimit)
+ THROW_IE_EXCEPTION << "Invalid dynamic batch size " << m_curBatch <<
+ " for this request.";
+
+ // execute input pre-processing.
+ execDataPreprocessing(_inputs);
+
+        // need to retain converted blobs until infer finishes
+ std::vector<InferenceEngine::Blob::Ptr> convertedInputs;
+ for (auto input : _inputs) {
+ if (!_networkInputs[input.first]) {
+ THROW_IE_EXCEPTION <<
+ "input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name "
+ << input.first;
+ }
+ InferenceEngine::Blob::Ptr iconv;
+ InferenceEngine::TBlob<float> *in_f = nullptr;
+ switch (input.second->precision()) {
+ case InferenceEngine::Precision::FP32:
+ graph->PushInputData(input.first, input.second);
+ break;
+ case InferenceEngine::Precision::U16:
+ // U16 is unsupported by mkldnn, so here we convert the blob and send FP32
+ iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
+ InferenceEngine::Precision::FP32,
+ input.second->getTensorDesc().getLayout(), input.second->dims());
+ convertedInputs.push_back(iconv);
+ iconv->allocate();
+ in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+ InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get());
+ graph->PushInputData(input.first, iconv);
+ break;
+ case InferenceEngine::Precision::I16:
+ if (graph->hasMeanImageFor(input.first)) {
+ // If a mean image exists, we convert the blob and send FP32
+ iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
+ InferenceEngine::Precision::FP32,
+ input.second->getTensorDesc().getLayout(), input.second->dims());
+ convertedInputs.push_back(iconv);
+ iconv->allocate();
+ in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+ InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get());
+ graph->PushInputData(input.first, iconv);
+ } else {
+ // Instead we can send I16 directly
+ graph->PushInputData(input.first, input.second);
+ }
+ break;
+ case InferenceEngine::Precision::U8:
+ if (graph->hasMeanImageFor(input.first)) {
+ // If a mean image exists, we convert the blob and send FP32
+ iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
+ InferenceEngine::Precision::FP32,
+ input.second->getTensorDesc().getLayout(), input.second->dims());
+ convertedInputs.push_back(iconv);
+ iconv->allocate();
+ in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+ InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get());
+ graph->PushInputData(input.first, iconv);
+ } else {
+                // Instead we can send U8 directly
+ graph->PushInputData(input.first, input.second);
+ }
+ break;
+ default:
+ THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision();
+ }
+ }
+ graph->Infer(m_curBatch);
+ graph->PullOutputData(_outputs);
+ if (graph->getProperty().collectPerfCounters) {
+ m_perfMap.clear();
+ graph->GetPerfData(m_perfMap);
+ }
+ };
+#if IE_THREAD == IE_THREAD_TBB
+ auto_scope_observing observer(MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph->ptrObserver);
+ // a TBB arena is made "this" for Infer call via executing lambda for the arena
+ MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph->ptrArena->execute([&] { infer(); });
+#else
+ infer();
+#endif
+}
+
+void MKLDNNPlugin::MKLDNNGraphlessInferRequest::GetPerformanceCounts(
+ std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const {
+ perfMap = m_perfMap;
+}
+
+void MKLDNNPlugin::MKLDNNGraphlessInferRequest::GetBlob(const char *name, InferenceEngine::Blob::Ptr &data) {
+ // ROI blob is returned only if it was set previously.
+ auto it = _preProcData.find(name);
+ if (it != _preProcData.end()) {
+ data = it->second.getRoiBlob();
+ return;
+ }
+
+ if (_inputs.find(name) != _inputs.end()) {
+ data = _inputs[name];
+ checkBlob(data, name, true);
+ return;
+ } else if (_networkInputs.find(name) != _networkInputs.end()) {
+ InferenceEngine::Layout l = _networkInputs[name]->getLayout();
+ InferenceEngine::Precision p = _networkInputs[name]->getPrecision();
+ InferenceEngine::SizeVector dims = _networkInputs[name]->getTensorDesc().getDims();
+
+ InferenceEngine::TensorDesc desc = InferenceEngine::TensorDesc(p, dims, l);
+ _inputs[name] = data = make_blob_with_precision(desc);
+ _inputs[name]->allocate();
+ checkBlob(data, name, true);
+ return;
+ }
+
+ if (_outputs.find(name) != _outputs.end()) {
+ data = _outputs[name];
+ checkBlob(data, name, false);
+ return;
+ } else if (_networkOutputs.find(name) != _networkOutputs.end()) {
+ InferenceEngine::Layout l = _networkOutputs[name]->getLayout();
+ InferenceEngine::Precision p = _networkOutputs[name]->getPrecision();
+ InferenceEngine::SizeVector dims = _networkOutputs[name]->getTensorDesc().getDims();
+
+ InferenceEngine::TensorDesc desc = InferenceEngine::TensorDesc(p, dims, l);
+ _outputs[name] = data = make_blob_with_precision(desc);
+ _outputs[name]->allocate();
+ checkBlob(data, name, false);
+ return;
+ }
+
+ THROW_IE_EXCEPTION << "Cannot find blob with name: " << name;
+}
+
+void MKLDNNPlugin::MKLDNNGraphlessInferRequest::SetBlob(const char *name, const InferenceEngine::Blob::Ptr &data) {
+ if (!data)
+ THROW_IE_EXCEPTION << NOT_ALLOCATED_str << "Failed to set empty blob with name: \'" << name << "\'";
+ if (data->buffer() == nullptr)
+ THROW_IE_EXCEPTION << "Input data was not allocated. Input name: \'" << name << "\'";
+ if (name == nullptr) {
+ THROW_IE_EXCEPTION << NOT_FOUND_str + "Failed to set blob with empty name";
+ }
+ InferenceEngine::InputInfo::Ptr foundInput;
+ InferenceEngine::DataPtr foundOutput;
+ size_t dataSize = data->size();
+ if (findInputAndOutputBlobByName(name, foundInput, foundOutput)) {
+ if (foundInput->getInputPrecision() != data->precision()) {
+ THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Failed to set Blob with precision "
+ << data->precision();
+ }
+
+ if (foundInput->getPreProcess().getResizeAlgorithm() != InferenceEngine::ResizeAlgorithm::NO_RESIZE) {
+ // Stores the given blob as ROI blob. It will be used to fill in network input during pre-processing.
+ _preProcData[name].setRoiBlob(data);
+ } else {
+ size_t inputSize = InferenceEngine::details::product(foundInput->getDims());
+ if (dataSize != inputSize) {
+ THROW_IE_EXCEPTION << "Input blob size is not equal network input size ("
+ << dataSize << "!=" << inputSize << ").";
+ }
+ _inputs[name] = data;
+ }
+ } else {
+ size_t outputSize = InferenceEngine::details::product(foundOutput->getDims());
+ if (dataSize != outputSize) {
+ THROW_IE_EXCEPTION << "Output blob size is not equal network output size ("
+ << dataSize << "!=" << outputSize << ").";
+ }
+ if (foundOutput->getPrecision() != data->precision()) {
+ THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str
+ << "Failed to set Blob with precision not corresponding to user output precision";
+ }
+ _outputs[name] = data;
+ }
+}
+
+void MKLDNNPlugin::MKLDNNGraphlessInferRequest::SetBatch(int new_batch) {
+ if (new_batch < 1) {
+ THROW_IE_EXCEPTION << "Invalid dynamic batch size " << new_batch <<
+ " for this request.";
+ }
+ m_curBatch = new_batch;
+}
+
+} // namespace MKLDNNPlugin
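
MultiWorkerTaskExecutor above is essentially a fixed pool of "stream" threads blocking on a single task queue. Below is a stripped-down sketch of that queue/condition-variable scheme with simplified names (TinyExecutor, std::function tasks instead of Task::Ptr); it illustrates the pattern only and is not the plugin's implementation.

    #include <condition_variable>
    #include <cstdio>
    #include <functional>
    #include <mutex>
    #include <queue>
    #include <thread>
    #include <vector>

    class TinyExecutor {
    public:
        explicit TinyExecutor(size_t nStreams) {
            for (size_t i = 0; i < nStreams; ++i) {
                workers.emplace_back([this] {
                    while (true) {
                        std::function<void()> task;
                        {
                            std::unique_lock<std::mutex> lock(mtx);
                            cv.wait(lock, [this] { return stopped || !tasks.empty(); });
                            if (stopped && tasks.empty()) return;   // drained and asked to stop
                            task = std::move(tasks.front());
                            tasks.pop();
                        }
                        task();                                     // run outside the lock
                    }
                });
            }
        }
        ~TinyExecutor() {
            { std::lock_guard<std::mutex> lock(mtx); stopped = true; }
            cv.notify_all();
            for (auto& t : workers) t.join();
        }
        void start(std::function<void()> task) {
            { std::lock_guard<std::mutex> lock(mtx); tasks.push(std::move(task)); }
            cv.notify_one();                                        // wake one idle stream
        }
    private:
        std::vector<std::thread> workers;
        std::queue<std::function<void()>> tasks;
        std::mutex mtx;
        std::condition_variable cv;
        bool stopped = false;
    };

    int main() {
        TinyExecutor streams(2);                                    // two worker "streams"
        for (int i = 0; i < 4; ++i)
            streams.start([i] { std::printf("request %d done\n", i); });
        return 0;                                                   // dtor drains and joins
    }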
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_streams.h b/inference-engine/src/mkldnn_plugin/mkldnn_streams.h
new file mode 100644
index 000000000..31558fee2
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_streams.h
@@ -0,0 +1,177 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <atomic>
+#include <map>
+#include <queue>
+#include <memory>
+#include <climits>
+#include <cpp_interfaces/impl/ie_infer_request_internal.hpp>
+#include <cpp_interfaces/ie_task_executor.hpp>
+#include "ie_parallel.hpp"
+#include "mkldnn/omp_manager.h"
+
+/* CPU "streams" implement a feature that allows multiple Infer Requests to be efficiently run simultaneously.
+ * To avoid potential oversubscription the CPU execution resources are divided accordingly.
+ * The feature enables much better performance for the networks that originally do not scale well with #threads
+ * even for large batches. Examples are lightweight topologies or topologies with many sequential/mem-bound/etc. or
+ * otherwise non-scalable layers. This is especially pronounced for many-core (e.g. server) machines.
+ * This is a rather throughput-oriented feature, because running multiple requests in parallel might increase the latency
+ * of each request.
+ * Additionally, the streams help to relax the need for a large batch to improve the throughput and simplify the
+ * application logic, helping to saturate the CPU by multiple requests instead.
+ * Implementation-wise, the "streams" constitute the following:
+ * - Pure "graph-less" Infer Requests that are not connected to a specific MKLDNNGraph (which is the regular/legacy approach)
+ * - Just like regular requests, the graph-less ones go to the common (per ExecutableNetwork) queue
+ * - But unlike the conventional case, there are multiple threads that grab the requests (see MultiWorkerTaskExecutor)
+ * - So every stream is in fact an independent "worker" thread that monitors the queue.
+ * - Every worker thread (stream) has its own copy of the graph (which handles the intermediate data required for execution)
+ * - While the Infer Requests keep only the input/output data
+*/
+namespace MKLDNNPlugin {
+
+using namespace InferenceEngine;
+class MKLDNNGraph;
+class pinning_observer;
+
+/* This structure handles an "execution context" - data required to execute an Infer Request.
+ * This includes the graph (which handles the intermediate data) and the arena/observer for TBB. */
+struct MultiWorkerTaskContext {
+ std::shared_ptr<MKLDNNGraph> ptrGraph;
+};
+
+#if defined(__APPLE__) || defined(_WIN32)
+typedef void cpu_set_t;
+#define CPU_FREE(cpuset)
+// notice that functions below are just stubs for OSs other than Linux
+#endif
+/* Check whether any affinity-related env variables are set (relevant for the OpenMP) */
+bool check_env_variables();
+/* Get the cores affinity mask for the current process */
+bool get_process_mask(int& ncpus, cpu_set_t*& mask);
+/* Pin current thread to a set of cores determined by the mask. */
+bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask);
+/* Pin thread to a spare core in the round-robin scheme, while respecting the given process mask.
+ * The function can also handle the hyper-threading (by populating the physical cores first) */
+bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask);
+
+#if IE_THREAD == IE_THREAD_TBB
+/* Simple observer that handles pinning threads to the cores; it serves as a callback for threads entering the arena. */
+class pinning_observer: public tbb::task_scheduler_observer {
+ cpu_set_t *mask;
+ int ncpus;
+ int stream_id, threads_per_stream;
+ const int pinning_step;
+
+public:
+ pinning_observer(tbb::task_arena& _arena, int _stream_id, int _threads_per_stream, int _pinning_step = 1) :
+ tbb::task_scheduler_observer(_arena),
+ stream_id(_stream_id), threads_per_stream(_threads_per_stream), pinning_step(_pinning_step) {
+ get_process_mask(ncpus, mask);
+ }
+
+ void on_scheduler_entry(bool) override {
+ if (!mask) return;
+ int thread_idx = tbb::task_arena::current_thread_index();
+ int thr_idx = stream_id * threads_per_stream + thread_idx;
+ // pin thread to the vacant slot
+ pin_thread_to_vacant_core(thr_idx, pinning_step, ncpus, mask);
+ }
+
+ void on_scheduler_exit(bool) override {
+ if (!mask) return;
+ // reset the thread's mask (to the original process mask)
+ pin_current_thread_by_mask(ncpus, mask);
+ }
+
+ ~pinning_observer() {
+ if (mask)
+ CPU_FREE(mask);
+ }
+};
+
+class auto_scope_observing {
+public:
+ explicit auto_scope_observing(std::unique_ptr<tbb::task_scheduler_observer>& _p) : p(_p) {
+ if (p)
+ p->observe(true);
+ }
+ ~auto_scope_observing() {
+ if (p)
+ p->observe(false);
+ }
+
+protected:
+ std::unique_ptr<tbb::task_scheduler_observer>& p;
+};
+#endif // IE_THREAD == IE_THREAD_TBB
+
+/* Class wrapping multiple worker threads that monitors the same queue with Infer Requests. */
+class MultiWorkerTaskExecutor : public ITaskExecutor {
+public:
+ typedef std::shared_ptr<MultiWorkerTaskExecutor> Ptr;
+
+ explicit MultiWorkerTaskExecutor(const std::vector<Task::Ptr>&, std::string name = "Default");
+
+ ~MultiWorkerTaskExecutor();
+
+ /**
+ * @brief Adds task for execution and notifies one of the working threads about the new task.
+ * @note can be called from multiple threads - tasks will be added to the queue and executed one-by-one in FIFO mode.
+ * @param task - shared pointer to the task
+     * @return true if the task was added successfully, otherwise false
+ */
+ bool startTask(Task::Ptr task) override;
+
+ static thread_local MultiWorkerTaskContext ptrContext;
+
+private:
+ std::vector<std::thread> _threads;
+ std::mutex _queueMutex;
+ std::condition_variable _queueCondVar;
+ std::queue<Task::Ptr> _taskQueue;
+ std::atomic<bool> _isStopped;
+ std::string _name;
+ std::atomic<int> _initCount;
+};
+
+/* Pure Infer Requests - just input and output data. */
+class MKLDNNGraphlessInferRequest : public InferenceEngine::InferRequestInternal {
+public:
+ typedef std::shared_ptr<MKLDNNGraphlessInferRequest> Ptr;
+ explicit MKLDNNGraphlessInferRequest(InferenceEngine::InputsDataMap networkInputs,
+ InferenceEngine::OutputsDataMap networkOutputs);
+
+ void InferImpl() override;
+
+ void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const override;
+
+ /**
+     * @brief Default implementation of setting a blob, provided so that the plugin does not need to implement it
+ * @param name - a name of input or output blob.
+ * @param data - a reference to input or output blob. The type of Blob must correspond to the network input precision and size.
+ */
+ void SetBlob(const char *name, const InferenceEngine::Blob::Ptr &data) override;
+
+ /**
+     * @brief Default implementation of getting a blob, provided so that the plugin does not need to implement it
+ * @param name - a name of input or output blob.
+ * @param data - a reference to input or output blob. The type of Blob must correspond to the network input precision and size.
+ */
+ void GetBlob(const char *name, InferenceEngine::Blob::Ptr &data) override;
+
+
+ void SetBatch(int batch = -1) override;
+
+private:
+ int m_curBatch;
+ std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> m_perfMap;
+};
+
+
+} // namespace MKLDNNPlugin
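
pin_thread_to_vacant_core() in the new streams code walks the process affinity mask in a round-robin fashion, populating physical cores first and their hyper-thread siblings afterwards. The portable sketch below reproduces only that index arithmetic (no sched_setaffinity calls); vacant_cpu and the 'allowed' vector are illustrative names, not plugin API.

    #include <cstdio>
    #include <vector>

    // 'allowed' stands for the logical CPUs present in the process affinity mask.
    static int vacant_cpu(int thr_idx, int hyperthreads, const std::vector<int>& allowed) {
        const int num_cpus = static_cast<int>(allowed.size());
        thr_idx %= num_cpus;                      // keep the index in [0; num_cpus-1]

        int cpu_idx = 0;
        for (int i = 0, offset = 0; i < thr_idx; ++i) {
            cpu_idx += hyperthreads;              // jump over the sibling hyper-threads
            if (cpu_idx >= num_cpus)
                cpu_idx = ++offset;               // wrap to the next sibling slot
        }
        return allowed[cpu_idx];                  // the 'cpu_idx'-th enabled logical CPU
    }

    int main() {
        // 4 physical cores with 2 hyper-threads each: logical CPUs 0..7 allowed.
        std::vector<int> allowed = {0, 1, 2, 3, 4, 5, 6, 7};
        for (int stream = 0; stream < 8; ++stream)
            std::printf("stream %d -> cpu %d\n", stream, vacant_cpu(stream, 2, allowed));
        // Prints 0 2 4 6 1 3 5 7: physical cores first, then their siblings.
        return 0;
    }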
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp
index f48ada45e..d23b12e3b 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -16,6 +15,7 @@ using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace InferenceEngine::details;
+// TODO: (ichuraev) I am not fully sure that the names of types and parameters are correct for square, abs, sqrt, linear, bounded_relu and soft_relu
caseless_map<std::string, std::function<void(GenericLayer*, mkldnn::algorithm&, float&, float&)>> MKLDNNActivationNode::initializers = {
{"relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f);
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h
index 508d8c796..9dac1507c 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp
index 502a804d3..173df1c24 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp
@@ -1,11 +1,11 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
#include "mkldnn_batchnorm_node.h"
#include "mkldnn_depthwise_node.h"
#include <mkldnn_extension_utils.h>
+#include "ie_memcpy.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
@@ -77,7 +77,7 @@ void MKLDNNBatchNormalizationNode::getSupportedDescriptors() {
THROW_IE_EXCEPTION << "Cannot get weights blob for node " << getName() << ".";
size_t weightsByteSize = blb->byteSize();
- memcpy(data, blb->buffer(), weightsByteSize);
+ ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), weightsByteSize);
data += blb->size();
blb = scshLayer->_biases;
@@ -86,7 +86,7 @@ void MKLDNNBatchNormalizationNode::getSupportedDescriptors() {
} else {
if (weightsByteSize != blb->byteSize())
THROW_IE_EXCEPTION << "ScaleShift has incorrect weights!";
- memcpy(data, blb->buffer(), weightsByteSize);
+ ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), weightsByteSize);
}
internalBlobs.push_back(internalBlob);
}
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h
index ef948b70f..c7d9d3e17 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp
index 1da5d57f5..fd2893e95 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -59,16 +58,11 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
- InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
- auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
- precision = getCnnLayer()->outData[0]->getPrecision();
+ InferenceEngine::Precision iIEPrecision = getCnnLayer()->insData[0].lock()->getPrecision();
+ auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(iIEPrecision);
+ InferenceEngine::Precision precision = getCnnLayer()->outData[0]->getPrecision();
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
- if (getCnnLayer()->precision == Precision::I8) {
- inputDataType = memory::data_type::u8;
- outputDataType = memory::data_type::u8;
- }
-
MKLDNNDims dstDims = getChildEdgeAt(0)->getDims();
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
@@ -103,6 +97,16 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
}
}
+ } else if (dims.ndims() == 5) {
+ if (dims[1] % 8 == 0) {
+ config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw8c));
+ supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
+
+ if (dims[1] % 16 == 0) {
+ config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw16c));
+ supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
+ }
+ }
}
if (axis != 1 || hasEltwise)
@@ -110,12 +114,11 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
auto numOfDim = static_cast<size_t>(dstDims.ndims());
- SizeVector order;
- SizeVector offsets;
+ SizeVector order(numOfDim);
+ SizeVector offsets(numOfDim, 0lu);
size_t offset = std::numeric_limits<size_t>::max();
for (size_t i = 0; i < numOfDim; i++) {
- order.push_back(i);
- offsets.push_back(0);
+ order[i] = i;
}
if (this->getCnnLayer()->precision == Precision::I8) {
@@ -135,7 +138,9 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
strides[i] = std::numeric_limits<size_t>::max();
}
- config.outConfs[0].desc = TensorDesc(Precision::U8, dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});
+ config.outConfs[0].desc = TensorDesc(this->getCnnLayer()->outData[0]->getPrecision(),
+ dstDims.ToSizeVector(),
+ { blkDims, order, offset, offsets, strides });
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
@@ -144,7 +149,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NHWC in mkldnn
- config.inConfs[i].desc = TensorDesc(Precision::U8, parentEdge->getDims().ToSizeVector(),
+ config.inConfs[i].desc = TensorDesc(iIEPrecision, parentEdge->getDims().ToSizeVector(),
{blkDims, order, offset, offsets, strides});
}
@@ -174,26 +179,30 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
- if (numOfDim == 4) {
- order = {0, 1, 2, 3, 1};
- offsets = {0, 0, 0, 0, 0};
- numOfDim = 5;
+ if (numOfDim == 4lu || numOfDim == 5lu) {
+ size_t blkDimsLen = numOfDim + 1;
+ order.resize(blkDimsLen);
+ for (size_t i = 0; i < numOfDim; i++) {
+ order[i] = i;
+ }
+ order[numOfDim] = 1lu;
+ offsets = SizeVector(blkDimsLen, 0lu);
- // nChw8c and nChw16c
- for (int sizeS : {8, 16}) {
+ // nChw8c, nChw16c, nCdhw8c, nCdhw16c
+ for (size_t sizeS : {8lu, 16lu}) {
SizeVector blkDims = dstDims.ToSizeVector();
if (blkDims[1] % sizeS)
continue;
- blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0);
+ blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
blkDims.push_back(sizeS);
- strides.resize(numOfDim);
- strides[numOfDim - 1] = 1;
- for (size_t i = 2; i <= numOfDim; i++) {
- if (numOfDim - i < axis) {
- strides[numOfDim - i] = std::numeric_limits<size_t>::max();
+ strides.resize(blkDimsLen);
+ strides[blkDimsLen - 1] = 1;
+ for (size_t i = 2lu; i <= blkDimsLen; i++) {
+ if (blkDimsLen - i < axis) {
+ strides[blkDimsLen - i] = std::numeric_limits<size_t>::max();
} else {
- strides[numOfDim - i] = strides[numOfDim - i + 1] * blkDims[numOfDim - i + 1];
+ strides[blkDimsLen - i] = strides[blkDimsLen - i + 1] * blkDims[blkDimsLen - i + 1];
}
}
config.outConfs[0].desc = TensorDesc(
@@ -201,13 +210,13 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});
bool canInplace = true;
- for (size_t i = 0; canInplace && i < getParentEdges().size(); i++) {
+ for (size_t i = 0lu; canInplace && i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
blkDims = parentEdge->getDims().ToSizeVector();
if (blkDims[1] % sizeS)
canInplace = false;
- blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0);
+ blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
blkDims.push_back(sizeS);
config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
{blkDims, order, offset, offsets, strides});
@@ -225,11 +234,6 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
precision = getCnnLayer()->outData[0]->getPrecision();
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
- if (getCnnLayer()->precision == Precision::I8) {
- inputDataType = memory::data_type::u8;
- outputDataType = memory::data_type::u8;
- }
-
bool hasUnknown = false;
std::vector<size_t> canSelectPrimitive;
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
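
The Concat changes above extend the blocked nChw8c/nChw16c handling to the 5D nCdhw8c/nCdhw16c formats: the channel dimension is split into outer blocks plus an innermost block of 8 or 16 channels, and dense strides are rebuilt over the longer dimension list. A small sketch of that bookkeeping, using a hypothetical blocked_desc helper rather than the plugin's TensorDesc machinery:

    #include <cstdio>
    #include <vector>

    static void blocked_desc(const std::vector<size_t>& dims, size_t block) {
        std::vector<size_t> blkDims = dims;
        blkDims[1] = (dims[1] + block - 1) / block;       // outer channel blocks
        blkDims.push_back(block);                         // innermost channel block

        std::vector<size_t> strides(blkDims.size());      // dense strides, innermost first
        strides.back() = 1;
        for (size_t i = blkDims.size() - 1; i > 0; --i)
            strides[i - 1] = strides[i] * blkDims[i];

        std::printf("blkDims:");
        for (size_t d : blkDims) std::printf(" %zu", d);
        std::printf("\nstrides:");
        for (size_t s : strides) std::printf(" %zu", s);
        std::printf("\n");
    }

    int main() {
        blocked_desc({1, 32, 8, 28, 28}, 8);   // NCDHW 1x32x8x28x28 viewed as nCdhw8c
        blocked_desc({1, 32, 28, 28}, 16);     // NCHW  1x32x28x28   viewed as nChw16c
        return 0;
    }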
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h
index 2b5fa898c..9aa51d7cd 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp
index 109a87fe5..ea1aee821 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -37,18 +36,18 @@ MKLDNNConvolutionNode::MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr&
wScale = ws->second;
}
-
// Trying to find oi-scale
- lastInInt8Chain = true;
- auto ois = layer->blobs.find("oi-scale");
- if (ois != layer->blobs.end()) {
- // If we can find an o-scale, then the next layer has to be an INT8.
- lastInInt8Chain = false;
- oScale = ois->second;
- } else {
- // If we can't find an oi-scale then the next layer has to be
- // an FP32, so we are the last layer in the INT8-chain
- lastInInt8Chain = true;
+ if (getCnnLayer()->type == "Convolution" && getCnnLayer()->precision == Precision::I8) {
+ auto ois = layer->blobs.find("oi-scale");
+ if ((getCnnLayer()->outData[0]->getPrecision() == Precision::I8 || getCnnLayer()->outData[0]->getPrecision() == Precision::U8)
+ && ois == layer->blobs.end()) {
+            THROW_IE_EXCEPTION << "Internal error of graph quantization - mismatch of intermediate scales and next layer type for convolution "
+ << getCnnLayer()->name;
+ }
+ if (ois != layer->blobs.end()) {
+ // If we can find an oi-scale, then the next layer has to be an INT8.
+ oScale = ois->second;
+ }
}
}
@@ -99,6 +98,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
groupOC /= groupNum;
}
+ weightDims.clear();
weightDims.push_back(groupOC);
weightDims.push_back(groupIC);
for (int i = 1; i <= convLayer->_kernel.size(); i++) {
@@ -141,13 +141,13 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
dilation.push_back(static_cast<int>(convLayer->_dilation[convLayer->_dilation.size() - i]) - 1);
}
- auto allPads = getConvPaddings(*convLayer);
+ auto allPads = getPaddings(*convLayer);
invertVectorCopyUtoI(allPads.begin, paddingL);
invertVectorCopyUtoI(allPads.end, paddingR);
MKLDNNDims weightsDims = MKLDNNDims(weightDims);
- for (int i = 0; i < 2; i++) {
+ for (int i = 0; i < paddingR.size(); i++) {
int with_group = (isGrouped || isMerged) ? 1 : 0;
int krn = weightsDims[with_group + 2 + i];
int src = getParentEdgeAt(0)->getDims()[2 + i];
@@ -176,26 +176,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
}
}
- if (weights->precision() == Precision::I8) {
- inputDataType = memory::u8;
- if (lastInInt8Chain) {
- outputDataType = memory::f32;
- } else {
- // Searching for the last fused node and taking the precision from there
- Precision p = getCnnLayer()->precision;
- if (fusedWith.size() > 0 && fusedWith[fusedWith.size() - 1]->getCnnLayer()->type == "ReLU") {
- p = fusedWith[fusedWith.size() - 1]->getCnnLayer()->precision;
- }
-
- if (p == Precision::I8) {
- outputDataType = memory::s8;
- } else if (p == Precision::U8) {
- outputDataType = memory::u8;
- } else {
- THROW_IE_EXCEPTION << "Invalid layer precision for " << getName();
- }
- }
-
+ if (this->getCnnLayer()->precision == Precision::I8) {
MKLDNNMemoryDesc in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nhwc);
MKLDNNMemoryDesc out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nhwc);
createDescriptor({in_candidate}, {out_candidate});
@@ -204,22 +185,48 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
inputDataType = memory::f32;
outputDataType = memory::f32;
- MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, memory::nchw);
- MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, memory::nchw);
- createDescriptor({in_candidate}, {out_candidate});
+ Layout layout = convLayer->input()->getLayout();
- if (IC == 3 || IC == 1) {
- out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
- createDescriptor({in_candidate}, {out_candidate});
- out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
+ if (layout == NCHW || layout == NHWC) {
+ MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType,
+ layout == NCHW ? memory::nchw : memory::nhwc);
+ MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType,
+ layout == NCHW ? memory::nchw : memory::nhwc);
createDescriptor({in_candidate}, {out_candidate});
- } else {
- in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c);
- out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
- createDescriptor({in_candidate}, {out_candidate});
- in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw8c);
- out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
+
+ if (IC == 3 || IC == 1) {
+ out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
+ createDescriptor({in_candidate}, {out_candidate});
+ out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
+ createDescriptor({in_candidate}, {out_candidate});
+ } else {
+ in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c);
+ out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
+ createDescriptor({in_candidate}, {out_candidate});
+ in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw8c);
+ out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
+ createDescriptor({in_candidate}, {out_candidate});
+ }
+ } else if (layout == NCDHW || layout == NDHWC) {
+ MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType,
+ layout == NCDHW ? memory::ncdhw : memory::ndhwc);
+ MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType,
+ layout == NCDHW ? memory::ncdhw : memory::ndhwc);
createDescriptor({in_candidate}, {out_candidate});
+
+ if (IC == 3 || IC == 1) {
+ out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c);
+ createDescriptor({in_candidate}, {out_candidate});
+ out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c);
+ createDescriptor({in_candidate}, {out_candidate});
+ } else {
+ in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c);
+ out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c);
+ createDescriptor({in_candidate}, {out_candidate});
+ in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c);
+ out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c);
+ createDescriptor({in_candidate}, {out_candidate});
+ }
}
}
}
@@ -231,7 +238,15 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
for (auto &node : fusedWith) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode) {
- ops.append_sum(1.0);
+ if (eltwiseNode->getCnnLayer()->precision == Precision::I8) {
+ auto it = eltwiseNode->getCnnLayer()->blobs.find("eltwise-sum-scale");
+ if (it != eltwiseNode->getCnnLayer()->blobs.end()) {
+                    // currently there is only one scale, while we need a per-channel scale :(
+ ops.append_sum(it->second->buffer().as<float*>()[0]);
+ }
+ } else {
+ ops.append_sum(1.0);
+ }
continue;
}
@@ -252,11 +267,10 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
- int bufferSize = depthwiseNode->isBroadcast() ? 1 : depthwiseDims[0];
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
depthwiseLayer->_weights->buffer(),
- bufferSize * MKLDNNExtensionUtils::sizeOfDataType(
- memory::data_type::f32));
+ depthwiseLayer->_weights->size() *
+ MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
if (depthwiseNode->isBroadcast()) {
float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
@@ -271,9 +285,8 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
memory::format::x);
PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
depthwiseLayer->_biases->buffer(),
- bufferSize *
- MKLDNNExtensionUtils::sizeOfDataType(
- memory::data_type::f32));
+ depthwiseLayer->_biases->size() *
+ MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
if (depthwiseNode->isBroadcast()) {
float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
@@ -450,14 +463,15 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine::
bdt = memory::s32;
Precision outPrec;
- if (lastInInt8Chain) {
+ if (getCnnLayer()->outData[0]->getPrecision() == Precision::FP32) {
outPrec = Precision::FP32;
} else {
        // define precision according to the normalizer
+        // TODO(amalyshe) do we need a separate flow for the last node in the int8 chain or not?
outPrec = outDesc.getPrecision();
}
- inDesc = TensorDesc(Precision::U8, inputDesc[0].getDims(), inputDesc[0].getBlockingDesc());
+    inDesc = TensorDesc(inDesc.getPrecision(), inputDesc[0].getDims(), inputDesc[0].getBlockingDesc());
outDesc = TensorDesc(outPrec, outputDesc[0].getDims(), outputDesc[0].getBlockingDesc());
}
@@ -502,8 +516,8 @@ void MKLDNNConvolutionNode::addScaleToPrimitiveAttr(mkldnn::primitive_attr attr)
float* wScaleData = static_cast<float*>(wScale->buffer());
std::vector<float> oScaleDataVector;
- if (!lastInInt8Chain) {
- float* oScaleData = static_cast<float*>(oScale->buffer());
+ if (getCnnLayer()->precision == Precision::I8 && getCnnLayer()->outData[0]->getPrecision() != Precision::FP32) {
+ float *oScaleData = static_cast<float *>(oScale->buffer());
for (size_t c = 0; c < wScale->size(); c++) {
oScaleDataVector.push_back(wScaleData[c] / oScaleData[c]);
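
addScaleToPrimitiveAttr above now derives the per-channel output scales as the ratio of the weight scale to the oi-scale whenever the convolution output stays in the INT8 chain. A toy sketch of that arithmetic with made-up scale values (the real blobs come from the quantization normalizer, and the resulting vector is attached to the primitive attributes):

    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> wScale  = {0.50f, 0.25f, 0.125f};   // per-channel weight scales
        std::vector<float> oiScale = {0.10f, 0.05f, 0.025f};   // scales expected by the next layer

        std::vector<float> outputScales(wScale.size());
        for (size_t c = 0; c < wScale.size(); ++c)
            outputScales[c] = wScale[c] / oiScale[c];          // requantization factor per channel

        for (size_t c = 0; c < outputScales.size(); ++c)
            std::printf("channel %zu: output scale %.3f\n", c, outputScales[c]);
        return 0;
    }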
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h
index aa2424194..19191ee45 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -57,8 +56,6 @@ private:
InferenceEngine::ConvolutionLayer* convLayer;
InferenceEngine::Blob::Ptr wScale, oScale;
-
- bool lastInInt8Chain;
};
} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp
index aafa4aec0..8b11c296f 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h
index 2895e81e8..f74ab297e 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp
index 1295e05a5..38ca06ce8 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -11,6 +10,7 @@
#include <vector>
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
+#include <ie_layers_internal.hpp>
#include "ie_parallel.hpp"
using namespace mkldnn;
@@ -67,18 +67,17 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() {
deconvLayer->_group,
deconvLayer->input()->getTensorDesc().getDims()[1] / deconvLayer->_group,
deconvLayer->_out_depth / deconvLayer->_group,
- deconvLayer->_kernel[Y_AXIS],
- deconvLayer->_kernel[X_AXIS]
};
groupNum = deconvLayer->_group;
} else {
weightDims = {
deconvLayer->input()->getTensorDesc().getDims()[1],
- deconvLayer->_out_depth,
- deconvLayer->_kernel[Y_AXIS],
- deconvLayer->_kernel[X_AXIS]
+ deconvLayer->_out_depth
};
}
+ for (int i = 1; i <= deconvLayer->_kernel.size(); i++) {
+ weightDims.push_back(deconvLayer->_kernel[deconvLayer->_kernel.size() - i]);
+ }
internalBlobs.push_back(createInternalBlob(weightDims, true));
@@ -86,12 +85,13 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() {
for (int i = 1; i <= deconvLayer->_dilation.size(); i++) {
dilation.push_back(static_cast<int>(deconvLayer->_dilation[deconvLayer->_dilation.size() - i]) - 1);
}
- invertVectorCopyUtoI(deconvLayer->_padding, paddingL);
- invertVectorCopyUtoI(deconvLayer->_pads_end, paddingR);
+ auto allPads = getPaddings(*deconvLayer);
+ invertVectorCopyUtoI(allPads.begin, paddingL);
+ invertVectorCopyUtoI(allPads.end, paddingR);
weightsDims = MKLDNNDims(weightDims);
- for (int i = 0; i < 2; i++) {
+ for (int i = 0; i < paddingR.size(); i++) {
int with_group = (withGroups) ? 1 : 0;
int krn = weightsDims[with_group + 2 + i];
int src = getChildEdgeAt(0)->getDims()[2 + i];
@@ -115,28 +115,46 @@ void MKLDNNDeconvolutionNode::execute(mkldnn::stream strm) {
}
if (withBiases) {
const auto *bias = biases->buffer().as<const float*>();
+ auto biasSize = biases->size();
auto dst = getChildEdgeAt(0)->getBlob();
float *output = dst->buffer().as<float *>() + dst->getTensorDesc().getBlockingDesc().getOffsetPadding();
+ auto dims_size = dst->getTensorDesc().getDims().size();
+ auto layout = dst->layout();
const size_t N = dst->getTensorDesc().getDims()[0];
- const size_t C = dst->getTensorDesc().getBlockingDesc().getBlockDims()[1] / groupNum;
- const size_t H = dst->getTensorDesc().getDims()[2];
- const size_t W = dst->getTensorDesc().getDims()[3];
- const size_t blkC =
- dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 4 ?
- dst->getTensorDesc().getBlockingDesc().getBlockDims()[4] :
- 1;
+ size_t C = dst->getTensorDesc().getBlockingDesc().getBlockDims()[1] / groupNum;
+ if (C < 1) C = 1;
+ const size_t D = dims_size > 4 ? dst->getTensorDesc().getDims()[dims_size - 3] : 1lu;
+ const size_t H = dst->getTensorDesc().getDims()[dims_size - 2];
+ const size_t W = dst->getTensorDesc().getDims()[dims_size - 1];
+ size_t blkC = 1lu;
+ if (layout == BLOCKED && dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 5) {
+ blkC = dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 5 ?
+ dst->getTensorDesc().getBlockingDesc().getBlockDims()[5] :
+ 1lu;
+ } else if (layout == BLOCKED && dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 4) {
+ blkC = dst->getTensorDesc().getBlockingDesc().getBlockDims()[4];
+ }
auto strides = dst->getTensorDesc().getBlockingDesc().getStrides();
+ int output_size = strides[0] * N - dst->getTensorDesc().getBlockingDesc().getOffsetPadding();
- parallel_for4d(N, C, H, W, [&](size_t n, size_t c, size_t h, size_t w) {
+ parallel_for5d(N, C, D, H, W, [&](size_t n, size_t c, size_t d, size_t h, size_t w) {
for (size_t g = 0; g < groupNum; g++) {
- const size_t off = n * strides[0] + (g * C + c) * strides[1] + h * strides[2] + w * strides[3];
+ const size_t off = n * strides[0]
+ + (g * C + c) * strides[1]
+ + d * strides[dims_size - 3]
+ + h * strides[dims_size - 2]
+ + w * strides[dims_size - 1];
+ if (off >= output_size) continue;
auto o = &output[off];
+ int gcb = g * C * blkC + c * blkC;
for (int bc = 0; bc < blkC; ++bc) {
- o[bc] += bias[c * blkC + bc];
+ int index = gcb + bc;
+ if (index < biasSize)
+ o[bc] += bias[index];
}
}
});
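The loop above adds one bias value per output channel across a destination tensor that may be grouped and channel-blocked, guarding against padded offsets (off >= output_size) and bias blobs shorter than the blocked channel count. On a plain, non-blocked NCDHW output the same operation reduces to the sketch below (illustrative only; this is not the code path used here):

    #include <cstddef>

    // Sketch: per-output-channel bias addition for a dense NCDHW tensor
    // (no blocking, no groups).
    void addBiasNCDHW(float* out, const float* bias,
                      std::size_t N, std::size_t C, std::size_t D,
                      std::size_t H, std::size_t W) {
        for (std::size_t n = 0; n < N; ++n)
            for (std::size_t c = 0; c < C; ++c) {
                float* plane = out + (n * C + c) * D * H * W;
                for (std::size_t i = 0; i < D * H * W; ++i)
                    plane[i] += bias[c];  // one bias value per channel
            }
    }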
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h
index 244054c26..e32a66a73 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp
index 8eadcf824..6b1097a62 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -39,9 +38,20 @@ void MKLDNNDepthwiseNode::getSupportedDescriptors() {
SizeVector weightDims = { (long unsigned int)parentOutDims[1] };
MKLDNNDims blocked_weightDims(weightDims);
+ auto * wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(getCnnLayer().get());
+ if (wLayer == nullptr)
+ THROW_IE_EXCEPTION << "Cannot get weightable layer for node " << getName() << ".";
+
+ InferenceEngine::Blob::Ptr blb = wLayer->_weights;
+ if (blb)
+ realWeightSize = blb->size();
internalBlobs.push_back(createInternalBlob(weightDims, true));
- if (isWithBiases())
+ if (isWithBiases()) {
+ InferenceEngine::Blob::Ptr blb = wLayer->_biases;
+ if (blb)
+ realBiasSize = blb->size();
internalBlobs.push_back(createInternalBlob(weightDims, false));
+ }
for (auto format : getAvailableFormatsForDims(parentOutDims)) {
MKLDNNMemoryDesc in_candidate{parentOutDims, inputDataType, format};
@@ -66,13 +76,15 @@ void MKLDNNDepthwiseNode::createPrimitive() {
if (isBroadcast()) {
float broadcastValue = static_cast<float*>(internalBlobMemory[0]->GetData())[0];
- for (int i = 1; i < internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
+ int blbSize = internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0];
+ for (int i = 1; i < blbSize && realWeightSize != blbSize; i++) {
static_cast<float*>(internalBlobMemory[0]->GetData())[i] = broadcastValue;
}
if (isWithBiases()) {
+ blbSize = internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0];
broadcastValue = static_cast<float*>(internalBlobMemory[1]->GetData())[0];
- for (int i = 1; i < internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
+ for (int i = 1; i < blbSize && realBiasSize != blbSize; i++) {
static_cast<float*>(internalBlobMemory[1]->GetData())[i] = broadcastValue;
}
}
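When the depthwise layer is marked as broadcast, the first scalar of the internal blob is replicated across all channels, but only when the original IE blob did not already hold a full per-channel vector; that is what the realWeightSize/realBiasSize comparisons express. A reduced standalone sketch of that fill, with hypothetical names:

    #include <cstddef>

    // Sketch: replicate buf[0] across the whole internal buffer unless the
    // source blob already supplied one value per channel (realSize == bufSize).
    void broadcastScalar(float* buf, int bufSize, std::size_t realSize) {
        const float v = buf[0];
        for (int i = 1; i < bufSize && realSize != static_cast<std::size_t>(bufSize); ++i)
            buf[i] = v;
    }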
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h
index 78ef529f5..16bd3a505 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -50,6 +49,8 @@ private:
static Register<MKLDNNDepthwiseNode> reg;
mkldnn::algorithm algorithm;
+ size_t realWeightSize = 0;
+ size_t realBiasSize = 0;
bool withBiases;
bool broadcast;
};
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp
index 0a051dc52..111196817 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -99,15 +98,9 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::FP32);
supportedPrimitiveDescriptors.push_back(same(inputDT, outputDT, format));
} else {
- THROW_IE_EXCEPTION << "Invalid Eltwise layer precision";
+ THROW_IE_EXCEPTION << "Invalid Eltwise layer precision: " << getCnnLayer()->name;
}
}
-
- if (getCnnLayer()->precision == Precision::I8) {
- mkldnn::memory::data_type inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::U8);
- mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::U8);
- supportedPrimitiveDescriptors.push_back(same(inputDT, outputDT, mkldnn::memory::format::nhwc));
- }
}
void MKLDNNEltwiseNode::createPrimitive() {
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h
index e206799f4..0395cd432 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp
index 20b60c62c..75b814e81 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -60,8 +59,11 @@ void MKLDNNFullyConnectedNode::getSupportedDescriptors() {
} else if (inDims.ndims() == 4) {
weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]),
static_cast<size_t>(inDims[3])};
+ } else if (inDims.ndims() == 5) {
+ weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]),
+ static_cast<size_t>(inDims[3]), static_cast<size_t>(inDims[4])};
} else {
- THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 4 or 2, got: "
+ THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 5, 4 or 2, got: "
<< inDims.ndims() << " dims.";
}
@@ -113,10 +115,16 @@ memory::format MKLDNNFullyConnectedNode::weightsFormatForSrcFormat(memory::forma
return memory::format::oi;
case memory::format::nchw:
return memory::format::oihw;
+ case memory::format::ncdhw:
+ return memory::format::oidhw;
case memory::format::nChw8c:
return memory::format::oIhw8i;
+ case memory::format::nCdhw8c:
+ return memory::format::oIdhw8i;
case memory::format::nChw16c:
return memory::format::oIhw16i;
+ case memory::format::nCdhw16c:
+ return memory::format::oIdhw16i;
default:
THROW_IE_EXCEPTION << "Unsupported source format for node " << getName();
}
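With 5D inputs now accepted, the FC weights simply gain the extra spatial dimension, and the source-to-weights format mapping is extended in kind (ncdhw to oidhw, nCdhw8c to oIdhw8i, nCdhw16c to oIdhw16i). A rough sketch of how the weight dims are assembled for the supported ranks, assuming the 2D case follows the same drop-the-batch-dim pattern as the 4D and 5D cases shown above (helper name is hypothetical):

    #include <cstddef>
    #include <stdexcept>
    #include <vector>

    // Sketch: FC weights dims are {outNum, <all input dims except the batch dim>}.
    std::vector<std::size_t> fcWeightsDims(std::size_t outNum,
                                           const std::vector<std::size_t>& inDims) {
        if (inDims.size() != 2 && inDims.size() != 4 && inDims.size() != 5)
            throw std::runtime_error("Unsupported source rank for FC layer");
        std::vector<std::size_t> w{outNum};
        w.insert(w.end(), inDims.begin() + 1, inDims.end());  // skip the batch dim
        return w;
    }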
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h
index 88259a265..73c06f7ce 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp
new file mode 100644
index 000000000..2874d9dfe
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp
@@ -0,0 +1,234 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "mkldnn_gemm_node.h"
+#include <ie_layers.h>
+#include <string>
+#include <vector>
+#include <memory>
+#include <algorithm>
+#include <cmath>
+#include <mkldnn_types.h>
+#include <mkldnn_extension_utils.h>
+
+using namespace mkldnn;
+using namespace MKLDNNPlugin;
+using namespace InferenceEngine;
+
+MKLDNNGemmNode::MKLDNNGemmNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {}
+
+void MKLDNNGemmNode::getSupportedDescriptors() {
+ auto* gemmLayer = dynamic_cast<GemmLayer*>(getCnnLayer().get());
+
+ if (gemmLayer == nullptr)
+ THROW_IE_EXCEPTION << "Cannot convert gemm layer.";
+
+ if (getParentEdges().size() != 2 && getParentEdges().size() != 3)
+ THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
+ if (getChildEdges().size() != 1)
+ THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
+
+ auto inDims0 = getParentEdgeAt(0)->getDims();
+ auto inDims1 = getParentEdgeAt(1)->getDims();
+ auto outDims = getChildEdgeAt(0)->getDims();
+
+ alpha = gemmLayer->alpha;
+ beta = gemmLayer->beta;
+ transposeA = gemmLayer->transpose_a;
+ transposeB = gemmLayer->transpose_b;
+
+ if ((inDims0.ndims() < 2 || inDims0.ndims() > 4) ||
+ (inDims1.ndims() < 2 || inDims1.ndims() > 4))
+ THROW_IE_EXCEPTION << "Unsupported input dims count for layer " << getName();
+
+ if (outDims.ndims() < 2 || outDims.ndims() > 4)
+ THROW_IE_EXCEPTION << "Unsupported output dims count for layer " << getName();
+
+ if (inDims0.ndims() != inDims1.ndims() || inDims0.ndims() != outDims.ndims())
+ THROW_IE_EXCEPTION << "Invalid dims count for layer " << getName();
+
+ int nDims = inDims0.ndims();
+ xAxis = nDims - 1;
+ yAxis = nDims - 2;
+
+ if (inDims0[xAxis] != inDims1[yAxis] || inDims0[yAxis] != outDims[yAxis] || inDims1[xAxis] != outDims[xAxis])
+ THROW_IE_EXCEPTION << "Spatial input and output dimensions are incorrect for layer " << getName();
+
+ isThreeInputs = getParentEdges().size() == 3;
+
+ if (isThreeInputs) {
+ auto inDims2 = getParentEdgeAt(2)->getDims();
+
+ if (inDims2.ndims() < 2 || inDims2.ndims() > 4)
+ THROW_IE_EXCEPTION << "Unsupported output dims count for layer " << getName();
+
+ if (inDims2.ndims() != outDims.ndims())
+ THROW_IE_EXCEPTION << "Invalid dims count for layer " << getName();
+
+ if (inDims2[yAxis] != outDims[yAxis] || inDims2[xAxis] != outDims[xAxis])
+ THROW_IE_EXCEPTION << "Spatial input and output dimensions are incorrect for layer " << getName();
+ }
+
+ for (int dim_idx = nDims - 3; dim_idx >= 0; dim_idx--) {
+ if (isThreeInputs) {
+ auto inDims2 = getParentEdgeAt(2)->getDims();
+
+ if (inDims2[dim_idx] != outDims[dim_idx] && inDims2[dim_idx] != 1)
+ THROW_IE_EXCEPTION << "Input batch dimensions are incorrect for layer " << getName();
+
+ int cOffset = 1;
+ for (int i = dim_idx + 1; i < nDims; i++)
+ cOffset *= inDims2[i];
+ cOffsets.push_back(inDims2[dim_idx] == outDims[dim_idx] ? cOffset : 0);
+ }
+
+ if ((inDims0[dim_idx] != outDims[dim_idx] && inDims0[dim_idx] != 1) ||
+ (inDims1[dim_idx] != outDims[dim_idx] && inDims1[dim_idx] != 1)) {
+ THROW_IE_EXCEPTION << "Input batch dimensions are incorrect for layer " << getName();
+ }
+
+ int aOffset = 1;
+ for (int i = dim_idx + 1; i < nDims; i++)
+ aOffset *= inDims0[i];
+ aOffsets.push_back(inDims0[dim_idx] == outDims[dim_idx] ? aOffset : 0);
+
+ int bOffset = 1;
+ for (int i = dim_idx + 1; i < nDims; i++)
+ bOffset *= inDims1[i];
+ bOffsets.push_back(inDims1[dim_idx] == outDims[dim_idx] ? bOffset : 0);
+ }
+
+ for (unsigned long dim_idx = aOffsets.size(); dim_idx < 2; dim_idx++)
+ aOffsets.push_back(0);
+ for (unsigned long dim_idx = bOffsets.size(); dim_idx < 2; dim_idx++)
+ bOffsets.push_back(0);
+ for (unsigned long dim_idx = cOffsets.size(); dim_idx < 2; dim_idx++)
+ cOffsets.push_back(0);
+}
+
+void MKLDNNGemmNode::initSupportedPrimitiveDescriptors() {
+ if (!supportedPrimitiveDescriptors.empty())
+ return;
+
+ auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32);
+ auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32);
+
+ auto same = [&] (memory::format fmt) -> PrimitiveDescInfo {
+ InferenceEngine::LayerConfig config;
+ config.dynBatchSupport = true;
+ for (size_t i = 0; i < getParentEdges().size(); i++) {
+ InferenceEngine::DataConfig dataConfig;
+ dataConfig.inPlace = -1;
+ dataConfig.constant = false;
+ dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, fmt);
+ config.inConfs.push_back(dataConfig);
+ }
+
+ InferenceEngine::DataConfig dataConfig;
+ dataConfig.inPlace = -1;
+ dataConfig.constant = false;
+ dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt);
+ config.outConfs.push_back(dataConfig);
+ return {config, impl_desc_type::gemm_any};
+ };
+
+ supportedPrimitiveDescriptors.push_back(same(memory::any));
+}
+
+void MKLDNNGemmNode::createPrimitive() {
+ auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
+ auto& src0MemPtr = getParentEdgeAt(0)->getMemoryPtr();
+ auto& src1MemPtr = getParentEdgeAt(1)->getMemoryPtr();
+ if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
+ THROW_IE_EXCEPTION << "Destination memory isn't allocated.";
+ if (!src0MemPtr || !src0MemPtr->GetPrimitivePtr() || !src1MemPtr || !src1MemPtr->GetPrimitivePtr())
+ THROW_IE_EXCEPTION << "Input memory isn't allocated.";
+ if (getSelectedPrimitiveDescriptor() == nullptr)
+ THROW_IE_EXCEPTION << "Preferable primitive descriptor isn't set.";
+
+ if (isThreeInputs) {
+ auto& src2MemPtr = getParentEdgeAt(2)->getMemoryPtr();
+ if (!src2MemPtr || !src2MemPtr->GetPrimitivePtr())
+ THROW_IE_EXCEPTION << "Input memory isn't allocated.";
+ }
+}
+
+void MKLDNNGemmNode::execute(mkldnn::stream strm) {
+ auto inDims0 = getParentEdgeAt(0)->getDims();
+ auto inDims1 = getParentEdgeAt(1)->getDims();
+ auto outDims = getChildEdgeAt(0)->getDims();
+
+ auto& srcMemory0 = getParentEdgeAt(0)->getMemory();
+ auto& srcMemory1 = getParentEdgeAt(1)->getMemory();
+ const float *src0_ptr = reinterpret_cast<const float*>(srcMemory0.GetData()) +
+ srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding;
+ const float *src1_ptr = reinterpret_cast<const float*>(srcMemory1.GetData()) +
+ srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding;
+ float *dst_ptr = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) +
+ getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
+
+ int MB1 = outDims.ndims() == 4 ? batchToProcess() : 1;
+ int MB2 = outDims.ndims() == 3 ? batchToProcess() : outDims.ndims() > 3 ? outDims[outDims.ndims() - 3] : 1;
+ int M = inDims0[yAxis];
+ int N = inDims1[xAxis];
+ int K = inDims0[xAxis];
+
+ const char transa = transposeA ? 'T' : 'N';
+ const char transb = transposeB ? 'T' : 'N';
+
+ int lda = transposeA ? M : K;
+ int ldb = transposeB ? K : N;
+ int ldc = N;
+
+ const float *src2_ptr;
+ if (isThreeInputs) {
+ auto& srcMemory2 = getParentEdgeAt(2)->getMemory();
+ src2_ptr = reinterpret_cast<const float *>(srcMemory2.GetData()) +
+ srcMemory2.GetDescriptor().data.layout_desc.blocking.offset_padding;
+ } else {
+ src2_ptr = dst_ptr;
+ }
+
+ if (!isThreeInputs) {
+ beta = 0.f;
+ }
+
+ for (int b1 = 0; b1 < MB1; b1++) {
+ const float *a_ptr = src0_ptr;
+ const float *b_ptr = src1_ptr;
+ const float *c_ptr = src2_ptr;
+ float *d_ptr = dst_ptr;
+
+ for (int b2 = 0; b2 < MB2; b2++) {
+ if (isThreeInputs) {
+ memcpy(d_ptr, c_ptr, M * N * sizeof(float));
+ c_ptr += cOffsets[0];
+ }
+
+ mkldnn_sgemm(&transb, &transa, &N, &M, &K, &alpha, b_ptr, &ldb, a_ptr, &lda, &beta, d_ptr, &ldc);
+
+ a_ptr += aOffsets[0];
+ b_ptr += bOffsets[0];
+ d_ptr += M * N;
+ }
+
+ src0_ptr += aOffsets[1];
+ src1_ptr += bOffsets[1];
+ dst_ptr += MB2 * M * N;
+
+ if (isThreeInputs) {
+ src2_ptr += cOffsets[1];
+ }
+ }
+}
+
+bool MKLDNNGemmNode::created() const {
+ return getType() == Gemm;
+}
+
+int MKLDNNGemmNode::getMaxBatch() {
+ if (!outDims.empty())
+ return outDims[0][0];
+ return 0;
+}
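execute() above is a batched GEMM: the two outer loops walk the broadcastable batch dims, and aOffsets/bOffsets/cOffsets hold either the slice stride or 0 when the corresponding input has a batch dim of 1, which is how getSupportedDescriptors() computed them. The mkldnn_sgemm call is column-major, so the row-major product D = alpha*op(A)*op(B) + beta*D is obtained as D^T = op(B)^T * op(A)^T, i.e. by swapping the A/B arguments together with M and N; when there is no third input, beta is forced to 0 so D is simply overwritten, otherwise C is copied into D first and scaled by beta. For checking only, a plain reference of what a single non-transposed call computes (a sketch, not the plugin's implementation):

    // Sketch: reference for one plain (non-transposed) call, row-major storage.
    // D[M x N] = alpha * A[M x K] * B[K x N] + beta * D[M x N]
    void gemmRef(int M, int N, int K, float alpha,
                 const float* A, const float* B, float beta, float* D) {
        for (int m = 0; m < M; ++m)
            for (int n = 0; n < N; ++n) {
                float acc = 0.f;
                for (int k = 0; k < K; ++k)
                    acc += A[m * K + k] * B[k * N + n];
                D[m * N + n] = alpha * acc + beta * D[m * N + n];
            }
    }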
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h
new file mode 100644
index 000000000..da171a0da
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h
@@ -0,0 +1,44 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_common.h>
+#include <mkldnn_node.h>
+#include <string>
+#include <vector>
+
+namespace MKLDNNPlugin {
+
+class MKLDNNGemmNode : public MKLDNNNode {
+public:
+ MKLDNNGemmNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng);
+ ~MKLDNNGemmNode() override = default;
+
+ void getSupportedDescriptors() override;
+ void initSupportedPrimitiveDescriptors() override;
+ void createPrimitive() override;
+ void execute(mkldnn::stream strm) override;
+ bool created() const override;
+ int getMaxBatch() override;
+
+private:
+ static Register<MKLDNNGemmNode> reg;
+ float alpha;
+ float beta;
+ bool transposeA;
+ bool transposeB;
+
+ int xAxis;
+ int yAxis;
+
+ bool isThreeInputs;
+
+ std::vector<int> aOffsets;
+ std::vector<int> bOffsets;
+ std::vector<int> cOffsets;
+};
+
+} // namespace MKLDNNPlugin
+
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp
index 04cb400e1..b31b491e1 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -83,8 +82,7 @@ bool MKLDNNGenericNode::created(const MKLDNNExtensionManager::Ptr &extMgr) {
if (getCnnLayer() && extMgr) {
// We should save the extension manager in order to avoid a situation when
// it is destroyed before the extensibility primitives
- extensionManager = extMgr;
- extFactory.reset(extensionManager->CreateExtensionFactory(getCnnLayer()));
+ extFactory.reset(extMgr->CreateExtensionFactory(getCnnLayer()));
if (extFactory)
setType(Generic);
@@ -147,11 +145,6 @@ void MKLDNNGenericNode::execLayer() {
}
}
-MKLDNNGenericNode::~MKLDNNGenericNode() {
- extFactory.reset();
- extensionManager.reset();
-}
-
void MKLDNNGenericNode::initDescriptor(const InferenceEngine::LayerConfig &config) {
InferenceEngine::LayerConfig rightConfig = config;
InferenceEngine::StatusCode rc;
@@ -206,11 +199,3 @@ void MKLDNNGenericNode::initDescriptor(const InferenceEngine::LayerConfig &confi
constant = ConstantType::Const;
}
}
-
-void MKLDNNGenericNode::initOptimalPrimitiveDescriptor() {
- auto descriptor = getSelectedPrimitiveDescriptor();
- if (descriptor != nullptr) {
- auto config = descriptor->getConfig();
- initDescriptor(config);
- }
-}
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h
index 5cc8b0014..7bdd4a0f3 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -17,7 +16,7 @@ namespace MKLDNNPlugin {
class MKLDNNGenericNode : public MKLDNNNode {
public:
MKLDNNGenericNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng);
- ~MKLDNNGenericNode() override;
+ ~MKLDNNGenericNode() = default;
void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
@@ -30,7 +29,6 @@ public:
}
void initDescriptor(const InferenceEngine::LayerConfig& config) override;
- void initOptimalPrimitiveDescriptor() override;
void execLayer();
void cleanup() override;
@@ -42,7 +40,6 @@ protected:
private:
static Register<MKLDNNGenericNode> reg;
- MKLDNNExtensionManager::Ptr extensionManager;
};
} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp
index 0a17a1442..9b42bee6b 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h
index 134ce8f61..99b4c8657 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp
index 32594e315..4b1192b85 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h
index b2a5c1829..9d85dabd3 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp
index b60177cb2..a37a2530b 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp
index 53ab16c39..ebc67748f 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp
index aa395a130..c23ce6ee5 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -69,6 +68,21 @@ void MKLDNNPermuteNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown});
}
+ } else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
+ config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::ncdhw);
+ config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::ncdhw);
+ supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown});
+
+ auto srcDims = getParentEdgeAt(0)->getDims();
+ if (srcDims[1] % 8 == 0) {
+ config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c);
+ supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown});
+ }
+
+ if (srcDims[1] % 16 == 0) {
+ config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c);
+ supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown});
+ }
} else {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::any);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
@@ -221,6 +235,70 @@ static void permute_to_3012(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
}
+static void permute_to_021(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
+ auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
+ auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
+ src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
+ dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
+
+ const int C = srcMemPtr->GetDims()[1];
+ const int S = srcMemPtr->GetDims()[2];
+
+ parallel_for2d(MB, S, [&](int n, int s) {
+ int src_off = 0;
+ int dst_off = 0;
+
+ for (int c = 0; c < C; c++) {
+ src_off = n * C * S +
+ c * S +
+ s;
+ dst_off = n * S * C +
+ s * C +
+ c;
+
+ dst_data[dst_off] = src_data[src_off];
+ }
+ });
+}
+
+static void permute_to_034152(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
+ auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
+ auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
+ src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
+ dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
+
+ const int DIM1 = srcMemPtr->GetDims()[1];
+ const int DIM2 = srcMemPtr->GetDims()[2];
+ const int DIM3 = srcMemPtr->GetDims()[3];
+ const int DIM4 = srcMemPtr->GetDims()[4];
+ const int DIM5 = srcMemPtr->GetDims()[5];
+
+ int src_off = 0;
+ int dst_off = 0;
+
+ for (int n = 0; n < MB; n++) {
+ for (int dim3 = 0; dim3 < DIM3; dim3++) {
+ for (int dim4 = 0; dim4 < DIM4; dim4++) {
+ for (int dim1 = 0; dim1 < DIM1; dim1++) {
+ for (int dim5 = 0; dim5 < DIM5; dim5++) {
+ for (int dim2 = 0; dim2 < DIM2; dim2++) {
+ src_off = n * DIM1 * DIM2 * DIM3 * DIM4 * DIM5 +
+ dim1 * DIM2 * DIM3 * DIM4 * DIM5 +
+ dim2 * DIM3 * DIM4 * DIM5 +
+ dim3 * DIM4 * DIM5 +
+ dim4 * DIM5 +
+ dim5;
+
+ dst_data[dst_off] = src_data[src_off];
+ dst_off++;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
std::map<InferenceEngine::SizeVector, MKLDNNPermuteNode::PermuteImpl> MKLDNNPermuteNode::OptimizedCases = {
{{0, 2, 3, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_0231, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return true;
@@ -237,6 +315,12 @@ std::map<InferenceEngine::SizeVector, MKLDNNPermuteNode::PermuteImpl> MKLDNNPerm
{{0, 2, 1, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_0213, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
})}, // shufflenet
+ {{0, 2, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_021, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
+ return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
+ })}, // self attention block
+ {{0, 3, 4, 1, 5, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_034152, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
+ return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
+ })}, // learning-to-see-in-the-dark-sony
};
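Each entry in OptimizedCases pairs a hand-written kernel, such as permute_to_021 or permute_to_034152 above, with a predicate that decides whether the source memory format allows it; orders without a specialized kernel go through a generic element-by-element path. For reference, a generic dense permute equivalent to what these kernels specialize might look like the sketch below (illustrative, plain row-major layout assumed; this is not the plugin's fallback code):

    #include <cstddef>
    #include <vector>

    // Sketch: generic dense permute; dst dim d takes its index from src dim order[d]
    // (so order {0, 2, 1} swaps the last two dims, as permute_to_021 does).
    void permuteRef(const float* src, float* dst,
                    const std::vector<std::size_t>& srcDims,
                    const std::vector<std::size_t>& order) {
        const std::size_t nd = srcDims.size();
        if (nd == 0) return;
        std::vector<std::size_t> dstDims(nd), srcStrides(nd, 1), idx(nd, 0);
        for (std::size_t d = 0; d < nd; ++d) dstDims[d] = srcDims[order[d]];
        for (std::size_t d = nd - 1; d > 0; --d) srcStrides[d - 1] = srcStrides[d] * srcDims[d];
        std::size_t total = 1;
        for (auto v : dstDims) total *= v;
        for (std::size_t out = 0; out < total; ++out) {
            std::size_t off = 0;
            for (std::size_t d = 0; d < nd; ++d) off += idx[d] * srcStrides[order[d]];
            dst[out] = src[off];
            for (std::size_t d = nd; d-- > 0;) {   // row-major increment of the output index
                if (++idx[d] < dstDims[d]) break;
                idx[d] = 0;
            }
        }
    }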
void MKLDNNPermuteNode::execute(mkldnn::stream strm) {
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h
index 5b69b4475..9c0ce0d49 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp
index 0ec7c0a26..82e3eac50 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -11,6 +10,7 @@
#include <vector>
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
+#include <ie_layers_internal.hpp>
using namespace mkldnn;
using namespace MKLDNNPlugin;
@@ -23,12 +23,8 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
return;
InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
- if (precision != InferenceEngine::Precision::FP32)
- precision = InferenceEngine::Precision::FP32;
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
precision = getCnnLayer()->outData[0]->getPrecision();
- if (precision != InferenceEngine::Precision::FP32)
- precision = InferenceEngine::Precision::FP32;
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
auto * poolingLayer = dynamic_cast<PoolingLayer*>(getCnnLayer().get());
@@ -45,15 +41,16 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
invertVectorCopyUtoI(poolingLayer->_stride, stride);
invertVectorCopyUtoI(poolingLayer->_kernel, kernel);
- invertVectorCopyUtoI(poolingLayer->_padding, paddingL);
- invertVectorCopyUtoI(poolingLayer->_pads_end, paddingR);
+ auto allPads = getPaddings(*poolingLayer);
+ invertVectorCopyUtoI(allPads.begin, paddingL);
+ invertVectorCopyUtoI(allPads.end, paddingR);
auto parentDims = getParentEdgeAt(0)->getDims();
auto childDims = getChildEdgeAt(0)->getDims();
if ((parentDims.ndims() < 4) || (parentDims.ndims() > 5))
THROW_IE_EXCEPTION << "Pooling layer. Unsupported mode. Only 4D and 5D blobs are supported as input.";
- for (int i = 0; i < 2; i++) {
+ for (int i = 0; i < paddingR.size(); i++) {
int krn = kernel[i];
int src = getParentEdgeAt(0)->getDims()[2 + i];
int dst = getChildEdgeAt(0)->getDims()[2 + i];
@@ -61,11 +58,11 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
int calc_dst = (src - krn + paddingL[i]) / stride[i] + 1;
paddingR[i] = (dst - calc_dst) * stride[i];
}
-
if (this->getCnnLayer()->precision == Precision::I8) {
- MKLDNNMemoryDesc in_candidate{parentDims, memory::data_type::u8, memory::format::nhwc};
- MKLDNNMemoryDesc out_candidate{childDims, memory::data_type::u8, memory::format::nhwc};
- createDescriptor({in_candidate}, {out_candidate});
+ // i8 layers support only the nhwc layout
+ MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, memory::format::nhwc};
+ MKLDNNMemoryDesc out_candidate{childDims, outputDataType, memory::format::nhwc};
+ createDescriptor({ in_candidate }, { out_candidate });
} else {
// It doesn't support any format
for (auto format : getAvailableFormatsForDims(parentDims)) {
@@ -97,7 +94,14 @@ void MKLDNNPoolingNode::createDescriptor(const std::vector<InferenceEngine::Tens
algorithm alg;
if (type == PoolingLayer::PoolType::AVG) {
- if (!exclude_pad && (paddingL[0] != 0 || paddingL[1] != 0))
+ bool not_zero_l = false;
+ for (auto lr : paddingL) {
+ if (lr) {
+ not_zero_l = true;
+ break;
+ }
+ }
+ if (!exclude_pad && not_zero_l)
alg = pooling_avg_include_padding;
else
alg = pooling_avg_exclude_padding;
@@ -114,7 +118,14 @@ void MKLDNNPoolingNode::createDescriptor(const std::vector<InferenceEngine::Tens
stride, kernel, paddingL, paddingR,
mkldnn::padding_kind::zero));
- if (alg == pooling_avg_include_padding && (paddingR[0] || paddingR[1])) {
+ bool not_zero_r = false;
+ for (auto pr : paddingR) {
+ if (pr) {
+ not_zero_r = true;
+ break;
+ }
+ }
+ if (alg == pooling_avg_include_padding && not_zero_r) {
// In the case of AVG pooling that includes padding, the normalization coefficient
// should be calculated taking the original pads into account, so we need to restore
// the original values (R_padding = L_padding).
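The loop above recomputes the right-side padding per spatial axis so that the pooling arithmetic reproduces the IR's output size exactly: with calc_dst = (src - krn + paddingL) / stride + 1, the missing right padding is (dst - calc_dst) * stride. A small standalone sketch of that arithmetic, with a worked example:

    // Sketch: right padding needed so a window of size krn with the given stride
    // and left padding produces exactly dst outputs from src inputs.
    int computePaddingR(int src, int dst, int krn, int stride, int paddingL) {
        int calc_dst = (src - krn + paddingL) / stride + 1;
        return (dst - calc_dst) * stride;
    }
    // Example: src = 7, krn = 3, stride = 2, paddingL = 1 -> calc_dst = 3;
    // if the IR expects dst = 4, then paddingR = (4 - 3) * 2 = 2.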
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h
index 0af8a8ae5..e5309f494 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp
index 360f3459c..01ae0e6fd 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h
index 370d694d9..a6fce5cbd 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp
index 3b1678079..345b21536 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -71,6 +70,17 @@ void MKLDNNReorderNode::createPrimitive() {
if (getSelectedPrimitiveDescriptor() == nullptr)
THROW_IE_EXCEPTION << "Preferable primitive descriptor does not set.";
+ createReorderPrimitive(srcMemPtr->GetDescriptor(), srcMemPtr->GetPrimitive().get_data_handle(),
+ dstMemPtr->GetDescriptor(), dstMemPtr->GetPrimitive().get_data_handle());
+}
+
+void MKLDNNReorderNode::createReorderPrimitive(mkldnn::memory::desc srcDesc, void* srcPtr, mkldnn::memory::desc dstDesc, void* dstPtr) {
+ src_blocked = std::make_shared<MKLDNNMemory>(getEngine());
+ src_blocked->Create(srcDesc, srcPtr);
+
+ dst_blocked = std::make_shared<MKLDNNMemory>(getEngine());
+ dst_blocked->Create(dstDesc, dstPtr);
+
mkldnn::primitive_attr attr;
if (_scales) {
@@ -90,52 +100,12 @@ void MKLDNNReorderNode::createPrimitive() {
attr.set_int_output_round_mode(round_nearest);
}
- if (srcMemPtr->GetSize() == dstMemPtr->GetSize()) {
- InferenceEngine::Precision dstPrec = getChildEdgeAt(0)->getDesc().getPrecision();
- InferenceEngine::Precision srcPrec = getParentEdgeAt(0)->getDesc().getPrecision();
-
- if ((srcPrec == InferenceEngine::Precision::I8 && dstPrec == InferenceEngine::Precision::U8)) {
- // This reorder actually does nothing so we declare it in-place.
- dstMemPtr->GetPrimitive().set_data_handle(srcMemPtr->GetPrimitive().get_data_handle());
- } else {
- try {
- // No autoblocking. Reorder can be applied as is
-
- reorder::primitive_desc pd = reorder::primitive_desc(srcMemPtr->GetPrimitiveDescriptor(), dstMemPtr->GetPrimitiveDescriptor(), attr);
- prim.reset(new mkldnn::reorder(srcMemPtr->GetPrimitive(), dstMemPtr->GetPrimitive()));
- } catch (...) {}
- }
- } else {
- // Autoblocking case. nchw<=>nChw8c are only supported, but memory descriptor
- // should be with strides. Prepare it from enlarged blob
- memory::dims dims = srcMemPtr->GetDims();
- memory::dims dims_dst = dstMemPtr->GetDims();
-
- for (int i = 0; i < dims.size(); i++) // min dims is a logical dims
- dims[i] = std::min(dims[i], dims_dst[i]);
-
- memory::desc src_d = srcMemPtr->GetDescriptor();
- void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle();
-
- memory::desc dst_d = dstMemPtr->GetDescriptor();
- void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle();
-
- for (int i = 0; i < dims.size(); i++)
- src_d.data.dims[i] = dst_d.data.dims[i] = dims[i];
-
- src_blocked = std::make_shared<MKLDNNMemory>(getEngine());
- src_blocked->Create(src_d, src_data_hdl);
-
- dst_blocked = std::make_shared<MKLDNNMemory>(getEngine());
- dst_blocked->Create(dst_d, dst_data_hdl);
-
- // output blob should be zeroed. NaN value can occur in untouched place.
- dstMemPtr->FillZero();
-
+ try {
+ // No autoblocking. Reorder can be applied as is
reorder::primitive_desc pd = reorder::primitive_desc(src_blocked->GetPrimitiveDescriptor(), dst_blocked->GetPrimitiveDescriptor(), attr);
prim.reset(new mkldnn::reorder(pd, src_blocked->GetPrimitive(), dst_blocked->GetPrimitive()));
- }
+ } catch (...) {}
}
const std::vector<impl_desc_type>& MKLDNNReorderNode::getPrimitivesPriority() {
@@ -148,32 +118,9 @@ bool MKLDNNReorderNode::created() const {
}
void MKLDNNReorderNode::execute(mkldnn::stream strm) {
- if (prim) {
- if (src_blocked)
- src_blocked->GetPrimitivePtr()->set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
- if (dst_blocked)
- dst_blocked->GetPrimitivePtr()->set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
- MKLDNNNode::execute(strm);
- } else {
- InferenceEngine::Precision dstPrec = getChildEdgeAt(0)->getDesc().getPrecision();
- InferenceEngine::Precision srcPrec = getParentEdgeAt(0)->getDesc().getPrecision();
- if ((srcPrec == InferenceEngine::Precision::I8 && dstPrec == InferenceEngine::Precision::U8)) {
- // Do nothing here
- } else {
- auto srcBlbPtr = getParentEdgeAt(0)->getBlob();
- auto dstBlbPtr = getChildEdgeAt(0)->getBlob();
-
- assert(srcBlbPtr->size() == dstBlbPtr->size());
- int data_size = srcBlbPtr->size();
-
- const auto* src_data = srcBlbPtr->cbuffer().as<const float *>();
- auto* dst_data = dstBlbPtr->buffer().as<float *>();
-
- InferenceEngine::parallel_for(data_size, [&](int i) {
- dst_data[dstBlbPtr->getTensorDesc().offset(i)] = src_data[srcBlbPtr->getTensorDesc().offset(i)];
- });
- }
- }
+ src_blocked->GetPrimitivePtr()->set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
+ dst_blocked->GetPrimitivePtr()->set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
+ MKLDNNNode::execute(strm);
}
void MKLDNNReorderNode::setDynamicBatchLim(int lim) {
@@ -186,21 +133,12 @@ void MKLDNNReorderNode::setDynamicBatchLim(int lim) {
void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle();
void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle();
- if (src_blocked && dst_blocked) {
- src_d = src_blocked->GetDescriptor();
- dst_d = dst_blocked->GetDescriptor();
- src_data_hdl = src_blocked->GetPrimitive().get_data_handle();
- dst_data_hdl = dst_blocked->GetPrimitive().get_data_handle();
- }
- src_blocked = std::make_shared<MKLDNNMemory>(getEngine());
src_d.data.dims[0] = batchToProcess();
src_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess();
- src_blocked->Create(src_d, src_data_hdl);
- dst_blocked = std::make_shared<MKLDNNMemory>(getEngine());
dst_d.data.dims[0] = batchToProcess();
dst_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess();
- dst_blocked->Create(dst_d, dst_data_hdl);
- prim.reset(new mkldnn::reorder(src_blocked->GetPrimitive(), dst_blocked->GetPrimitive()));
+
+ createReorderPrimitive(src_d, src_data_hdl, dst_d, dst_data_hdl);
}
}
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h
index 3d74c2000..7a228ecec 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -51,6 +50,8 @@ private:
MKLDNNMemoryPtr dst_blocked;
MKLDNNMemoryPtr src_blocked;
+
+ void createReorderPrimitive(mkldnn::memory::desc srcDesc, void* srcPtr, mkldnn::memory::desc dstDesc, void* dstPtr);
};
} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp
index cfd51bf36..d959aa5f9 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -49,15 +48,6 @@ void MKLDNNReshapeNode::initSupportedPrimitiveDescriptors() {
config.outConfs[0].constant = false;
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
- if (inDims.ndims() == 4 && inDims[1] % 8 == 0 && outDims.ndims() == 4 &&outDims[1] % 8 == 0) {
- outFormat = memory::format::any;
- }
- config.inConfs[0].inPlace = -1;
- config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format::any);
- config.outConfs[0].inPlace = -1;
- config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat);
-
- supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
}
void MKLDNNReshapeNode::createPrimitive() {
@@ -69,107 +59,6 @@ void MKLDNNReshapeNode::createPrimitive() {
THROW_IE_EXCEPTION << "Input memory didn't allocate.";
if (getSelectedPrimitiveDescriptor() == nullptr)
THROW_IE_EXCEPTION << "Preferable primitive descriptor does not set.";
-
- if (srcMemPtr->GetData() != dstMemPtr->GetData()) {
- InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
- if (precision != InferenceEngine::Precision::FP32)
- precision = InferenceEngine::Precision::FP32;
- auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
- precision = getCnnLayer()->outData[0]->getPrecision();
- if (precision != InferenceEngine::Precision::FP32)
- precision = InferenceEngine::Precision::FP32;
- auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
-
- auto dims = getParentEdgeAt(0)->getDims();
-
- srcMem.reset(new MKLDNNMemory(getEngine()));
- srcMem->Create(dims, inputDataType, MKLDNNMemory::GetPlainFormat(dims));
-
- dstMem.reset(new MKLDNNMemory(getEngine()));
- dstMem->Create(getChildEdgeAt(0)->getDims(), outputDataType,
- MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims()), srcMem->GetData());
-
- if (srcMemPtr->GetSize() == srcMem->GetSize()) {
- srcPrim.reset(new mkldnn::reorder(srcMemPtr->GetPrimitive(), srcMem->GetPrimitive()));
- } else {
- // Autoblocking mode
- memory::dims dims = srcMem->GetDims(); // contains logical dims
-
- memory::desc src_d = srcMemPtr->GetDescriptor();
- void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle();
-
- for (int i = 0; i < dims.size(); i++)
- src_d.data.dims[i] = dims[i];
-
- memory::primitive_desc tmp_src_pd(src_d, getEngine());
- src_blocked = std::make_shared<MKLDNNMemory>(getEngine());
- src_blocked->Create(src_d, src_data_hdl);
-
- srcPrim.reset(new mkldnn::reorder(src_blocked->GetPrimitive(), srcMem->GetPrimitive()));
- }
-
- if (dstMemPtr->GetSize() == dstMem->GetSize()) {
- dstPrim.reset(new mkldnn::reorder(dstMem->GetPrimitive(), dstMemPtr->GetPrimitive()));
- } else {
- // Autoblocking mode
- memory::dims dims = srcMem->GetDims();
-
- memory::desc dst_d = dstMemPtr->GetDescriptor();
- void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle();
-
- for (int i = 0; i < dims.size(); i++)
- dst_d.data.dims[i] = dims[i];
-
- dst_blocked = std::make_shared<MKLDNNMemory>(getEngine());
- dst_blocked->Create(dst_d, dst_data_hdl);
-
- dstPrim.reset(new mkldnn::reorder(dst_blocked->GetPrimitive(), dstMemPtr->GetPrimitive()));
- }
- }
-}
-
-void MKLDNNReshapeNode::setDynamicBatchLim(int lim) {
- dynBatchLim = lim;
- if (srcPrim && dstPrim) {
- auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
- auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
- memory::desc src_d = srcMemPtr->GetDescriptor();
- memory::desc dst_d = dstMemPtr->GetDescriptor();
- void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle();
- void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle();
- srcMem = std::make_shared<MKLDNNMemory>(getEngine());
- src_d.data.dims[0] = batchToProcess();
- srcMem->Create(src_d, src_data_hdl);
- dstMemPtr = std::make_shared<MKLDNNMemory>(getEngine());
- src_d.data.dims[0] = batchToProcess();
- dstMemPtr->Create(src_d, src_data_hdl);
-
- if (src_blocked && dst_blocked) {
- src_d = src_blocked->GetDescriptor();
- dst_d = dst_blocked->GetDescriptor();
- src_data_hdl = src_blocked->GetPrimitive().get_data_handle();
- dst_data_hdl = dst_blocked->GetPrimitive().get_data_handle();
- }
- src_blocked = std::make_shared<MKLDNNMemory>(getEngine());
- src_d.data.dims[0] = batchToProcess();
- src_blocked->Create(src_d, src_data_hdl);
-
- dst_blocked = std::make_shared<MKLDNNMemory>(getEngine());
- dst_d.data.dims[0] = batchToProcess();
- dst_blocked->Create(dst_d, dst_data_hdl);
- srcPrim = std::make_shared<mkldnn::reorder>(src_blocked->GetPrimitive(), srcMem->GetPrimitive());
- dstPrim = std::make_shared<mkldnn::reorder>(dst_blocked->GetPrimitive(), dstMemPtr->GetPrimitive());
- }
-}
-
-void MKLDNNReshapeNode::execute(mkldnn::stream strm) {
- if (srcPrim && dstPrim) {
- if (src_blocked)
- src_blocked->GetPrimitive().set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
- if (dst_blocked)
- dst_blocked->GetPrimitive().set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
- strm.submit({*srcPrim, *dstPrim});
- }
}
bool MKLDNNReshapeNode::created() const {
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h
index eeb666008..bb30099c9 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -21,19 +20,10 @@ public:
void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void createPrimitive() override;
- void execute(mkldnn::stream strm) override;
bool created() const override;
- void setDynamicBatchLim(int lim) override;
private:
static Register<MKLDNNReshapeNode> reg;
- std::shared_ptr<mkldnn::primitive> srcPrim;
- std::shared_ptr<mkldnn::primitive> dstPrim;
- MKLDNNMemoryPtr srcMem;
- MKLDNNMemoryPtr dstMem;
-
- MKLDNNMemoryPtr dst_blocked;
- MKLDNNMemoryPtr src_blocked;
};
} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp
index a474ca926..ba3228543 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp
@@ -1,12 +1,11 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
#include "mkldnn_rnn.h"
#include "mkldnn_extension_utils.h"
#include "desc_iterator.hpp"
-#include <ie_layers.h>
+#include <ie_layers_prv.h>
#include <string>
#include <utility>
@@ -16,39 +15,143 @@ using namespace InferenceEngine;
namespace MKLDNNPlugin {
-MKLDNNRNN::MKLDNNRNN(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {}
+template <typename T, typename P>
+inline bool one_of(T val, P item) { return val == item; }
+template <typename T, typename P, typename... Args>
+inline bool one_of(T val, P item, Args... item_others) {
+ return val == item || one_of(val, item_others...);
+}
+
+rnn_direction ie2mkl(RNNLayer::Direction &direction) {
+ return direction == RNNLayer::RNN_FWD ? unidirectional_left2right
+ : direction == RNNLayer::RNN_BWD ? unidirectional_right2left
+ : direction == RNNLayer::RNN_BDR ? bidirectional_concat
+ : unidirectional;
+}
+
+MKLDNNRNN::MKLDNNRNN(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {
+ is_cell = layer->type == "LSTMCell";
+}
bool MKLDNNRNN::created() const {
- return getType() == RNN;
+ return getType() == (is_cell ? LSTMCell : RNN);
}
void MKLDNNRNN::getSupportedDescriptors() {
+ if (is_cell)
+ fillCellDesc();
+ else
+ fillSeqDesc();
+}
+
+void MKLDNNRNN::fillCellDesc() {
+ if (!descs.empty()) return;
+ auto cellLayer = std::dynamic_pointer_cast<InferenceEngine::LSTMCell>(getCnnLayer());
+
+ if (!cellLayer)
+ THROW_IE_EXCEPTION << "Wrong RNN layer representation. Cannot cast to RNNLayer.";
+
+ auto &ins = cellLayer->insData;
+ auto &outs = cellLayer->outData;
+
+ if (ins.size() != 3)
+ THROW_IE_EXCEPTION << "Incorrect number of input ports for layer " << getName();
+ if (outs.size() != 2)
+ THROW_IE_EXCEPTION << "Incorrect number of output ports for layer " << getName();
+
+ auto in_data_dims = getParentEdgeAt(0)->getDims();
+ auto in_h_state_dims = getParentEdgeAt(1)->getDims();
+ auto in_c_state_dims = getParentEdgeAt(2)->getDims();
+
+ auto out_h_state_dims = getChildEdgeAt(0)->getDims();
+ auto out_c_state_dims = getChildEdgeAt(1)->getDims();
+
+ if (in_data_dims.ndims() != 2
+ || in_h_state_dims.ndims() != 2
+ || in_c_state_dims.ndims() != 2
+ || out_h_state_dims.ndims() != 2
+ || out_c_state_dims.ndims() != 2)
+ THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
+
+ T = 1;
+ N = in_data_dims[0];
+ DC = in_data_dims[1];
+ SC = in_h_state_dims[1];
+
+ // Expected shapes
+ MKLDNNDims D_shape {N, DC}, S_shape {N, SC};
+
+ if (in_data_dims != D_shape
+ || in_h_state_dims != S_shape
+ || in_c_state_dims != S_shape
+ || out_h_state_dims != S_shape
+ || out_c_state_dims != S_shape)
+ THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
+
+ auto blobs = cellLayer->blobs;
+ Blob::Ptr weights, bias;
+ if (blobs.find("weights") != blobs.end()) weights = blobs["weights"];
+ if (blobs.find("biases") != blobs.end()) bias = blobs["biases"];
+
+ if (!weights)
+ THROW_IE_EXCEPTION << "RNN Layer. Weights do not present.";
+
+ if (weights->size() != G*SC*(SC+DC))
+ THROW_IE_EXCEPTION << "RNN Layer. Weights size is not correct. Expected size:" << G*SC*(SC+DC);
+
+ if (bias && bias->size() != G*SC)
+ THROW_IE_EXCEPTION << "RNN Layer. Biases size is not correct. Expected size:" << G*SC;
+
+ // Shapes and Attributes are correct. Can start internal stuff initialization.
+
+ in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
+ out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
+
+ in_data_d = {{T, N, DC}, memory::f32, memory::tnc};
+ out_data_d = {{T, N, SC}, memory::f32, memory::tnc};
+
+ w_data_d = {{L, D, DC, G, SC}, memory::f32, memory::ldigo};
+ w_state_d = {{L, D, SC, G, SC}, memory::f32, memory::ldigo};
+
+ if (bias)
+ w_bias_d = {{L, D, G, SC}, memory::f32, memory::ldgo};
+
+ std::vector<TensorDesc> in_candidate;
+ in_candidate.emplace_back(MKLDNNMemoryDesc {D_shape, memory::f32, memory::nc});
+ in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+ in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+
+ std::vector<TensorDesc> out_candidate;
+ out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+ out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+
+ createDescriptor(in_candidate, out_candidate);
+}
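The size checks above follow from the LSTM parameter count: with G = 4 gates, the fused weights hold G*SC*(DC+SC) values (input-to-hidden plus hidden-to-hidden) and the biases hold G*SC. A tiny sketch with a worked example (values chosen for illustration, not taken from the code):

    #include <cstddef>

    // Sketch: expected LSTM blob sizes with G = 4 gates.
    std::size_t lstmWeightsSize(std::size_t DC, std::size_t SC) { return 4 * SC * (DC + SC); }
    std::size_t lstmBiasSize(std::size_t SC) { return 4 * SC; }
    // e.g. DC = 16, SC = 32  ->  weights = 4 * 32 * 48 = 6144, biases = 128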
+
+void MKLDNNRNN::fillSeqDesc() {
if (!descs.empty()) return;
auto rnnLayer = std::dynamic_pointer_cast<RNNLayer>(getCnnLayer());
if (!rnnLayer)
THROW_IE_EXCEPTION << "Wrong RNN layer representation. Cannot cast to RNNLayer.";
- if (rnnLayer->cellType == LSTM)
- cellr_type = LSTM;
- else
+ if (!one_of(rnnLayer->cellType, "LSTM"))
THROW_IE_EXCEPTION << "RNN layer supports only LSTM like cell";
- swap_state = rnnLayer->params["swap_state"] == "YES";
+ if (!one_of(rnnLayer->axis, 0, 1))
+ THROW_IE_EXCEPTION << "RNN layer supports only sequence axis 0 or 1";
+ nativeOrder = rnnLayer->axis == 0;
- if (rnnLayer->_axis == 0)
- nativeOrder = true;
- else if (rnnLayer->_axis == 1)
- nativeOrder = false;
- else
- THROW_IE_EXCEPTION << "RNN layer supports only sequence axis == 1";
+ if (!one_of(rnnLayer->direction, RNNLayer::RNN_FWD, RNNLayer::RNN_BWD))
+ THROW_IE_EXCEPTION << "RNN layer supports only unidirectional RNN layer";
+ direction = ie2mkl(rnnLayer->direction);
auto &ins = rnnLayer->insData;
auto &outs = rnnLayer->outData;
- if (ins.size() != 3 && ins.size() != 1)
+ if (!one_of(ins.size(), 3, 1))
THROW_IE_EXCEPTION << "Incorrect number of input ports for layer " << getName();
- if (outs.size() != 3 && outs.size() !=1)
+ if (!one_of(outs.size(), 3, 1))
THROW_IE_EXCEPTION << "Incorrect number of output ports for layer " << getName();
auto in_data_dims = getParentEdgeAt(0)->getDims();
@@ -62,31 +165,21 @@ void MKLDNNRNN::getSupportedDescriptors() {
std::swap(out_data_dims[0], out_data_dims[1]);
}
- // IE specific order
- seq = in_data_dims[0];
- batch = in_data_dims[1];
- data_len = in_data_dims[2];
- state_len = out_data_dims[2];
-
- const int N = batch;
- const int T = seq;
- const int G = num_gates;
- const int DC = data_len;
- const int SC = state_len;
- const int L = 1; // What is a L ??
- const int D = 1;
- const int S = 2;
-
- if (out_data_dims != MKLDNNDims {T, N, SC})
- THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
+ T = in_data_dims[0];
+ N = in_data_dims[1];
+ DC = in_data_dims[2];
+ SC = out_data_dims[2];
- MKLDNNDims state_dims {batch, state_len};
+ MKLDNNDims ID_shape {T, N, DC}, OD_shape {T, N, SC}, S_shape {N, SC};
+
+ if (out_data_dims != OD_shape)
+ THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
if (ins.size() == 3) {
auto state_dims1 = getParentEdgeAt(1)->getDims();
auto stats_dims2 = getParentEdgeAt(2)->getDims();
- if (state_dims1 != state_dims || stats_dims2 != state_dims)
+ if (state_dims1 != S_shape || stats_dims2 != S_shape)
THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
@@ -96,7 +189,7 @@ void MKLDNNRNN::getSupportedDescriptors() {
auto state_dims1 = getChildEdgeAt(1)->getDims();
auto stats_dims2 = getChildEdgeAt(2)->getDims();
- if (state_dims1 != state_dims || stats_dims2 != state_dims)
+ if (state_dims1 != S_shape || stats_dims2 != S_shape)
THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
@@ -133,8 +226,8 @@ void MKLDNNRNN::getSupportedDescriptors() {
in_candidate.push_back(MKLDNNMemoryDesc{{N, T, DC}, memory::f32, memory::ntc});
if (ins.size() == 3) {
- in_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc});
- in_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc});
+ in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+ in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
}
std::vector<TensorDesc> out_candidate;
@@ -144,8 +237,8 @@ void MKLDNNRNN::getSupportedDescriptors() {
out_candidate.push_back(MKLDNNMemoryDesc{{N, T, SC}, memory::f32, memory::ntc});
if (outs.size() == 3) {
- out_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc});
- out_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc});
+ out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+ out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
}
createDescriptor(in_candidate, out_candidate);
@@ -156,7 +249,7 @@ void MKLDNNRNN::createDescriptor(const std::vector<TensorDesc> &inputDesc,
MKLDNNDescriptor desc(std::shared_ptr<rnn_forward::desc>(
new rnn_forward::desc(forward_scoring,
{algorithm::vanilla_lstm, algorithm::eltwise_tanh },
- unidirectional,
+ direction,
/* In Data */ in_data_d,
/* In State */ in_state_d,
/* Weights data */ w_data_d,
@@ -194,13 +287,8 @@ void MKLDNNRNN::createPrimitive() {
std::shared_ptr<rnn_forward::desc> d = descs[0];
rnn_forward::primitive_desc pd(*d, getEngine());
- auto src_data_mem = std::make_shared<MKLDNNMemory>(getEngine());
- src_data_mem->Create(in_data_d, getParentEdgeAt(0)->getMemoryPtr()->GetData());
- internalBlobMemory.push_back(src_data_mem);
-
- auto dst_data_mem = std::make_shared<MKLDNNMemory>(getEngine());
- dst_data_mem->Create(out_data_d, getChildEdgeAt(0)->getMemoryPtr()->GetData());
- internalBlobMemory.push_back(dst_data_mem);
+ auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
+ auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr();
// create weight blobs (data and state part)
auto w_data_mem = std::make_shared<MKLDNNMemory>(getEngine());
@@ -229,28 +317,27 @@ void MKLDNNRNN::createPrimitive() {
*
* Gate order
* Caffe - IFOC, ONNX - IOFC
- * IE - FICO, mkldnn - FIOC
- *
+ * IE - FICO, mkldnn - IFCO
*/
- // FICO -> FIOC
- const int gate_map[] = {0, 1, 3, 2};
+ // FICO -> IFCO
+ const int gate_map[] = {1, 0, 2, 3};
auto ie_w_ptr = getCnnLayer()->blobs["weights"]->buffer().as<const float*>();
auto w_ptr = static_cast<float*>(w_data_mem->GetData());
auto r_ptr = static_cast<float*>(w_state_mem->GetData());
- const int step = state_len * num_gates;
+ const int step = SC * G;
- for (int g = 0; g < num_gates; g++) {
- for (int out_i = 0; out_i < state_len; out_i++) {
- float *l_w_ptr = w_ptr + gate_map[g]*state_len + out_i;
- float *l_r_ptr = r_ptr + gate_map[g]*state_len + out_i;
- for (int in_i = 0; in_i < data_len; in_i++) {
+ for (int g = 0; g < G; g++) {
+ for (int out_i = 0; out_i < SC; out_i++) {
+ float *l_w_ptr = w_ptr + gate_map[g]*SC + out_i;
+ float *l_r_ptr = r_ptr + gate_map[g]*SC + out_i;
+ for (int in_i = 0; in_i < DC; in_i++) {
*l_w_ptr = *ie_w_ptr;
ie_w_ptr++;
l_w_ptr += step;
}
- for (int in_i = 0; in_i < state_len; in_i++) {
+ for (int in_i = 0; in_i < SC; in_i++) {
*l_r_ptr = *ie_w_ptr;
ie_w_ptr++;
l_r_ptr += step;
@@ -261,9 +348,9 @@ void MKLDNNRNN::createPrimitive() {
if (w_bias_d) {
auto ie_b_ptr = getCnnLayer()->blobs["biases"]->buffer().as<const float*>();
auto b_ptr = static_cast<float*>(w_bias_mem->GetData());
- for (int g = 0; g < num_gates; g++) {
- float *l_b_ptr = b_ptr + gate_map[g]*state_len;
- for (int out_i = 0; out_i < state_len; out_i++) {
+ for (int g = 0; g < G; g++) {
+ float *l_b_ptr = b_ptr + gate_map[g]*SC;
+ for (int out_i = 0; out_i < SC; out_i++) {
*l_b_ptr = *ie_b_ptr;
ie_b_ptr++;
l_b_ptr++;
@@ -293,37 +380,35 @@ void MKLDNNRNN::createPrimitive() {
src_stat_1.get_primitive_desc().get_size());
internalBlobMemory.push_back(high_half_state_mem);
- if (!swap_state) {
- exec_before.emplace_back(src_stat_1, low_half_state_mem->GetPrimitive());
- exec_before.emplace_back(src_stat_2, high_half_state_mem->GetPrimitive());
- } else {
- exec_before.emplace_back(src_stat_2, low_half_state_mem->GetPrimitive());
- exec_before.emplace_back(src_stat_1, high_half_state_mem->GetPrimitive());
- }
+ exec_before.emplace_back(src_stat_1, low_half_state_mem->GetPrimitive());
+ exec_before.emplace_back(src_stat_2, high_half_state_mem->GetPrimitive());
}
auto dst_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
dst_state_mem->Create(out_state_d);
internalBlobMemory.push_back(dst_state_mem);
if (out_state_d) {
+ int idx_H = is_cell ? 0 : 1;
+ int idx_C = is_cell ? 1 : 2;
/* create copy/split primitive */
- auto dst_stat_1 = getChildEdgeAt(1)->getMemory().GetPrimitive();
- auto dst_stat_2 = getChildEdgeAt(2)->getMemory().GetPrimitive();
+ auto dst_stat_1 = getChildEdgeAt(idx_H)->getMemory().GetPrimitive();
+ auto dst_stat_2 = getChildEdgeAt(idx_C)->getMemory().GetPrimitive();
auto low_half_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
low_half_state_mem->Create(
dst_stat_1.get_primitive_desc().desc(),
- src_state_mem->GetPrimitive().get_data_handle());
+ dst_state_mem->GetPrimitive().get_data_handle());
internalBlobMemory.push_back(low_half_state_mem);
auto high_half_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
high_half_state_mem->Create(
dst_stat_2.get_primitive_desc().desc(),
- static_cast<uint8_t*>(src_state_mem->GetPrimitive().get_data_handle()) +
+ static_cast<uint8_t*>(dst_state_mem->GetPrimitive().get_data_handle()) +
dst_stat_1.get_primitive_desc().get_size());
internalBlobMemory.push_back(high_half_state_mem);
- exec_after.emplace_back(low_half_state_mem->GetPrimitive(), dst_stat_1);
+
+ if (!is_cell) exec_after.emplace_back(low_half_state_mem->GetPrimitive(), dst_stat_1);
exec_after.emplace_back(high_half_state_mem->GetPrimitive(), dst_stat_2);
}
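A minimal sketch of what the extra reorders around the primitive do, using plain memcpy and hypothetical sizes instead of the mkldnn reorder primitives: the hidden and cell state inputs are packed as the two halves of a single {S, N, SC} state tensor before execution, and the concatenated output state is split back into the H and C blobs afterwards (in Cell mode only the C half gets the extra output copy, as in the diff above).

    #include <cstring>
    #include <vector>

    int main() {
        const int N = 2, SC = 4;                         // hypothetical batch / state channels
        const size_t half = static_cast<size_t>(N) * SC; // one state plane, N x SC

        std::vector<float> h_in(half, 1.f), c_in(half, 2.f);
        std::vector<float> packed(2 * half);             // {S = 2, N, SC} state tensor

        // exec_before: pack H into the low half and C into the high half.
        std::memcpy(packed.data(),        h_in.data(), half * sizeof(float));
        std::memcpy(packed.data() + half, c_in.data(), half * sizeof(float));

        // ... the RNN primitive would run here and overwrite `packed` ...

        // exec_after: split the output state back into the H and C blobs.
        std::vector<float> h_out(half), c_out(half);
        std::memcpy(h_out.data(), packed.data(),        half * sizeof(float));
        std::memcpy(c_out.data(), packed.data() + half, half * sizeof(float));
        return 0;
    }
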
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h
index a47fdf41c..4399c306a 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -28,18 +27,30 @@ public:
void execute(mkldnn::stream strm) override;
private:
+ void fillCellDesc();
+ void fillSeqDesc();
+
+private:
static Register<MKLDNNRNN> reg;
- InferenceEngine::CellType cellr_type = InferenceEngine::CellType::LSTM;
+ /** Specifies the mode: true - Cell, false - Seq */
+ bool is_cell = false;
+
/** Native order is [batch, seq, data], otherwise [seq, batch, data] */
bool nativeOrder = true;
- bool swap_state = false;
- int batch = 0;
- int seq = 0;
- int data_len = 0;
- int state_len = 0;
- const size_t num_gates = 4;
+ /** Direction of iteration through sequence dimension */
+ mkldnn::rnn_direction direction = mkldnn::unidirectional;
+
+ // Internal attributes
+ int N = 0; /**< Batch size */
+ int T = 0; /**< Sequence length */
+ int DC = 0; /**< Input data channel size */
+ int SC = 0; /**< State channel size */
+ const int G = 4; /**< Number of gates. 4 for LSTM */
+ const int L = 1; /**< Number of layers. Constant for the mkldnn impl */
+ const int D = 1; /**< Number of directions. 1 or 2 */
+ const int S = 2; /**< Number of states. 2 for LSTM (hidden and cell state). */
MKLDNNMemoryDesc in_data_d;
MKLDNNMemoryDesc out_data_d;
@@ -51,6 +62,7 @@ private:
MKLDNNMemoryDesc w_state_d;
MKLDNNMemoryDesc w_bias_d;
+ // List of in/out reorders if required
std::vector<mkldnn::reorder> exec_before;
std::vector<mkldnn::reorder> exec_after;
};
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp
index 7d76243f9..4088a1f7a 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h
index 401a1c7d3..ca2bafd4f 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp
index 0738f0054..752172733 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h
index 792a634c9..8e199f377 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp
index 618479c22..90cf4f401 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -24,16 +23,15 @@ void MKLDNNSplitNode::getSupportedDescriptors() {
if (splitLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot convert split layer.";
- axis = splitLayer->_axis;
-
- if (axis != 1)
- THROW_IE_EXCEPTION << "Split support only axis 1.";
-
if (getParentEdges().size() != 1)
THROW_IE_EXCEPTION << "Incorrect number of input nodes.";
if (getChildEdges().empty())
THROW_IE_EXCEPTION << "Incorrect number of output nodes.";
+ axis = splitLayer->_axis;
+ if (axis >= getParentEdgeAt(0)->getDims().ndims())
+ THROW_IE_EXCEPTION << "Invalid value of axis parameter in split layer";
+
// WA. Check applicability and limitations
for (size_t i = 1; i < getCnnLayer()->outData.size(); i++) {
int num_port_connection = getCnnLayer()->outData[i]->inputTo.size();
@@ -72,7 +70,7 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
if (srcDims.ndims() < 2)
THROW_IE_EXCEPTION << "Split " << getName() << " isn't supported 1d blobs";
- auto num_chanels = 0;
+ auto axis_size = 0;
auto dstFirstDims = getChildEdgeAt(0)->getDims();
for (size_t i = 0; i < outDims.size(); i++) {
auto o_Dims = outDims[i];
@@ -83,15 +81,15 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
config.outConfs[i].inPlace = -1;
config.outConfs[i].constant = false;
config.outConfs[i].desc = MKLDNNMemoryDesc(o_Dims, outputDataType, memory::format::any);
- num_chanels += o_Dims[1];
+ axis_size += o_Dims[axis];
for (size_t j = 0; j < dstFirstDims.ndims(); j++) {
if (j == axis)
continue;
if (o_Dims[j] != dstFirstDims[j])
- THROW_IE_EXCEPTION << "Split " << getName() << "has incorrect output dimensions";
+ THROW_IE_EXCEPTION << "Split " << getName() << " has incorrect output dimensions";
}
}
- dstFirstDims[1] = num_chanels;
+ dstFirstDims[axis] = axis_size;
if (dstFirstDims.size() != srcDims.size())
THROW_IE_EXCEPTION << "The sizes of input blob and sum of output blobs are not equal.";
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
@@ -99,11 +97,10 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
auto numOfDim = static_cast<size_t>(srcDims.ndims());
SizeVector order;
- SizeVector offsets;
+ SizeVector offsets(numOfDim, 0lu);
size_t offset = std::numeric_limits<size_t>::max();
for (size_t i = 0; i < numOfDim; i++) {
order.push_back(i);
- offsets.push_back(0);
}
SizeVector strides(numOfDim);
@@ -125,23 +122,23 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
- if (numOfDim != 4)
+ if ((numOfDim != 4 && numOfDim != 5) || axis != 1)
return;
- order = {0, 1, 2, 3, 1};
- offsets = {0, 0, 0, 0, 0};
- numOfDim = 5;
+ order.push_back(1);
+ numOfDim = order.size();
+ offsets = SizeVector(numOfDim, 0lu);
// nChw8c and nChw16c
- for (int sizeS : {8, 16}) {
+ for (size_t sizeS : {8lu, 16lu}) {
SizeVector blkDims = srcDims.ToSizeVector();
if (blkDims[1] % sizeS)
continue;
- blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0);
+ blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
blkDims.push_back(sizeS);
strides.resize(numOfDim);
- strides[numOfDim - 1] = 1;
+ strides[numOfDim - 1] = 1lu;
for (size_t i = 2; i <= numOfDim; i++) {
if (numOfDim - i < axis) {
strides[numOfDim - i] = std::numeric_limits<size_t>::max();
@@ -160,9 +157,9 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
canInplace = false;
break;
}
- blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0);
+ blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
blkDims.push_back(sizeS);
- config.outConfs[i].desc = TensorDesc(Precision::FP32, outDims, {blkDims, order, offset, offsets, strides});
+ config.outConfs[i].desc = TensorDesc(Precision::FP32, outDims, {blkDims, order, offset, offsets, strides});
}
if (canInplace)
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
@@ -190,18 +187,32 @@ void MKLDNNSplitNode::execute(mkldnn::stream strm) {
int MB = batchToProcess();
auto srcBlob = getParentEdgeAt(0)->getBlob();
const auto *srcData = srcBlob->cbuffer().as<const float *>();
+
+ size_t outerSize = 1;
+ for (int i = 0; i < axis; i++) {
+ if (i == 0)
+ outerSize *= MB;
+ else
+ outerSize *= srcBlob->dims()[srcBlob->dims().size() - i - 1];
+ }
+
size_t srcSize = getParentEdgeAt(0)->getMemory().GetSize();
- size_t src_batch_off = srcBlob->getTensorDesc().offset(srcBlob->size() / srcBlob->getTensorDesc().getDims()[0])
+ size_t src_batch_off = srcBlob->getTensorDesc().offset(srcBlob->size() / outerSize)
- srcBlob->getTensorDesc().offset(0);
for (size_t i = 0, sIdx = 0; i < getChildEdges().size(); i++) {
auto dstBlob = getChildEdgeAt(i)->getBlob();
auto *dstData = dstBlob->buffer().as<float *>();
- size_t dst_slice_size = dstBlob->size() / dstBlob->getTensorDesc().getDims()[0];
- size_t dst_batch_off = dstBlob->getTensorDesc().offset(dst_slice_size) - dstBlob->getTensorDesc().offset(0);
- for (size_t dIdx = 0; dIdx < dst_slice_size; dIdx++, sIdx++) {
- for (unsigned b = 0; b < MB; b++) {
+ size_t innerSize = 1;
+ for (size_t j = axis; j < dstBlob->dims().size(); j++) {
+ innerSize *= dstBlob->dims()[dstBlob->dims().size() - j - 1];
+ }
+
+ size_t dst_batch_off = dstBlob->getTensorDesc().offset(innerSize) - dstBlob->getTensorDesc().offset(0);
+
+ for (size_t dIdx = 0; dIdx < innerSize; dIdx++, sIdx++) {
+ for (unsigned b = 0; b < outerSize; b++) {
if (sIdx + b*src_batch_off >= srcSize)
THROW_IE_EXCEPTION << "Incorrect configuration of split layer " << getName() << "!";
dstData[b * dst_batch_off + dstBlob->getTensorDesc().offset(dIdx)] =
@@ -436,3 +447,13 @@ void MKLDNNSplitNode::initOptimalPrimitiveDescriptor() {
}
initDescriptor(config);
}
+
+void MKLDNNSplitNode::setDynamicBatchLim(int lim) {
+ if (axis == 0)
+ THROW_IE_EXCEPTION << "Dynamic batch is not supported by split layer with axis == 0 parameter";
+
+ dynBatchLim = lim;
+ if (prim) {
+ prim.setBatchLimit(batchToProcess(), getParentEdges().size(), getChildEdges().size());
+ }
+}
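Conceptually, the rewritten execute() treats the input as outerSize rows (the product of the dims before the split axis) and lets each output consume a contiguous innerSize chunk of every row. Below is a minimal dense-layout sketch of that copy with hypothetical shapes; the real code additionally goes through TensorDesc offsets, which is where strided or padded layouts are handled.

    #include <cassert>
    #include <numeric>
    #include <vector>

    // Split `src`, viewed as [outer][sum(parts)] row-major, along its inner axis.
    std::vector<std::vector<float>> split_rows(const std::vector<float> &src,
                                               size_t outer,
                                               const std::vector<size_t> &parts) {
        const size_t total = std::accumulate(parts.begin(), parts.end(), size_t(0));
        assert(src.size() == outer * total);

        std::vector<std::vector<float>> outs;
        size_t col0 = 0;                                // running offset inside a row
        for (size_t inner : parts) {
            std::vector<float> dst(outer * inner);
            for (size_t b = 0; b < outer; ++b)          // outerSize loop
                for (size_t d = 0; d < inner; ++d)      // innerSize loop
                    dst[b * inner + d] = src[b * total + col0 + d];
            outs.push_back(std::move(dst));
            col0 += inner;
        }
        return outs;
    }

    int main() {
        // e.g. a {2, 6} blob split along axis 1 into {2, 2} and {2, 4}.
        std::vector<float> src(12);
        for (size_t i = 0; i < src.size(); ++i) src[i] = static_cast<float>(i);
        auto outs = split_rows(src, 2, {2, 4});
        assert(outs[0].size() == 4 && outs[1].size() == 8);
        return 0;
    }
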
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h
index 7d4157768..905f8069c 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -26,6 +25,8 @@ public:
bool isOptimized();
void initOptimalPrimitiveDescriptor() override;
+ void setDynamicBatchLim(int lim) override;
+
private:
static Register<MKLDNNSplitNode> reg;
size_t axis = 1;
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
index 204ea868d..122671681 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
@@ -49,9 +48,11 @@ void MKLDNNTileNode::initSupportedPrimitiveDescriptors() {
fmt = memory::format::nc;
} else if (inDims.ndims() == 4) {
fmt = memory::format::nchw;
+ } else if (inDims.ndims() == 5) {
+ fmt = memory::format::ncdhw;
}
if (fmt == memory::format::any) {
- THROW_IE_EXCEPTION << "Tile " << getName() << " supports only 2d and 4d dimensions!";
+ THROW_IE_EXCEPTION << "Tile " << getName() << " supports only 2D, 4D and 5D dimensions!";
}
InferenceEngine::LayerConfig config;
@@ -101,14 +102,16 @@ void MKLDNNTileNode::execute(mkldnn::stream strm) {
m_inner_dim *= batchToProcess();
}
- if (m_inner_dim == 1 && inDims.size() == 4 && m_outer_dim%8 == 0 && srcMemory.GetFormat() == memory::nChw8c) {
+ if (m_inner_dim == 1 && m_outer_dim % 8 == 0 && ((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw8c) ||
+ (inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw8c))) {
/*
* We may enable tile processing directly to appropriate output format (nChw8c)
*/
m_inner_dim *= 8;
m_outer_dim /= 8;
- } else if (m_inner_dim == 1 && inDims.size() == 4 && m_outer_dim%16 == 0
- && srcMemory.GetFormat() == memory::nChw16c) {
+ } else if (m_inner_dim == 1 && m_outer_dim % 16 == 0 &&
+ ((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw16c) ||
+ (inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw16c))) {
/*
* We may enable tile processing directly to appropriate output format (nChw16c)
*/
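The tile copy loop itself sits outside this hunk; assuming the usual outer/inner decomposition around the tile axis, the nChw8c/nCdhw8c fast path above amounts to folding the trailing block of 8 (or 16) into the inner dimension so whole blocks are repeated as a unit. A rough, hypothetical sketch:

    #include <cstring>
    #include <vector>

    // Repeat each outer slice `tiles` times; `inner` elements are copied as a unit.
    void tile_copy(const float *src, float *dst, size_t outer, size_t inner, size_t tiles) {
        for (size_t o = 0; o < outer; ++o)
            for (size_t t = 0; t < tiles; ++t, dst += inner)
                std::memcpy(dst, src + o * inner, inner * sizeof(float));
    }

    int main() {
        size_t outer = 16, inner = 1, tiles = 3;  // hypothetical sizes, inner == 1
        // Blocked fast path: fold the nChw8c block into the inner dimension,
        // mirroring m_inner_dim *= 8 / m_outer_dim /= 8 above.
        inner *= 8;
        outer /= 8;
        std::vector<float> src(outer * inner, 1.f), dst(outer * inner * tiles);
        tile_copy(src.data(), dst.data(), outer, inner, tiles);
        return 0;
    }
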
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
index 464c15017..d6a75941f 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/perf_count.h b/inference-engine/src/mkldnn_plugin/perf_count.h
index 87f0c5ffc..3770a2435 100644
--- a/inference-engine/src/mkldnn_plugin/perf_count.h
+++ b/inference-engine/src/mkldnn_plugin/perf_count.h
@@ -1,5 +1,4 @@
// Copyright (C) 2018 Intel Corporation
-//
// SPDX-License-Identifier: Apache-2.0
//
diff --git a/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp b/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
new file mode 100644
index 000000000..24d2931af
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
@@ -0,0 +1,370 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "blob_dump.h"
+#include "blob_factory.hpp"
+#include "mkldnn_memory.h"
+
+// It's so bad to include by relative path :-(
+#include "../../thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp"
+
+#include <fstream>
+
+using namespace InferenceEngine;
+
+namespace MKLDNNPlugin {
+
+// IEB file format routine
+static unsigned char IEB_MAGIC[4] = {'I', 'E', 'B', '0'};
+static unsigned char NO_SCALES = 0xFF;
+
+struct IEB_HEADER {
+ unsigned char magic[4];
+ unsigned char ver[2];
+
+ unsigned char precision; // 0-8
+ unsigned char ndims;
+ unsigned int dims[7]; // max is 7-D blob
+
+ unsigned char scaling_axis; // FF - no scaling
+ unsigned char reserved[3];
+
+ unsigned long data_offset;
+ unsigned long data_size;
+ unsigned long scaling_data_offset;
+ unsigned long scaling_data_size;
+};
+
+static IEB_HEADER prepare_header(const TensorDesc& desc) {
+ IEB_HEADER header;
+
+ header.magic[0] = IEB_MAGIC[0];
+ header.magic[1] = IEB_MAGIC[1];
+ header.magic[2] = IEB_MAGIC[2];
+ header.magic[3] = IEB_MAGIC[3];
+
+ // IEB file format version 0.1
+ header.ver[0] = 0;
+ header.ver[1] = 1;
+
+ header.precision = desc.getPrecision();
+
+ if (desc.getDims().size() > 7)
+ THROW_IE_EXCEPTION << "Dumper support max 7D blobs";
+
+ header.ndims = desc.getDims().size();
+ for (int i = 0; i < header.ndims; i++)
+ header.dims[i] = desc.getDims()[i];
+
+ header.scaling_axis = NO_SCALES;
+
+ return header;
+}
+
+static TensorDesc parse_header(IEB_HEADER &header) {
+ if (header.magic[0] != IEB_MAGIC[0] ||
+ header.magic[1] != IEB_MAGIC[1] ||
+ header.magic[2] != IEB_MAGIC[2] ||
+ header.magic[3] != IEB_MAGIC[3])
+ THROW_IE_EXCEPTION << "Dumper cannot parse file. Wrong format.";
+
+ if (header.ver[0] != 0 ||
+ header.ver[1] != 1)
+ THROW_IE_EXCEPTION << "Dumper cannot parse file. Unsupported IEB format version.";
+
+ Precision prc = Precision(static_cast<Precision::ePrecision>(header.precision));
+ SizeVector dims(header.ndims);
+ for (int i = 0; i < header.ndims; i++)
+ dims[i] = header.dims[i];
+
+ return TensorDesc {prc, dims, plain_layout(dims)};
+}
+
+
+bool is_plain(Blob::Ptr blob) {
+ bool res = true;
+
+ auto orig_strides = blob->getTensorDesc().getBlockingDesc().getStrides();
+ auto orig_order = blob->getTensorDesc().getBlockingDesc().getOrder();
+ auto dims = blob->getTensorDesc().getDims();
+
+ for (int stride = 1, i = dims.size()-1; i >= 0; --i) {
+ if (stride != orig_strides[i] || i != orig_order[i]) res = false;
+ stride *= dims[i];
+ }
+
+ return res;
+}
+
+static Blob::Ptr prepare_plain_data(Blob::Ptr blob) {
+ // check if it is already plain
+ if (is_plain(blob)) return blob;
+
+ Blob::Ptr pln_blob = make_plain_blob(blob->precision(), blob->getTensorDesc().getDims());
+ pln_blob->allocate();
+
+ // Copy to plain
+ MKLDNNMemoryDesc mdesc(blob->getTensorDesc());
+ mkldnn::memory::desc desc = mdesc;
+ mkldnn::impl::memory_desc_wrapper blob_wrp(desc.data);
+
+ int data_size = blob->size();
+
+ // TODO: make it with blob_copy utility
+ switch (blob->precision()) {
+ case Precision::FP32:
+ case Precision::I32: {
+ int32_t *pln_blob_ptr = pln_blob->buffer().as<int32_t*>();
+ int32_t *blob_ptr = blob->buffer().as<int32_t*>();
+ for (size_t i = 0; i < data_size; i++)
+ pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+ break;
+ }
+ case Precision::I16:
+ case Precision::U16: {
+ int16_t *pln_blob_ptr = pln_blob->buffer().as<int16_t*>();
+ int16_t *blob_ptr = blob->buffer().as<int16_t *>();
+ for (size_t i = 0; i < data_size; i++)
+ pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+ break;
+ }
+ case Precision::I8:
+ case Precision::U8: {
+ int8_t *pln_blob_ptr = pln_blob->buffer().as<int8_t*>();
+ int8_t *blob_ptr = blob->buffer().as<int8_t *>();
+ for (size_t i = 0; i < data_size; i++)
+ pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+ break;
+ }
+ default:
+ THROW_IE_EXCEPTION << "Dumper. Unsupported precision";
+ }
+
+ return pln_blob;
+}
+
+void BlobDumper::dump(std::ostream &stream) {
+ if (!_blob)
+ THROW_IE_EXCEPTION << "Dumper cannot dump empty Blob";
+
+ if (_blob->buffer().as<float*>() == nullptr)
+ THROW_IE_EXCEPTION << "Dumper cannot dump. Blob is not allocated.";
+
+ IEB_HEADER header = prepare_header(_blob->getTensorDesc());
+ Blob::Ptr pln_blob = prepare_plain_data(_blob);
+
+ header.data_offset = sizeof(header);
+ header.data_size = pln_blob->byteSize();
+ header.scaling_data_offset = 0;
+ header.scaling_data_size = 0;
+
+ if (_scales) {
+ header.scaling_axis = 1;
+ header.scaling_data_offset = header.data_offset + header.data_size;
+ header.scaling_data_size = _scales->byteSize();
+ }
+
+ stream.write(reinterpret_cast<char*>(&header), sizeof(header));
+ stream.write(pln_blob->buffer().as<char*>(), pln_blob->byteSize());
+
+ if (_scales) {
+ stream.write(_scales->buffer().as<char*>(), _scales->byteSize());
+ }
+}
+
+void BlobDumper::dumpAsTxt(std::ostream &stream) {
+ if (!_blob)
+ THROW_IE_EXCEPTION << "Dumper cannot dump empty Blob";
+
+ if (_blob->buffer().as<float*>() == nullptr)
+ THROW_IE_EXCEPTION << "Dumper cannot dump. Blob is not allocated.";
+
+ SizeVector dims = _blob->getTensorDesc().getDims();
+
+ // Header like "U8 4D shape: 2 3 224 224 ()
+ stream << _blob->precision().name() << " "
+ << dims.size() << "D "
+ << "shape: ";
+ for (size_t d : dims) stream << d << " ";
+ stream << "(" << _blob->size() << ")" <<std::endl;
+
+ // Dump data
+ MKLDNNMemoryDesc mdesc(_blob->getTensorDesc());
+ mkldnn::memory::desc desc = mdesc;
+ mkldnn::impl::memory_desc_wrapper blob_wrp(desc.data);
+
+ int data_size = _blob->size();
+ switch (_blob->precision()) {
+ case Precision::FP32: {
+ auto *blob_ptr = _blob->buffer().as<float*>();
+ for (size_t i = 0; i < data_size; i++)
+ stream << blob_ptr[blob_wrp.off_l(i)] << std::endl;
+ break;
+ }
+ case Precision::I32: {
+ auto *blob_ptr = _blob->buffer().as<int32_t*>();
+ for (size_t i = 0; i < data_size; i++)
+ stream << blob_ptr[blob_wrp.off_l(i)] << std::endl;
+ break;
+ }
+ case Precision::I16: {
+ auto *blob_ptr = _blob->buffer().as<int16_t*>();
+ for (size_t i = 0; i < data_size; i++)
+ stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+ break;
+ }
+ case Precision::U16: {
+ auto *blob_ptr = _blob->buffer().as<uint16_t*>();
+ for (size_t i = 0; i < data_size; i++)
+ stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+ break;
+ }
+ case Precision::I8: {
+ auto *blob_ptr = _blob->buffer().as<int8_t*>();
+ for (size_t i = 0; i < data_size; i++)
+ stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+ break;
+ }
+ case Precision::U8: {
+ auto *blob_ptr = _blob->buffer().as<uint8_t*>();
+ for (size_t i = 0; i < data_size; i++)
+ stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+ break;
+ }
+ default:
+ THROW_IE_EXCEPTION << "Dumper. Unsupported precision";
+ }
+}
+
+BlobDumper BlobDumper::read(std::istream &stream) {
+ IEB_HEADER header;
+ stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+
+ TensorDesc desc = parse_header(header);
+ Blob::Ptr blob = make_blob_with_precision(desc);
+ blob->allocate();
+
+ stream.seekg(header.data_offset, stream.beg);
+ stream.read(blob->buffer().as<char*>(), header.data_size);
+
+ BlobDumper res(blob);
+
+ // Parse scales fields.
+ if (header.scaling_axis != NO_SCALES) {
+ if (header.scaling_axis != 1)
+ THROW_IE_EXCEPTION << "Dumper support scaling only for channel dims.";
+
+ size_t scl_size = header.scaling_data_size / sizeof(float);
+ auto scl = make_blob_with_precision({Precision::FP32, {scl_size}, C});
+ scl->allocate();
+
+ stream.seekg(header.scaling_data_offset, stream.beg);
+ stream.read(scl->buffer().as<char*>(), header.scaling_data_size);
+
+ res._scales = scl;
+ }
+ return res;
+}
+
+BlobDumper BlobDumper::read(const std::string &file_path) {
+ std::ifstream file;
+ file.open(file_path);
+ if (!file.is_open())
+ THROW_IE_EXCEPTION << "Dumper cannot open file " << file_path;
+
+ auto res = read(file);
+ file.close();
+ return res;
+}
+
+void BlobDumper::dump(const std::string &dump_path) {
+ std::ofstream dump_file;
+ dump_file.open(dump_path);
+ if (!dump_file.is_open())
+ THROW_IE_EXCEPTION << "Dumper cannot create dump file";
+
+ dump(dump_file);
+ dump_file.close();
+}
+
+void BlobDumper::dumpAsTxt(const std::string dump_path) {
+ std::ofstream dump_file;
+ dump_file.open(dump_path);
+ if (!dump_file.is_open())
+ THROW_IE_EXCEPTION << "Dumper cannot create dump file";
+
+ dumpAsTxt(dump_file);
+ dump_file.close();
+}
+
+Blob::Ptr BlobDumper::get() {
+ return _blob;
+}
+
+template <typename data_t>
+static void plain_copy(const Blob::Ptr &from, const Blob::Ptr &scls, Blob::Ptr &to) {
+ auto dims = from->getTensorDesc().getDims();
+
+ size_t data_size = from->size();
+ size_t outer_size = dims[0];
+ size_t c_size = dims.size() > 1 ? dims[1] : 1;
+ size_t inner_size = dims.size() == 4 ? dims[2]*dims[3] :
+ dims.size() == 3 ? dims[2] : 1;
+
+ auto to_data = to->buffer().as<float*>();
+ auto from_data = from->buffer().as<data_t*>();
+
+ if (scls) {
+ auto scls_data = scls->buffer().as<float*>();
+
+ for (size_t o=0; o < outer_size; o++)
+ for (size_t c=0; c < c_size; c++)
+ for (size_t i=0; i < inner_size; i++)
+ *to_data++ = static_cast<float>(*from_data++) * scls_data[c];
+ } else {
+ for (size_t i=0; i < data_size; i++)
+ *to_data++ = static_cast<float>(*from_data++);
+ }
+}
+
+Blob::Ptr BlobDumper::getRealValue() {
+ if (_blob->precision() == Precision::FP32 && !_scales)
+ return _blob;
+
+ auto res = make_plain_blob(Precision::FP32, _blob->getTensorDesc().getDims());
+ res->allocate();
+
+ switch (_blob->precision()) {
+ case Precision::U8: plain_copy<uint8_t>(_blob, _scales, res); break;
+ case Precision::FP32: plain_copy<float>(_blob, _scales, res); break;
+ case Precision::I8: plain_copy<int8_t >(_blob, _scales, res); break;
+ default: THROW_IE_EXCEPTION << "Unsupported precesion for getRealValue method.";
+ }
+
+ return res;
+}
+
+
+BlobDumper& BlobDumper::withScales(InferenceEngine::Blob::Ptr scales) {
+ if ( _blob->getTensorDesc().getDims().size() < 2 ||
+ scales->getTensorDesc().getDims().size() != 1 ||
+ scales->getTensorDesc().getDims()[0] != _blob->getTensorDesc().getDims()[1] ||
+ scales->getTensorDesc().getPrecision() != Precision::FP32)
+ THROW_IE_EXCEPTION << "Dumper cannot use passed scales. Blob has incompatible shape.";
+
+ _scales = scales;
+ return *this;
+}
+
+BlobDumper& BlobDumper::withoutScales() {
+ _scales.reset();
+ return *this;
+}
+
+
+const InferenceEngine::Blob::Ptr& BlobDumper::getScales() const {
+ return _scales;
+}
+
+} // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/utils/blob_dump.h b/inference-engine/src/mkldnn_plugin/utils/blob_dump.h
new file mode 100644
index 000000000..4130d53a7
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/utils/blob_dump.h
@@ -0,0 +1,50 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_blob.h"
+
+#include <string>
+
+namespace MKLDNNPlugin {
+
+/**
+ * Utility class to dump a blob's content in plain format.
+ * All layout information is lost.
+ *
+ * For low precision blobs it also allows storing
+ * per-channel scaling factors alongside the data.
+ * NB! The channel is the second dimension for all blob types.
+ */
+class BlobDumper {
+ InferenceEngine::Blob::Ptr _blob;
+ InferenceEngine::Blob::Ptr _scales;
+
+public:
+ BlobDumper() = default;
+ BlobDumper(const BlobDumper&) = default;
+ BlobDumper& operator = (BlobDumper&&) = default;
+
+ explicit BlobDumper(const InferenceEngine::Blob::Ptr blob):_blob(blob) {}
+
+ static BlobDumper read(const std::string &file_path);
+ static BlobDumper read(std::istream &stream);
+
+ void dump(const std::string &file_path);
+ void dump(std::ostream &stream);
+
+ void dumpAsTxt(const std::string file_path);
+ void dumpAsTxt(std::ostream &stream);
+
+ BlobDumper& withScales(InferenceEngine::Blob::Ptr scales);
+ BlobDumper& withoutScales();
+
+ const InferenceEngine::Blob::Ptr& getScales() const;
+
+ InferenceEngine::Blob::Ptr get();
+ InferenceEngine::Blob::Ptr getRealValue();
+};
+
+} // namespace MKLDNNPlugin
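A short usage sketch of the new dumper API; the include path and file names are hypothetical, while the calls follow the header above.

    #include "utils/blob_dump.h"   // hypothetical include path inside the plugin
    #include "ie_blob.h"

    using namespace InferenceEngine;
    using namespace MKLDNNPlugin;

    void dump_and_reload(const Blob::Ptr &blob) {
        // Write the blob in the binary IEB format and as readable text.
        BlobDumper dumper(blob);
        dumper.dump("conv1_out.ieb");        // hypothetical file names
        dumper.dumpAsTxt("conv1_out.txt");

        // Read it back; getRealValue() returns a plain FP32 blob,
        // applying per-channel scales if the file carried any.
        BlobDumper restored = BlobDumper::read("conv1_out.ieb");
        Blob::Ptr as_fp32 = restored.getRealValue();
        (void)as_fp32;
    }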