author    | Alexey Suhov <asuhov@users.noreply.github.com> | 2019-01-21 21:31:31 +0300
committer | openvino-pushbot <44090433+openvino-pushbot@users.noreply.github.com> | 2019-01-21 21:31:31 +0300
commit    | 9de27f16bc8b712a5b8c99d1d4b4a66c9144942d (patch)
tree      | 01a383efe94d92b9870d513c2c5ea5d15b07010a /inference-engine/src/mkldnn_plugin
parent    | fbc7a4a710c24def8ab199926a7da90a0394b87d (diff)
download  | dldt-9de27f16bc8b712a5b8c99d1d4b4a66c9144942d.tar.gz
          | dldt-9de27f16bc8b712a5b8c99d1d4b4a66c9144942d.tar.bz2
          | dldt-9de27f16bc8b712a5b8c99d1d4b4a66c9144942d.zip
Publishing R5 content (#72)
* Publishing R5 content
* Updated ade revision
* Updated README
* Added the possibility to build the CPU plugin with the Intel MKL package
Diffstat (limited to 'inference-engine/src/mkldnn_plugin')
97 files changed, 2794 insertions, 1568 deletions
diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
index 79551f636..5997f7d4b 100644
--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 set(TARGET_NAME "MKLDNNPlugin")
 
 if (UNIX AND NOT APPLE)
@@ -25,9 +26,7 @@ file(GLOB HEADERS
 
 addVersionDefines(mkldnn_plugin.cpp CI_BUILD_NUMBER MKL_VERSION)
 
-if(WIN32)
-    add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
-endif()
+add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
 
 include_directories(
     ${IE_MAIN_SOURCE_DIR}/include
@@ -38,39 +37,30 @@ include_directories(
     ${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/include
 )
 
+if (GEMM STREQUAL "MKL")
+    log_rpath_from_dir(MKL "${MKL}/lib")
+endif()
+
 add_library(${TARGET_NAME} SHARED ${SOURCES} ${HEADERS})
+set_ie_threading_interface_for(${TARGET_NAME})
 
 if (THREADING STREQUAL "TBB")
-    target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_TBB -DMKLDNN_THR=MKLDNN_THR_TBB)
-    target_include_directories(${TARGET_NAME} PUBLIC ${TBB_INCLUDE_DIRS})
-    target_link_libraries(${TARGET_NAME} debug ${TBB_LIBRARIES_RELEASE} optimized ${TBB_LIBRARIES_RELEASE})
+    set(MKLDNN_THR MKLDNN_THR_TBB)
 elseif (THREADING STREQUAL "OMP")
-    target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_OMP -DMKLDNN_THR=MKLDNN_THR_OMP)
-    enable_omp()
-    if(ENABLE_INTEL_OMP)
-        target_link_libraries(${TARGET_NAME} ${intel_omp_lib})
-    endif()
+    set(MKLDNN_THR MKLDNN_THR_OMP)
 else()
-    target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_SEQ -DMKLDNN_THR=MKLDNN_THR_SEQ)
+    set(MKLDNN_THR MKLDNN_THR_SEQ)
 endif()
 
-target_link_libraries(${TARGET_NAME} inference_engine ${INTEL_ITT_LIBS} mkldnn)
+target_compile_definitions(${TARGET_NAME} PUBLIC -DMKLDNN_THR=${MKLDNN_THR})
+target_link_libraries(${TARGET_NAME} PRIVATE inference_engine ${INTEL_ITT_LIBS} mkldnn)
+
 set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME})
 
 add_library(test_${TARGET_NAME} STATIC ${SOURCES} ${HEADERS})
+set_ie_threading_interface_for(test_${TARGET_NAME})
 
-if (THREADING STREQUAL "TBB")
-    target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_TBB -DMKLDNN_THR=MKLDNN_THR_TBB)
-    target_include_directories(test_${TARGET_NAME} PUBLIC ${TBB_INCLUDE_DIRS})
-    target_link_libraries(test_${TARGET_NAME} debug ${TBB_LIBRARIES_RELEASE} optimized ${TBB_LIBRARIES_RELEASE})
-elseif (THREADING STREQUAL "OMP")
-    target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_OMP -DMKLDNN_THR=MKLDNN_THR_OMP)
-    if(ENABLE_INTEL_OMP)
-        target_link_libraries(test_${TARGET_NAME} ${intel_omp_lib})
-    endif()
-else()
-    target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_SEQ -DMKLDNN_THR=MKLDNN_THR_SEQ)
-endif()
+target_compile_definitions(test_${TARGET_NAME} PUBLIC -DMKLDNN_THR=${MKLDNN_THR})
+target_link_libraries(test_${TARGET_NAME} PRIVATE inference_engine_s mkldnn)
 
-target_link_libraries(test_${TARGET_NAME} inference_engine_s mkldnn)
 set_target_properties(test_${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME test_${TARGET_NAME})

diff --git a/inference-engine/src/mkldnn_plugin/config.cpp b/inference-engine/src/mkldnn_plugin/config.cpp
index 57c8dc9c8..4ef10eec2 100644
--- a/inference-engine/src/mkldnn_plugin/config.cpp
+++ b/inference-engine/src/mkldnn_plugin/config.cpp
@@ -1,16 +1,23 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
+// avoiding clash of the "max" macro with std::max
+#define NOMINMAX
+
 #include "config.h"
 #include "ie_plugin_config.hpp"
 #include "ie_common.h"
 
 #include <string>
+#include <cstring>
 #include <map>
 #include <algorithm>
+#include <stdexcept>
+
 #include <cpp_interfaces/exception2status.hpp>
+#include <thread>
+#include "mkldnn/omp_manager.h"
 
 namespace MKLDNNPlugin {
 
@@ -44,6 +51,42 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
         else
             THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_EXCLUSIVE_ASYNC_REQUESTS
                                << ". Expected only YES/NO";
+    } else if (key == PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS) {
+        if (val == PluginConfigParams::CPU_THROUGHPUT_NUMA) {
+            throughputStreams = MKLDNNPlugin::cpu::getNumberOfCPUSockets();
+        } else if (val == PluginConfigParams::CPU_THROUGHPUT_AUTO) {
+            // bare minimum of streams (that evenly divides available number of core)
+            const int num_cores = std::thread::hardware_concurrency();
+            if (0 == num_cores % 4)
+                throughputStreams = std::max(4, num_cores / 4);
+            else if (0 == num_cores % 5)
+                throughputStreams = std::max(5, num_cores / 5);
+            else if (0 == num_cores % 3)
+                throughputStreams = std::max(3, num_cores / 3);
+            else  // if user disables some cores say in BIOS, so we got weird #cores which is not easy to divide
+                throughputStreams = 1;
+        } else {
+            int val_i;
+            try {
+                val_i = std::stoi(val);
+            } catch (const std::exception&) {
+                THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS
+                                   << ". Expected only positive numbers (#streams) or "
+                                   << "PluginConfigParams::CPU_THROUGHPUT_NUMA/CPU_THROUGHPUT_AUTO";
+            }
+            if (val_i > 0)
+                throughputStreams = val_i;
+        }
+    } else if (key == PluginConfigParams::KEY_CPU_THREADS_NUM) {
+        int val_i;
+        try {
+            val_i = std::stoi(val);
+        } catch (const std::exception&) {
+            THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_CPU_THREADS_NUM
+                               << ". Expected only positive numbers (#threads)";
+        }
+        if (val_i > 0)
+            threadsNum = val_i;
     } else if (key.compare(PluginConfigParams::KEY_DYN_BATCH_ENABLED) == 0) {
         if (val.compare(PluginConfigParams::YES) == 0)
             enableDynamicBatch = true;
@@ -52,10 +95,15 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
         else
             THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_DYN_BATCH_ENABLED
                                << ". Expected only YES/NO";
+    } else if (key.compare(PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT) == 0) {
+        // empty string means that dumping is switched off
+        dumpToDot = val;
     } else {
         THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported property " << key << " by CPU plugin";
     }
     }
+    if (exclusiveAsyncRequests)  // Exclusive request feature disables the streams
+        throughputStreams = 1;
 }
 
 }  // namespace MKLDNNPlugin
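For reference, the CPU_THROUGHPUT_AUTO branch added above implements a small divisibility heuristic for picking a stream count. The sketch below restates it outside the plugin; pickAutoStreams and the driver are illustrative names, not part of the patch:

    #include <algorithm>
    #include <cstdio>
    #include <thread>

    // Mirrors the CPU_THROUGHPUT_AUTO branch: if the core count divides by 4
    // (then 5, then 3), run num_cores/N streams but never fewer than N;
    // awkward core counts (e.g. cores disabled in BIOS) fall back to 1 stream.
    static int pickAutoStreams(int num_cores) {
        if (num_cores % 4 == 0) return std::max(4, num_cores / 4);
        if (num_cores % 5 == 0) return std::max(5, num_cores / 5);
        if (num_cores % 3 == 0) return std::max(3, num_cores / 3);
        return 1;
    }

    int main() {
        for (int cores : {6, 7, 8, 10, 12, 56})  // -> 3, 1, 4, 5, 4, 14
            std::printf("%2d cores -> %d streams\n", cores, pickAutoStreams(cores));
        std::printf("this host -> %d streams\n",
                    pickAutoStreams(static_cast<int>(std::thread::hardware_concurrency())));
        return 0;
    }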
diff --git a/inference-engine/src/mkldnn_plugin/config.h b/inference-engine/src/mkldnn_plugin/config.h
index 0bb390c51..558ac87ae 100644
--- a/inference-engine/src/mkldnn_plugin/config.h
+++ b/inference-engine/src/mkldnn_plugin/config.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -15,7 +14,10 @@ struct Config {
     bool collectPerfCounters = false;
     bool exclusiveAsyncRequests = false;
     bool enableDynamicBatch = false;
+    std::string dumpToDot = "";
     int batchLimit = 0;
+    int throughputStreams = 1;
+    int threadsNum = 0;
 
     void readProperties(const std::map<std::string, std::string> &config);
 };

diff --git a/inference-engine/src/mkldnn_plugin/mean_image.cpp b/inference-engine/src/mkldnn_plugin/mean_image.cpp
index ff87e1466..f1ac17e9a 100644
--- a/inference-engine/src/mkldnn_plugin/mean_image.cpp
+++ b/inference-engine/src/mkldnn_plugin/mean_image.cpp
@@ -1,10 +1,10 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "mean_image.h"
 #include "ie_parallel.hpp"
+#include "ie_memcpy.h"
 
 using namespace MKLDNNPlugin;
 using namespace InferenceEngine;
@@ -54,7 +54,8 @@ void MeanImage::Load(const MKLDNNDims& inputDims, InputInfo::Ptr inputInfo) {
                 THROW_IE_EXCEPTION << "mean image size does not match expected network input, expecting " << meanWidth << " x " << meanHeight;
             }
             // todo: cast to TBlob and make sure it is floats
-            memcpy(meanBuffer->data() + channel*meanBlob->size(), meanBlob->buffer(), meanBlob->byteSize());
+            ie_memcpy(meanBuffer->data() + channel*meanBlob->size(), meanBuffer->byteSize() - channel*meanBlob->byteSize(),
+                      meanBlob->buffer(), meanBlob->byteSize());
         }
     }
     break;

diff --git a/inference-engine/src/mkldnn_plugin/mean_image.h b/inference-engine/src/mkldnn_plugin/mean_image.h
index c27d667e2..24dc8163a 100644
--- a/inference-engine/src/mkldnn_plugin/mean_image.h
+++ b/inference-engine/src/mkldnn_plugin/mean_image.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
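The mean_image.cpp hunk above replaces a raw memcpy with ie_memcpy, which also receives the remaining destination capacity. Below is a minimal sketch of a bounds-checked copy with the (dst, dst_size, src, count) argument order seen at the call site; the real ie_memcpy declared in ie_memcpy.h may differ in naming and return convention:

    #include <cstddef>
    #include <cstring>

    // Copies count bytes only when they fit into the destination; returns 0 on
    // success and -1 instead of silently overrunning the buffer.
    static int checked_memcpy(void* dst, size_t dst_size, const void* src, size_t count) {
        if (dst == nullptr || src == nullptr || count > dst_size)
            return -1;
        std::memcpy(dst, src, count);
        return 0;
    }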
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h
index 3bfae03a0..09ec76c42 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h
index 0643d99a7..b3ad3c0c5 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h
index 9cc57f3df..616f517aa 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp b/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp
index d0b911799..57b6edc35 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -13,10 +12,6 @@
 
 namespace mkldnn {
 
-template <> struct handle_traits<mkldnn_primitive_desc_iterator_t> {
-    static constexpr auto destructor = &mkldnn_primitive_desc_iterator_destroy;
-};
-
 struct primitive_desc_iterator : public handle<mkldnn_primitive_desc_iterator_t> {
     template <typename T>
     primitive_desc_iterator(const T &adesc, const mkldnn::primitive_attr &aattr, const engine &aengine) {

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp
index 834f8bd6e..ff3616a44 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -33,6 +32,7 @@ impl_desc_type MKLDNNPlugin::parse_impl_name(std::string impl_desc_name) {
         res = static_cast<impl_desc_type>(res | impl_desc_type::_key);
 
     SEARCH_WORD_2(nchw, ref);
+    SEARCH_WORD_2(ncdhw, ref);
     SEARCH_WORD_2(wino, winograd);
 #undef SEARCH_WORD_2

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h
index 75a618927..45cca0402 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp
new file mode 100644
index 000000000..19bc513f6
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cstdlib>
+#include <cstring>
+#include "ie_parallel.hpp"
+#include "omp_manager.h"
+
+using namespace MKLDNNPlugin;
+namespace MKLDNNPlugin {
+namespace cpu {
+
+static const char *openMpEnvVars[] = {
+    "OMP_CANCELLATION", "OMP_DISPLAY_ENV", "OMP_DEFAULT_DEVICE", "OMP_DYNAMIC",
+    "OMP_MAX_ACTIVE_LEVELS", "OMP_MAX_TASK_PRIORITY", "OMP_NESTED",
+    "OMP_NUM_THREADS", "OMP_PROC_BIND", "OMP_PLACES", "OMP_STACKSIZE",
+    "OMP_SCHEDULE", "OMP_THREAD_LIMIT", "OMP_WAIT_POLICY", "GOMP_CPU_AFFINITY",
+    "GOMP_DEBUG", "GOMP_STACKSIZE", "GOMP_SPINCOUNT", "GOMP_RTEMS_THREAD_POOLS",
+    "KMP_AFFINITY", "KMP_NUM_THREADS", "MIC_KMP_AFFINITY",
+    "MIC_OMP_NUM_THREADS", "MIC_OMP_PROC_BIND", "PHI_KMP_AFFINITY",
+    "PHI_OMP_NUM_THREADS", "PHI_KMP_PLACE_THREADS", "MKL_NUM_THREADS",
+    "MKL_DYNAMIC", "MKL_DOMAIN_NUM_THREADS"
+};
+
+static const unsigned numberOfOpenMpEnvVars =
+    sizeof(openMpEnvVars) / sizeof(openMpEnvVars[0]);
+
+bool checkOpenMpEnvVars(bool includeOMPNumThreads) {
+    for (unsigned i = 0; i < numberOfOpenMpEnvVars; i++) {
+        if (getenv(openMpEnvVars[i])) {
+            if (0 != strcmp(openMpEnvVars[i], "OMP_NUM_THREADS") || includeOMPNumThreads)
+                return true;
+        }
+    }
+    return false;
+}
+
+#if !(defined(__APPLE__) || defined(_WIN32))
+// getNumberOfCPUSockets/getNumberOfCPUCores are implemented in the lin_omp_manager.cpp
+#else
+int getNumberOfCPUSockets() {return 1;}
+int getNumberOfCPUCores() {return parallel_get_max_threads();}
+#endif
+
+}  // namespace cpu
+}  // namespace MKLDNNPlugin

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h
index 26cba003e..65cc216e4 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -9,10 +8,15 @@
 */
 #pragma once
 
-#ifdef _WIN32
-    #include "mkldnn/os/win/win_omp_manager.h"
-#elif defined(__APPLE__)
-    #include "mkldnn/os/osx/osx_omp_manager.h"
-#else
-    #include "mkldnn/os/lin/lin_omp_manager.h"
-#endif
+namespace MKLDNNPlugin {
+namespace cpu {
+
+bool checkOpenMpEnvVars(bool includeOMPNumThreads = true);
+// numbers of CPU sockets in the machine (on Linux), 1 on all other OSes
+int getNumberOfCPUSockets();
+// numbers of CPU physical cores on Linux (which is considered to be more performance friendly for servers)
+// (on other OSes it simply relies on the original parallel API of choice, which usually use the logical cores )
+int getNumberOfCPUCores();
+
+}  // namespace cpu
+}  // namespace MKLDNNPlugin
\ No newline at end of file
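Taken together, omp_manager.{h,cpp} above now expose a small portable CPU-topology API. A hypothetical caller would use it as follows (the driver is illustrative only and assumes it is built inside the plugin tree so the header resolves):

    #include <cstdio>
    #include "mkldnn/omp_manager.h"

    int main() {
        using namespace MKLDNNPlugin::cpu;
        // Passing false ignores OMP_NUM_THREADS alone, i.e. asks whether the
        // user has customized anything beyond the plain thread count.
        if (checkOpenMpEnvVars(false))
            std::printf("threading already tuned via environment variables\n");
        std::printf("CPU sockets: %d, physical cores: %d\n",
                    getNumberOfCPUSockets(), getNumberOfCPUCores());
        return 0;
    }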
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp
index 75f2e4c6b..14c3e1d1d 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp
@@ -1,10 +1,8 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "lin_omp_manager.h"
-#include "ie_parallel.hpp"
 #include <fstream>
 #include <set>
 #include <string>
@@ -19,20 +17,13 @@ namespace cpu {
 Processor::Processor() {
     processor = 0;
     physicalId = 0;
-    siblings = 0;
-    coreId = 0;
     cpuCores = 0;
-    speedMHz = 0;
 }
 
 CpuInfo::CpuInfo() {
     loadContentFromFile("/proc/cpuinfo");
 }
 
-CpuInfo::CpuInfo(const char *content) {
-    loadContent(content);
-}
-
 void CpuInfo::loadContentFromFile(const char *fileName) {
     std::ifstream file(fileName);
     std::string content(
@@ -98,10 +89,6 @@ Collection::Collection(CpuInfoInterface *cpuInfo) : cpuInfo(*cpuInfo) {
     collectBasicCpuInformation();
 }
 
-unsigned Collection::getProcessorSpeedMHz() {
-    return processors.size() ? processors[0].speedMHz : 0;
-}
-
 unsigned Collection::getTotalNumberOfSockets() {
     return totalNumberOfSockets;
 }
@@ -114,10 +101,6 @@ unsigned Collection::getNumberOfProcessors() {
     return processors.size();
 }
 
-const Processor &Collection::getProcessor(unsigned processorId) {
-    return processors[processorId];
-}
-
 void Collection::parseCpuInfo() {
     const char *cpuInfoLine = cpuInfo.getFirstLine();
     for (; cpuInfoLine; cpuInfoLine = cpuInfo.getNextLine()) {
@@ -148,21 +131,9 @@ void Collection::parseValue(const char *fieldName, const char *valueString) {
         currentProcessor->physicalId = parseInteger(valueString);
     }
 
-    if (beginsWith(fieldName, "siblings")) {
-        currentProcessor->siblings = parseInteger(valueString);
-    }
-
-    if (beginsWith(fieldName, "core id")) {
-        currentProcessor->coreId = parseInteger(valueString);
-    }
-
     if (beginsWith(fieldName, "cpu cores")) {
         currentProcessor->cpuCores = parseInteger(valueString);
     }
-
-    if (beginsWith(fieldName, "model name")) {
-        currentProcessor->speedMHz = extractSpeedFromModelName(valueString);
-    }
 }
 
 void Collection::appendNewProcessor() {
@@ -184,32 +155,6 @@ unsigned Collection::parseInteger(const char *text) const {
     return atol(text);
 }
 
-/* Function extracts CPU speed from model name. If unit is not set it is
-   assumed that values below 100 are specified in GHz, otherwise MHz */
-unsigned Collection::extractSpeedFromModelName(const char *text) const {
-    text = strstr(text, "@");
-    if (!text) {
-        return 0;
-    }
-
-    char *unit;
-    double speed = strtod(&text[1], &unit);
-
-    while (isspace(*unit)) {
-        unit++;
-    }
-
-    bool isMHz = !strncmp(unit, "MHz", 3);
-    bool isGHz = !strncmp(unit, "GHz", 3);
-    bool isGHzPossible = (speed < 100);
-
-    if (isGHz || (isGHzPossible && !isMHz)) {
-        return 1000 * speed + 0.5;
-    } else {
-        return speed + 0.5;
-    }
-}
-
 void Collection::collectBasicCpuInformation() {
     std::set<unsigned> uniquePhysicalId;
     std::vector<Processor>::iterator processor = processors.begin();
@@ -229,120 +174,27 @@ void Collection::updateCpuInformation(const Processor &processor,
     totalNumberOfCpuCores += processor.cpuCores;
 }
 
-
-/* The OpenMpManager class is responsible for determining a set of all of
-   available CPU cores and delegating each core to perform other tasks. The
-   first of available cores is delegated for background threads, while other
-   remaining cores are dedicated for OpenMP threads. Each OpenMP thread owns
-   one core for exclusive use. The number of OpenMP threads is then limited
-   to the number of available cores minus one. The amount of CPU cores may
-   be limited by system eg. when numactl was used. */
 #include <sched.h>
 
-static const char *openMpEnvVars[] = {
-    "OMP_CANCELLATION", "OMP_DISPLAY_ENV", "OMP_DEFAULT_DEVICE", "OMP_DYNAMIC",
-    "OMP_MAX_ACTIVE_LEVELS", "OMP_MAX_TASK_PRIORITY", "OMP_NESTED",
-    "OMP_NUM_THREADS", "OMP_PROC_BIND", "OMP_PLACES", "OMP_STACKSIZE",
-    "OMP_SCHEDULE", "OMP_THREAD_LIMIT", "OMP_WAIT_POLICY", "GOMP_CPU_AFFINITY",
-    "GOMP_DEBUG", "GOMP_STACKSIZE", "GOMP_SPINCOUNT", "GOMP_RTEMS_THREAD_POOLS",
-    "KMP_AFFINITY", "KMP_NUM_THREADS", "MIC_KMP_AFFINITY",
-    "MIC_OMP_NUM_THREADS", "MIC_OMP_PROC_BIND", "PHI_KMP_AFFINITY",
-    "PHI_OMP_NUM_THREADS", "PHI_KMP_PLACE_THREADS", "MKL_NUM_THREADS",
-    "MKL_DYNAMIC", "MKL_DOMAIN_NUM_THREADS"
-};
-
-static const unsigned numberOfOpenMpEnvVars =
-    sizeof(openMpEnvVars) / sizeof(openMpEnvVars[0]);
-
-OpenMpManager::OpenMpManager(Collection *collection) :
-    collection(*collection), isGpuEnabled(false) {
-    getOpenMpEnvVars();
-    getCurrentCpuSet();
-    getCurrentCoreSet();
-}
-
-OpenMpManager &OpenMpManager::getInstance() {
+int getNumberOfCPUSockets() {
     static CpuInfo cpuInfo;
     static Collection collection(&cpuInfo);
-    static OpenMpManager openMpManager(&collection);
-    return openMpManager;
-}
-
-void OpenMpManager::setGpuEnabled() {
-    OpenMpManager &openMpManager = getInstance();
-    openMpManager.isGpuEnabled = true;
-}
-
-void OpenMpManager::setGpuDisabled() {
-    OpenMpManager &openMpManager = getInstance();
-    openMpManager.isGpuEnabled = false;
-}
-
-// Ideally bind given thread to secondary logical core, if
-// only one thread exists then bind to primary one
-void OpenMpManager::bindCurrentThreadToNonPrimaryCoreIfPossible() {
-    OpenMpManager &openMpManager = getInstance();
-    if (openMpManager.isThreadsBindAllowed()) {
-        int totalNumberOfAvailableCores = CPU_COUNT(&openMpManager.currentCoreSet);
-        int logicalCoreToBindTo = totalNumberOfAvailableCores > 1 ? 1 : 0;
-        openMpManager.bindCurrentThreadToLogicalCoreCpus(logicalCoreToBindTo);
-    }
+    return collection.getTotalNumberOfSockets();
 }
 
-void OpenMpManager::bindOpenMpThreads(int env_cores) {
-    OpenMpManager &openMpManager = getInstance();
-
-    if (!openMpManager.isThreadsBindAllowed())
-        return;
-
-    openMpManager.setOpenMpThreadNumberLimit(env_cores);
-    InferenceEngine::parallel_nt(0, [&] (unsigned logicalCoreId, int nthr) {
-        openMpManager.bindCurrentThreadToLogicalCoreCpu(logicalCoreId);
-    });
-}
-
-int OpenMpManager::getOpenMpThreadNumber() {
-    OpenMpManager &openMpManager = getInstance();
-
-    return openMpManager.getCoreNumber();
-}
-
-
-void OpenMpManager::getOpenMpEnvVars() {
-    isAnyOpenMpEnvVarSpecified = false;
-    for (unsigned i = 0; i < numberOfOpenMpEnvVars; i++) {
-        if (getenv(openMpEnvVars[i])) {
-            isAnyOpenMpEnvVarSpecified = true;
-        }
-    }
-}
-
-void OpenMpManager::getCurrentCpuSet() {
-    if (sched_getaffinity(0, sizeof(currentCpuSet), &currentCpuSet)) {
-        getDefaultCpuSet(&currentCpuSet);
-    }
-}
-
-void OpenMpManager::getDefaultCpuSet(cpu_set_t *defaultCpuSet) {
-    CPU_ZERO(defaultCpuSet);
-    unsigned numberOfProcessors = collection.getNumberOfProcessors();
-    for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
-        CPU_SET(processorId, defaultCpuSet);
-    }
-}
-
-/* Function getCurrentCoreSet() fills currentCoreSet variable with a set of
-   available CPUs, where only one CPU per core is chosen. When multiple CPUs
-   of single core are used, function is selecting only first one of all
-   available. */
-void OpenMpManager::getCurrentCoreSet() {
+int getNumberOfCPUCores() {
+    static CpuInfo cpuInfo;
+    static Collection collection(&cpuInfo);
     unsigned numberOfProcessors = collection.getNumberOfProcessors();
     unsigned totalNumberOfCpuCores = collection.getTotalNumberOfCpuCores();
 
-    cpu_set_t usedCoreSet;
+    cpu_set_t usedCoreSet, currentCoreSet, currentCpuSet;
+    CPU_ZERO(&currentCpuSet);
     CPU_ZERO(&usedCoreSet);
     CPU_ZERO(&currentCoreSet);
 
+    sched_getaffinity(0, sizeof(currentCpuSet), &currentCpuSet);
+
     for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
         if (CPU_ISSET(processorId, &currentCpuSet)) {
             unsigned coreId = processorId % totalNumberOfCpuCores;
@@ -352,70 +204,9 @@ void OpenMpManager::getCurrentCoreSet() {
             }
         }
     }
-}
-
-void OpenMpManager::selectAllCoreCpus(cpu_set_t *set, unsigned physicalCoreId) {
-    unsigned numberOfProcessors = collection.getNumberOfProcessors();
-    unsigned totalNumberOfCpuCores = collection.getTotalNumberOfCpuCores();
-
-    int processorId = physicalCoreId % totalNumberOfCpuCores;
-    while (processorId < numberOfProcessors) {
-        if (CPU_ISSET(processorId, &currentCpuSet)) {
-            CPU_SET(processorId, set);
-        }
-
-        processorId += totalNumberOfCpuCores;
-    }
-}
-
-unsigned OpenMpManager::getPhysicalCoreId(unsigned logicalCoreId) {
-    unsigned numberOfProcessors = collection.getNumberOfProcessors();
-
-    for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
-        if (CPU_ISSET(processorId, &currentCoreSet)) {
-            if (!logicalCoreId--) {
-                return processorId;
-            }
-        }
-    }
-
-    std::cerr << "This should never happen!";
-    return 0;
-}
-
-bool OpenMpManager::isThreadsBindAllowed() {
-    return !isAnyOpenMpEnvVarSpecified && !isGpuEnabled;
-}
-
-// Limit of threads to number of logical cores available
-void OpenMpManager::setOpenMpThreadNumberLimit(int env_cores) {
-    parallel_set_num_threads(env_cores == 0 ? CPU_COUNT(&currentCoreSet) : 0);
-}
-
-int OpenMpManager::getCoreNumber() {
     return CPU_COUNT(&currentCoreSet);
 }
 
-void OpenMpManager::bindCurrentThreadToLogicalCoreCpu(unsigned logicalCoreId) {
-    unsigned physicalCoreId = getPhysicalCoreId(logicalCoreId);
-#if IE_THREAD == IE_THREAD_OMP
-    cpu_set_t set;
-    CPU_ZERO(&set);
-    CPU_SET(physicalCoreId, &set);
-    sched_setaffinity(0, sizeof(set), &set);
-#endif
-}
-
-void OpenMpManager::bindCurrentThreadToLogicalCoreCpus(unsigned logicalCoreId) {
-    unsigned physicalCoreId = getPhysicalCoreId(logicalCoreId);
-#if IE_THREAD == IE_THREAD_OMP
-    cpu_set_t set;
-    CPU_ZERO(&set);
-    selectAllCoreCpus(&set, physicalCoreId);
-    sched_setaffinity(0, sizeof(set), &set);
-#endif
-}
-
 #endif  // #ifndef APPLE
 
 }  // namespace cpu
 }  // namespace MKLDNNPlugin

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h
index d39329a6b..dfd69bbb4 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -20,10 +19,7 @@ namespace cpu {
 struct Processor {
     unsigned processor;
     unsigned physicalId;
-    unsigned siblings;
-    unsigned coreId;
     unsigned cpuCores;
-    unsigned speedMHz;
 
     Processor();
 };
@@ -41,8 +37,6 @@ class CpuInfo : public CpuInfoInterface {
 public:
     CpuInfo();
 
-    explicit CpuInfo(const char *content);
-
     virtual ~CpuInfo();
 
     virtual const char *getFirstLine();
@@ -64,32 +58,17 @@ private:
 class CollectionInterface {
 public:
     virtual ~CollectionInterface() {}
-
-    virtual unsigned getProcessorSpeedMHz() = 0;
-
     virtual unsigned getTotalNumberOfSockets() = 0;
-
-    virtual unsigned getTotalNumberOfCpuCores() = 0;
-
-    virtual unsigned getNumberOfProcessors() = 0;
-
-    virtual const Processor &getProcessor(unsigned processorId) = 0;
 };
 
 class Collection : public CollectionInterface {
 public:
     explicit Collection(CpuInfoInterface *cpuInfo);
 
-    virtual unsigned getProcessorSpeedMHz();
-
     virtual unsigned getTotalNumberOfSockets();
-
     virtual unsigned getTotalNumberOfCpuCores();
-
     virtual unsigned getNumberOfProcessors();
 
-    virtual const Processor &getProcessor(unsigned processorId);
-
 private:
     CpuInfoInterface &cpuInfo;
     unsigned totalNumberOfSockets;
@@ -113,70 +92,11 @@ private:
 
     unsigned parseInteger(const char *text) const;
 
-    unsigned extractSpeedFromModelName(const char *text) const;
-
     void collectBasicCpuInformation();
 
     void updateCpuInformation(const Processor &processor,
                               unsigned numberOfUniquePhysicalId);
 };
-
-
-class OpenMpManager {
-public:
-    static void setGpuEnabled();
-
-    static void setGpuDisabled();
-
-    static void bindCurrentThreadToNonPrimaryCoreIfPossible();
-
-    static void bindOpenMpThreads(int env_cores = 0);
-
-    static int getOpenMpThreadNumber();
-
-    static void printVerboseInformation();
-
-    static bool isMajorThread(int currentThread);
-
-private:
-    Collection &collection;
-
-    bool isGpuEnabled;
-    bool isAnyOpenMpEnvVarSpecified;
-    cpu_set_t currentCpuSet;
-    cpu_set_t currentCoreSet;
-
-    explicit OpenMpManager(Collection *collection);
-
-    OpenMpManager(const OpenMpManager &openMpManager);
-
-    OpenMpManager &operator=(const OpenMpManager &openMpManager);
-
-    static OpenMpManager &getInstance();
-
-    void getOpenMpEnvVars();
-
-    void getCurrentCpuSet();
-
-    int getCoreNumber();
-
-    void getDefaultCpuSet(cpu_set_t *defaultCpuSet);
-
-    void getCurrentCoreSet();
-
-    void selectAllCoreCpus(cpu_set_t *set, unsigned physicalCoreId);
-
-    unsigned getPhysicalCoreId(unsigned logicalCoreId);
-
-    bool isThreadsBindAllowed();
-
-    void setOpenMpThreadNumberLimit(int env_cores);
-
-    void bindCurrentThreadToLogicalCoreCpu(unsigned logicalCoreId);
-
-    void bindCurrentThreadToLogicalCoreCpus(unsigned logicalCoreId);
-};
-
 #endif  // #ifndef __APPLE__
 
 }  // namespace cpu
 }  // namespace MKLDNNPlugin

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h
deleted file mode 100644
index 0484bb571..000000000
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-/**
-* @brief WINAPI based code
-* @file win_omp_manager.h
-*/
-
-#pragma once
-
-#include <thread>
-#include <vector>
-
-namespace MKLDNNPlugin {
-namespace cpu {
-
-class OpenMpManager {
-public:
-    static int getOpenMpThreadNumber() {
-        return getCoreNumber();
-    }
-
-    static int getCoreNumber() {
-        return 4;
-    }
-};
-
-}  // namespace cpu
-}  // namespace MKLDNNPlugin
-

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h
deleted file mode 100644
index d59891679..000000000
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-/**
-* @brief WINAPI based code
-* @file win_omp_manager.h
-*/
-
-#pragma once
-
-#include <thread>
-#include <vector>
-#include <windows.h>
-
-namespace MKLDNNPlugin {
-namespace cpu {
-
-class OpenMpManager {
-public:
-    static int getOpenMpThreadNumber() {
-        return getCoreNumber();
-    }
-
-    static int getCoreNumber() {
-        int num_cores = std::thread::hardware_concurrency();
-        unsigned long size = 0;
-
-        if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &size)) {
-            if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
-                std::vector<char> buf(size);
-                SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* info
-                        = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(&buf.front());
-                SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* ptr = info;
-                if (GetLogicalProcessorInformationEx(RelationProcessorCore, info, &size)) {
-                    if (GetLastError() == ERROR_SUCCESS) {
-                        int num = 0;
-                        unsigned long offset = 0;
-                        while (offset < size) {
-                            num++;
-                            offset += ptr->Size;
-                            ptr = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(
-                                    reinterpret_cast<byte*>(ptr) + ptr->Size);
-                        }
-                        num_cores = num;
-                    }
-                }
-            }
-        }
-        return num_cores;
-    }
-};
-
-}  // namespace cpu
-}  // namespace MKLDNNPlugin
-
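The Collection code kept above derives socket and core counts from /proc/cpuinfo by counting unique "physical id" values and summing each package's "cpu cores" entry once. A self-contained approximation of that approach (simplified sketch: it assumes each processor block in /proc/cpuinfo ends with a blank line):

    #include <fstream>
    #include <iostream>
    #include <set>
    #include <string>

    int main() {
        std::ifstream cpuinfo("/proc/cpuinfo");
        std::set<int> sockets;      // unique "physical id" values seen so far
        int physicalId = -1, cores = 0, totalCores = 0;
        std::string line;
        while (std::getline(cpuinfo, line)) {
            if (line.compare(0, 11, "physical id") == 0)
                physicalId = std::stoi(line.substr(line.find(':') + 1));
            else if (line.compare(0, 9, "cpu cores") == 0)
                cores = std::stoi(line.substr(line.find(':') + 1));
            else if (line.empty() && physicalId >= 0) {
                // end of one processor block: count each package exactly once
                if (sockets.insert(physicalId).second)
                    totalCores += cores;
                physicalId = -1;
            }
        }
        std::cout << "sockets: " << sockets.size()
                  << ", physical cores: " << totalCores << std::endl;
        return 0;
    }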
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp
index e117182dd..ea463a2d5 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h
index be18a41e2..447787f88 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
index 9ea3fe38c..bcb47419e 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
index 51a29c21e..dff072089 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_dims.h b/inference-engine/src/mkldnn_plugin/mkldnn_dims.h
index f707f268c..06616a8be 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_dims.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_dims.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
index 102955fbb..92c8c5ad3 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_edge.h b/inference-engine/src/mkldnn_plugin/mkldnn_edge.h
index 91c586b4a..f5364f614 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp
index c9ca08ab2..b362433eb 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h
index bd1e0d8d5..f3abd8b4a 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
index 681061a4d..3600ee56c 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
index fb7953f17..8b2994e5f 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
index 983fc2b35..9c079efd4 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -11,6 +10,7 @@
 #include <limits>
 #include <fstream>
 #include <unordered_map>
+#include <memory>
 
 #include "details/caseless.hpp"
 #include "mkldnn_graph.h"
@@ -24,7 +24,6 @@
 #include "mkldnn_extension_utils.h"
 #include "mkldnn_extension_mngr.h"
 #include "mkldnn/omp_manager.h"
-#include "ie_parallel.hpp"
 #include <graph_tools.hpp>
 #include <cpp_interfaces/ie_executor_manager.hpp>
 #include "ie_algorithm.hpp"
@@ -33,21 +32,34 @@
 #include "mkldnn_async_infer_request.h"
 #include <blob_factory.hpp>
 #include <ie_util_internal.hpp>
+#include <net_pass.h>
+
+#include <mkldnn_graph_dumper.h>
 
 #include <data_stats.h>
-#include "../inference_engine/cnn_network_int8_normalizer.hpp"
+#include "cnn_network_int8_normalizer.hpp"
+#include "ie_memcpy.h"
 
 #define XBYAK_NO_OP_NAMES
 #define XBYAK_UNDEF_JNL
 #include "../../thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h"
 
 #include "cnn_network_stats_impl.hpp"
-// #define DEBUG_DUMP_PATH "/temp/path/dump/"
-// #define DEBUG_DUMP_NEW_FOLDER_PER_INFER
-#ifdef DEBUG_DUMP_PATH
-#include "../../thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp"
-#include <iomanip>
-// #define DEBUG_BMP_OUTPUT 1
+
+#include "utils/blob_dump.h"
+
+/*****************************************************
+ * Dump capability
+ * Specify path to dump folder in BLOB_DUMP_PATH
+ *****************************************************/
+// #define BLOB_DUMP_PATH "dump"
+
+#ifdef BLOB_DUMP_PATH
+#   define DUMP_DIR        BLOB_DUMP_PATH
+#   define ENABLE_DUMP(_x) { _x ;}
+#else
+#   define DUMP_DIR ""
+#   define ENABLE_DUMP(_x)
 #endif
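The dump capability introduced above is a pure compile-time switch: with BLOB_DUMP_PATH defined, ENABLE_DUMP(_x) expands to its argument in braces; otherwise it expands to nothing and the hooks vanish from the build. The same pattern in isolation (names mirror the macros above; the printf body is illustrative):

    #include <cstdio>

    // #define BLOB_DUMP_PATH "dump"   // uncomment to compile the dump calls in

    #ifdef BLOB_DUMP_PATH
    #   define DUMP_DIR        BLOB_DUMP_PATH
    #   define ENABLE_DUMP(_x) { _x ;}
    #else
    #   define DUMP_DIR ""
    #   define ENABLE_DUMP(_x)
    #endif

    int main() {
        // With BLOB_DUMP_PATH undefined this statement disappears entirely,
        // so release builds pay nothing for the dumping hooks.
        ENABLE_DUMP(std::printf("dumping blobs to %s\n", DUMP_DIR));
        return 0;
    }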
 
 using namespace mkldnn;
@@ -56,37 +68,11 @@ using namespace MKLDNNPlugin;
 using namespace MKLDNNPlugin::cpu;
 using namespace InferenceEngine;
 using namespace InferenceEngine::details;
 
-void BindThreads(mkldnn::engine eng) {
-    static bool alreadyBind = false;
-    if (!alreadyBind) {
-#if IE_THREAD == IE_THREAD_OMP
-        int env_cores = 0;
-        if (getenv("OMP_NUM_THREADS") != nullptr) {
-            try {
-                env_cores = std::stoi(std::string(getenv("OMP_NUM_THREADS")));
-            } catch (...) {
-                env_cores = 0;
-            }
-        }
-#if !(defined(__APPLE__) || defined(_WIN32))
-        OpenMpManager::setGpuDisabled();
-        OpenMpManager::bindOpenMpThreads(env_cores);
-#else
-        int num_cores = env_cores == 0 ? OpenMpManager::getOpenMpThreadNumber() : env_cores;
-        parallel_set_num_threads(num_cores);
-#endif
-#endif
-        alreadyBind = true;
-    }
-}
-
-void MKLDNNGraph::CreateGraph(ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
+void MKLDNNGraph::CreateGraph(const ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
     if (IsReady()) {
         ForgetGraphData();
     }
 
-    if (config.useThreadBinding) BindThreads(eng);
-
     // go over the inputs and create input primitives
     InputsDataMap inputs;
     network.getInputsInfo(inputs);
@@ -273,6 +259,9 @@ void MKLDNNGraph::CreateGraph(ICNNNetwork &network, const MKLDNNExtensionManager
 
     CreatePrimitives();
 
+    // Will do it before cleanup. Because it will lose original layers information
+    if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_init.dot");
+
     for (auto &graphNode : graphNodes) {
         graphNode->cleanup();
     }
@@ -378,15 +367,31 @@ void MKLDNNGraph::ParseNode(const CNNLayerPtr& cnnLayer, MKLDNNNodePtr& parent,
     if (exists)
         return;
 
+    if (cnnLayer->blobs.find("ext-scale") != cnnLayer->blobs.end())
+        node->ext_scales = cnnLayer->blobs["ext-scale"];
+
     graphNodes.push_back(node);
 
     size_t count_out = 0;
+    std::vector<ParsedLayer> remaining;
     for (const auto &layer : cnnLayer->outData) {
+        bool first = true;
         for (const auto &data : layer->getInputTo()) {
-            queuelayers.push_back({node, data.second, count_out});
+            if (first) {
+                queuelayers.push_back({node, data.second, count_out});
+                first = false;
+            } else {
+                // TODO: Just to hide bug with port ordering.
+                //       At first step we visit only first connection
+                //       at port. As second we will visit all remaining.
+                //
+                //       Not first connection to the port are stored here
+                remaining.push_back({node, data.second, count_out});
+            }
         }
         count_out++;
     }
+    queuelayers.insert(queuelayers.end(), remaining.begin(), remaining.end());
 }
 
 void MKLDNNGraph::InitNodes() {
@@ -416,58 +421,6 @@ void MKLDNNGraph::InitEdges() {
         if (MKLDNNMemoryDesc(parentDesc).getFormat() != MKLDNNMemoryDesc(childDesc).getFormat()) {
             inArgs += (inArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(parentDesc).getFormat());
             outArgs += (outArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(childDesc).getFormat());
-        } else if (inArgs.empty() && outArgs.empty()) {
-            // This detailed name disabled by request from ICV team
-#if 0
-            auto parentBlk = parentDesc.getBlockingDesc();
-            auto childBlk = childDesc.getBlockingDesc();
-            std::string order_in, order_out, stride_in, stride_out, dims_in, dims_out, off_in, off_out;
-            for (size_t i = 0; i < parentBlk.getBlockDims().size(); i++) {
-                if (i) {
-                    stride_in += ",";
-                    order_in += ",";
-                    dims_in += ",";
-                    off_in += ",";
-                }
-                stride_in += std::to_string(parentBlk.getStrides()[i]);
-                order_in += std::to_string(parentBlk.getOrder()[i]);
-                dims_in += std::to_string(parentBlk.getBlockDims()[i]);
-                off_in += std::to_string(parentBlk.getOffsetPaddingToData()[i]);
-            }
-            for (size_t i = 0; i < childBlk.getBlockDims().size(); i++) {
-                if (i) {
-                    stride_out += ",";
-                    order_out += ",";
-                    dims_out += ",";
-                    off_out += ",";
-                }
-                stride_out += std::to_string(childBlk.getStrides()[i]);
-                order_out += std::to_string(childBlk.getOrder()[i]);
-                dims_out += std::to_string(childBlk.getBlockDims()[i]);
-                off_out += std::to_string(childBlk.getOffsetPaddingToData()[i]);
-            }
-
-            if (parentBlk.getOffsetPadding() != childBlk.getOffsetPadding()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("off:") + std::to_string(parentBlk.getOffsetPadding());
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("off:") + std::to_string(childBlk.getOffsetPadding());
-            }
-            if (parentBlk.getStrides() != childBlk.getStrides()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("str:") + stride_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("str:") + stride_out;
-            }
-            if (parentBlk.getOrder() != childBlk.getOrder()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("ord:") + order_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("ord:") + order_out;
-            }
-            if (parentBlk.getBlockDims() != childBlk.getBlockDims()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("dim:") + dims_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("dim:") + dims_out;
-            }
-            if (parentBlk.getOffsetPaddingToData() != childBlk.getOffsetPaddingToData()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("offs:") + off_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("offs:") + off_out;
-            }
-#endif
         }
         return inArgs + "_" + outArgs;
     };
@@ -529,7 +482,7 @@ static inline bool isConstOutput(MKLDNNEdgePtr edge) {
 void MKLDNNGraph::AllocateWithReuse() {
     std::vector<std::vector<MKLDNNEdgePtr>> edge_clasters;
 
-    // detect edge clasters which are view on one.
+    // detect edge clusters which are view on one.
     for (auto &edge : graphEdges) {
         MKLDNNEdgePtr par = (edge->getStatus() == MKLDNNEdge::Status::NotAllocated)
                             ? edge->getSharedEdge()
@@ -606,7 +559,7 @@ void MKLDNNGraph::AllocateWithReuse() {
             int e_size = block_desk.getOffsetPadding() + 1;  // size in elements (from begin of data to last element)
             for (int j = 0; j < block_desk.getBlockDims().size(); j++)
-                e_size += (block_desk.getBlockDims()[j] - 1 ) * block_desk.getStrides()[j];
+                e_size += (block_desk.getBlockDims()[j] - 1) * block_desk.getStrides()[j];
 
             box.start = std::min(e_start, box.start);
             box.finish = std::max(e_finish, box.finish);
@@ -754,139 +707,9 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
             MB_to_process = std::min<int>(config.batchLimit, MB_to_process);
         size_t size_to_copy = intr_blob.GetSize() * MB_to_process / MB;
 
-        memcpy(ext_blob_ptr, intr_blob_ptr, size_to_copy);
-    }
-}
-
-#ifdef DEBUG_BMP_OUTPUT
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include "../../thirdparty/stb_lib/stb_image_write.h"
-
-#if defined(_WIN32)
-#define mkdir(dir, mode) _mkdir(dir)
-#endif
-
-void dump_as_bitmaps(const std::string name, const float* data,
-                     const SizeVector& cdims,
-                     mkldnn::impl::memory_format_t format = mkldnn::impl::memory_format::nchw) {
-    std::string dir_name = name + "_bmp_dir/";
-    mkdir(dir_name.c_str(), 0755);
-
-    std::ofstream layer_bmp_log;
-    layer_bmp_log.open(dir_name + "bmp_dump_log.txt");
-    layer_bmp_log << "Format " << format << std::endl;
-
-    if (cdims.size() == 1) {
-        layer_bmp_log << "Only one dimension: " << cdims[0] << std::endl;
-        layer_bmp_log.close();
-        return;
-    }
-
-    SizeVector dims(cdims.rbegin(), cdims.rend());
-
-    size_t x = dims[0], y = dims[1], total_images = 1;
-    size_t img_sz = x*y;
-
-    for (size_t k = 0; k < dims.size(); ++k)
-        if (dims[k])
-            total_images *= dims[k];
-
-    total_images /= img_sz;
-
-    // sanity checks
-    if (img_sz < 100) {
-        layer_bmp_log << "Image size is too small" << std::endl;
-        layer_bmp_log.close();
-        return;
-    } else if (x < 10 || y < 10 || x > 2048 || y > 2048) {
-        layer_bmp_log << "Dimensions are unapropriate to dump - " << y << "x" << x << std::endl;
-        layer_bmp_log.close();
-        return;
-    } else {
-        float ratio = static_cast<float>(x) / static_cast<float>(y);
-        if (ratio < 1.0) ratio = 1.0 / ratio;
-
-        if (ratio > 8.f) {
-            layer_bmp_log << "Suspicious aspect ratio - " << ratio << std::endl;
-            layer_bmp_log.close();
-            return;
-        }
-    }
-
-    layer_bmp_log << total_images << " images to write ..." << std::endl;
-
-    const float* dataPtr = data;
-    for (size_t img = 0; img < total_images; img++) {
-        std::string img_name = "img" + std::to_string(img) + ".bmp";
-
-        // copy image plane to separate buffer,
-        // normalize and convert to 3-channel 8-bit bmp
-        std::vector<float> imgbuf(img_sz);
-        int stride = 1;
-        switch (format) {
-        case mkldnn::impl::memory_format::nChw8c:
-            stride = 8;
-            break;
-        case mkldnn::impl::memory_format::nChw16c:
-            stride = 16;
-            break;
-        case mkldnn::impl::memory_format::nchw:
-        default:
-            break;
-        }
-
-        float maxval = -FLT_MAX, minval = FLT_MAX;
-        for (size_t i = 0; i < y; i++)
-            for (size_t j = 0; j < x; j++) {
-                float val = dataPtr[(i*x + j) * stride];
-                if (val > maxval) maxval = val;
-                if (val < minval) minval = val;
-                imgbuf[i*x + j] = val;
-            }
-
-        if (minval >= 0.f && maxval <= 0.f) {
-            layer_bmp_log << img_name << " all zero." << std::endl;
-        } else {
-            const float mult = 256.f / (maxval - minval);
-            std::vector<unsigned char> bmpbuf(img_sz * 3);
-            unsigned char* bmp_ptr = bmpbuf.data();
-
-            for (int i = 0; i < imgbuf.size(); i++, bmp_ptr += 3) {
-                if (imgbuf[i] >= 0.f && imgbuf[i] <= 0.f) {
-                    bmp_ptr[0] = 65;
-                    bmp_ptr[1] = bmp_ptr[2] = 0;
-                } else {
-                    bmp_ptr[0] = bmp_ptr[1] = bmp_ptr[2] = (unsigned char)((imgbuf[i] - minval) * mult);
-                }
-            }
-
-            // write bmp file
-            std::string full_name = dir_name + img_name;
-            stbi_write_bmp(full_name.c_str(), x, y, 3, (const void *)bmpbuf.data());
-        }
-
-        switch (format) {
-        case mkldnn::impl::memory_format::nChw8c:
-            if ( ( img & 7 ) < 7 ) dataPtr++;
-            else dataPtr += img_sz * 8;
-            break;
-        case mkldnn::impl::memory_format::nChw16c:
-            if ( ( img & 15 ) < 15 ) dataPtr++;
-            else dataPtr += img_sz * 16;
-            break;
-        case mkldnn::impl::memory_format::nchw:
-        default:
-            dataPtr += img_sz;
-            break;
-        }
+        ie_memcpy(ext_blob_ptr, ext_blob->byteSize(), intr_blob_ptr, size_to_copy);
     }
-
-    layer_bmp_log.close();
 }
-#endif
 
 void MKLDNNGraph::Infer(int batch) {
     if (!IsReady()) {
     }
 
     mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
-#ifdef DEBUG_DUMP_NEW_FOLDER_PER_INFER
-    static int folderIdx = 0;
-    folderIdx++;
-#endif
     for (int i = 0; i < graphNodes.size(); i++) {
         PERF(graphNodes[i]);
 
         if (batch > 0)
             graphNodes[i]->setDynamicBatchLim(batch);
 
+        ENABLE_DUMP(do_before(DUMP_DIR, graphNodes[i]));
+
         if (!graphNodes[i]->isConstant()) {
             IE_PROFILING_AUTO_SCOPE_TASK(graphNodes[i]->profilingTask)
             graphNodes[i]->execute(stream);
         }
-#ifdef DEBUG_DUMP_PATH
-        {
-            auto folderName = std::string(DEBUG_DUMP_PATH) +
-#ifdef DEBUG_DUMP_NEW_FOLDER_PER_INFER
-                std::to_string(folderIdx - 1) +
-#endif
-                "/";
-            std::cout << "Try to create logs for " << graphNodes[i]->getName() << std::endl;
-            std::string nodeName = graphNodes[i]->name;
-            std::replace(nodeName.begin(), nodeName.end(), '/', '_');
-            std::ofstream layer_data_dump;
-            for (size_t j = 0; j < graphNodes[i]->getChildEdges().size(); j++) {
-                auto childEdge = graphNodes[i]->getChildEdgeAt(j);
-                std::string childName = graphNodes[i]->getChildEdgeAt(j)->getChild()->getName();
-                std::replace(childName.begin(), childName.end(), '/', '_');
-
-                // std::string fname = DEBUG_DUMP_PATH + nodeName + "_dst_" + childName + "_" + std::to_string(j) + ".txt";
-                std::string tname = folderName + nodeName + "_dst_" + childName + "_" + std::to_string(j);
-                std::string fname = tname + ".txt";
-                if (graphNodes[i]->getChildEdges().size() == 1) {
-                    fname = folderName + nodeName + "_dst.txt";
-                }
-                layer_data_dump.open(fname);
-                if (layer_data_dump.is_open()) {
-                    float *data = static_cast<float *>(childEdge->getMemory().GetData());
-                    mkldnn::impl::memory_desc_wrapper dst_d(childEdge->getMemory().GetDescriptor().data);
-#ifdef DEBUG_BMP_OUTPUT
-                    dump_as_bitmaps(tname, data, childEdge->getDims().ToSizeVector(), dst_d.format());
-#endif
-
-                    layer_data_dump << "shape: ";
-                    for (size_t d = 0; d < childEdge->getDims().ndims(); d++)
-                        layer_data_dump << childEdge->getDims()[d] << " ";
-                    layer_data_dump << "(" << dst_d.nelems() << ")" << std::endl;
-                    if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::FP32) {
-                        float *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << std::fixed << std::setprecision(3) << data[dst_d.off_l(bs)] << std::endl;
-                        }
-                    } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::I8) {
-                        int8_t *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::U8) {
-                        uint8_t *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::I32) {
-                        int32_t *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
-                        }
-                    }
-
-                    layer_data_dump.close();
-                } else {
-                    std::cout << "Cannot create file " << fname << std::endl;
-                }
-            }
-
-            for (size_t p = 0 ; p < graphNodes[i]->getParentEdges().size(); p++) {
-                auto parentEdge = graphNodes[i]->getParentEdgeAt(p);
-                auto parent = parentEdge->getParent();
-                std::string parentName = parent->getName();
-                std::replace(parentName.begin(), parentName.end(), '/', '_');
-                // std::string fname = folderName + nodeName + "_src_" + parentName + "_" + std::to_string(p) + ".txt";
-                std::string tname = folderName + nodeName + "_src_" + parentName + "_" + std::to_string(p);
-                std::string fname = tname + ".txt";
-                layer_data_dump.open(fname);
-                if (layer_data_dump.is_open()) {
-                    size_t dataSize = graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetSize();
-                    mkldnn::impl::memory_desc_wrapper src_d(graphNodes[i]->getParentEdges()[p]
-                            .lock()->getMemory().GetDescriptor().data);
-#ifdef DEBUG_BMP_OUTPUT
-                    dump_as_bitmaps(tname, data, parentEdge->getDims().ToSizeVector(), src_d.format());
-#endif
-                    layer_data_dump << "shape: ";
-                    for (size_t d = 0; d < parentEdge->getDims().ndims(); d++)
-                        layer_data_dump << parentEdge->getDims()[d] << " ";
-                    layer_data_dump << "(" << src_d.nelems() << ")" << std::endl;
-                    auto precision = graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision();
-                    if (precision == Precision::FP32) {
-                        float *data = static_cast<float *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << std::fixed << std::setprecision(3) << data[src_d.off_l(bs)] << std::endl;
-                        }
-                    } else if (precision == Precision::I8) {
-                        int8_t *data = static_cast<int8_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision() == Precision::U8) {
-                        uint8_t *data = static_cast<uint8_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision() == Precision::I32) {
-                        int32_t *data = static_cast<int32_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
-                        }
-                    } else {
-                        layer_data_dump << "Unsupported precision: " << precision.name() << std::endl;
-                    }
-
-                    layer_data_dump.close();
-                } else {
-                    std::cout << "Cannot create file " << fname << std::endl;
-                }
-            }
-
-            GenericLayer* genericLayer = dynamic_cast<GenericLayer*>(graphNodes[i]->getCnnLayer().get());
-            if (genericLayer != nullptr) {
-                for (auto blob : genericLayer->blobs) {
-                    layer_data_dump.open(folderName + nodeName + "_blob-" + blob.first + ".txt");
-                    if (layer_data_dump.is_open()) {
-                        layer_data_dump << "shape: ";
-                        for (size_t d = 0; d < blob.second->dims().size(); d++)
-                            layer_data_dump << blob.second->dims()[d] << " ";
-                        layer_data_dump << "(" << blob.second->size() << ")" << std::endl;
-                        if (blob.second->getTensorDesc().getPrecision() == Precision::FP32) {
-                            float *data = blob.second->buffer();
-                            for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                                layer_data_dump << std::fixed << std::setprecision(3) << data[bs] << std::endl;
-                            }
-                        } else if (blob.second->getTensorDesc().getPrecision() == Precision::I8) {
-                            int8_t *data = blob.second->buffer();
-                            for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                                layer_data_dump << static_cast<int>(data[bs]) << std::endl;
-                            }
-                        } else if (blob.second->getTensorDesc().getPrecision() == Precision::U8) {
-                            uint8_t *data = blob.second->buffer();
-                            for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                                layer_data_dump << static_cast<int>(data[bs]) << std::endl;
-                            }
-                        } else if (blob.second->getTensorDesc().getPrecision() == Precision::I32) {
-                            int32_t *data = blob.second->buffer();
-                            for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                                layer_data_dump << static_cast<int>(data[bs]) << std::endl;
-                            }
-                        } else {
-                            layer_data_dump << "Unsupported precision: " << blob.second->getTensorDesc().getPrecision().name() << std::endl;
-                        }
-                        layer_data_dump.close();
-                    } else {
-                        std::cout << "Cannot create file " << folderName << nodeName
-                                  << "_" << blob.first << ".txt" << std::endl;
-                    }
-                }
-            }
-        }
-#endif
+
+        ENABLE_DUMP(do_after(DUMP_DIR, graphNodes[i]));
     }
 }
@@ -1153,6 +821,8 @@ void MKLDNNGraph::GetPerfData(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const {
     for (int i = 1; i < graphNodes.size(); i++) {
         getPerfMapFor(perfMap, graphNodes[i]);
     }
+
+    if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_perf.dot");
 }
 
 void MKLDNNGraph::setConfig(const Config &cfg) {
@@ -1257,7 +927,56 @@ void MKLDNNGraph::RemoveDroppedEdges() {
     }
 }
 
-bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network) const {
+void MKLDNNGraph::dumpToDotFile(std::string file) const {
+    std::ofstream dot;
+    dot.open(file);
+    if (!dot.is_open()) THROW_IE_EXCEPTION << "CPU Plugin cannot create dot file " << file << ".";
+
+    dump_graph_as_dot(*this, dot);
+    dot.close();
+}
+
+void MKLDNNGraph::do_before(const std::string &dir, const MKLDNNNodePtr &node) {
+    auto exec_order = std::to_string(node->execIndex);
std::string nodeName = node->name; + std::replace(nodeName.begin(), nodeName.end(), '/', '_'); + + auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size(); + for (size_t i = 0; i < num_ports; i++) { + auto prEdge = node->getParentEdgeAt(i); + auto pr = prEdge->getParent(); + + auto dump_file = dir + "/#" + exec_order + "_" + nodeName + "_in" + std::to_string(i) + ".ieb"; + TensorDesc desc = prEdge->getDesc(); + Blob::Ptr blob = make_blob_with_precision(desc, prEdge->getMemoryPtr()->GetData()); + + BlobDumper dumper(blob); + if (pr->ext_scales) dumper.withScales(pr->ext_scales); + dumper.dump(dump_file); + } +} + +void MKLDNNGraph::do_after(const std::string &dir, const MKLDNNNodePtr &node) { + auto exec_order = std::to_string(node->execIndex); + auto nodeName = node->name; + std::replace(nodeName.begin(), nodeName.end(), '/', '_'); + + auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size(); + for (size_t i = 0; i < num_ports; i++) { + auto childEdge = node->getChildEdgeAt(i); + + auto dump_file = dir + "/#" + exec_order + "_" + nodeName + "_out" + std::to_string(i) + ".ieb"; + TensorDesc desc = childEdge->getDesc(); + Blob::Ptr blob = make_blob_with_precision(desc, childEdge->getMemoryPtr()->GetData()); + + BlobDumper dumper(blob); + if (node->ext_scales) dumper.withScales(node->ext_scales); + + dumper.dump(dump_file); + } +} + +bool MKLDNNExecNetwork::CanProcessDynBatch(const InferenceEngine::ICNNNetwork &network) const { InputsDataMap inputs; network.getInputsInfo(inputs); @@ -1274,6 +993,11 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network bool check_result = true; details::UnorderedDFS(allLayers, secondLayers.begin()->second, [&](CNNLayerPtr layer) { auto type = TypeFromName(layer->type); + // This is WA for Tile layer + auto tileLayer = dynamic_cast<TileLayer *>(layer.get()); + if (tileLayer && tileLayer->axis) + return; + if (type != Input && type != Output && type != Convolution && @@ -1283,6 +1007,7 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network type != Lrn && type != Pooling && type != FullyConnected && + type != Gemm && type != SoftMax && type != Split && type != Concatenation && @@ -1301,55 +1026,87 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network InferenceEngine::InferRequestInternal::Ptr MKLDNNExecNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs, InferenceEngine::OutputsDataMap networkOutputs) { - return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs); + if (graphs.size() > 1) // streams uses special requests that are not connected to graphs + return std::make_shared<MKLDNNGraphlessInferRequest>(networkInputs, networkOutputs); + else + return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs); } -MKLDNNExecNetwork::MKLDNNExecNetwork(InferenceEngine::ICNNNetwork &network, +MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network, const Config &cfg, const MKLDNNExtensionManager::Ptr& extMgr) : extensionManager(extMgr) { - graph.reset(new MKLDNNGraph()); - graph->setConfig(cfg); + ICNNNetworkStats* pstats = nullptr; + StatusCode s = network.getStats(&pstats, nullptr); + // we are cloning network if we have statistics and we can transform network + // in other case we pass original network. 
+ details::CNNNetworkImplPtr clonedNetwork; + if (s == StatusCode::OK && pstats && !pstats->isEmpty()) { + CNNNetworkInt8Normalizer cnnorm; + clonedNetwork = cloneNet(network); + cnnorm.NormalizeNetwork(*clonedNetwork, *pstats); + } + bool ti_proc_ok = !NetPass::CombineLSTMSeq(network) ? NetPass::UnrollTI(network) : true; + if (!ti_proc_ok) + THROW_IE_EXCEPTION << "Plugin doesn't support Tensor Iterator in pure form. " + "No TI optimization pattern has been applied successfully"; + if (cfg.batchLimit > 1) { // check topology for applicability - if (!CanProcessDynBatch(network)) { + if (!CanProcessDynBatch(clonedNetwork ? *clonedNetwork : network)) { THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!"; } } + // check whether any (affinity-related) envs are set and whether the user requested thread binding + const bool bPinningRequested = !check_env_variables() && cfg.useThreadBinding; + // general #threads logic + const int env_threads = parallel_get_env_threads(); + // streams need all (logical) cores, while the single-stream case needs just the physical cores (better for servers) + const int hw_cores = cfg.throughputStreams > 1 ? parallel_get_max_threads() : getNumberOfCPUCores(); + const int threads = cfg.threadsNum ? cfg.threadsNum : (env_threads ? env_threads : hw_cores); + const int threads_per_stream = std::max(1, threads/cfg.throughputStreams); + + // graph(s) initialization in taskExecutor threads (streams), in parallel (in case of streams) + std::vector<Task::Ptr> tasks; + + for (int n = 0; n < cfg.throughputStreams; n++) { + MKLDNNGraph::Ptr _graph = std::make_shared<MKLDNNGraph>(); + graphs.push_back(_graph); + auto task = std::make_shared<InferenceEngine::Task>([=, &cfg, &network]() { + _graph->CreateArena(threads_per_stream); + + if (bPinningRequested) { + _graph->CreateObserver(n, threads_per_stream); + } - if (graph->getProperty().exclusiveAsyncRequests) { - ExecutorManager *executorManager = ExecutorManager::getInstance(); - _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eCPU)); + _graph->setConfig(cfg); + _graph->CreateGraph(clonedNetwork ? *clonedNetwork : network, extensionManager); + if (cfg.throughputStreams > 1) // for streams, each worker thread has its own graph + MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph = _graph; + }); + tasks.push_back(task); } - // initialization in taskExecutor thread - auto task = std::make_shared<InferenceEngine::Task>([&]() { - // we are cloning network if we have statistics and we can transform network - // in other case we pass original network. Especially because LSTM networks - are not cloned properly
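A note on the stream sizing above: the thread budget is the explicit config value if set, otherwise the environment-provided count, otherwise the hardware core count, and it is split evenly across streams with a floor of one thread per stream. A self-contained sketch of that arithmetic; the function and parameter names are illustrative, not plugin API:

#include <algorithm>
#include <cstdio>

// Mirrors the thread-budget arithmetic above: an explicit config value wins,
// then the environment-provided count, then the hardware core count; the
// budget is divided evenly between streams and never drops below one.
static int threads_per_stream(int cfg_threads, int env_threads,
                              int hw_cores, int streams) {
    const int threads = cfg_threads ? cfg_threads : (env_threads ? env_threads : hw_cores);
    return std::max(1, threads / streams);
}

int main() {
    std::printf("%d\n", threads_per_stream(0, 0, 8, 4));  // 8 cores / 4 streams -> 2
    std::printf("%d\n", threads_per_stream(0, 0, 2, 4));  // floor of one per stream -> 1
}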
- ICNNNetworkStats* pstats = nullptr; - StatusCode s = network.getStats(&pstats, nullptr); - Xbyak::util::Cpu cpu; - // Enable int8 only for avx512 - if (s == StatusCode::OK && pstats && !pstats->isEmpty() && cpu.has(Xbyak::util::Cpu::tAVX512F)) { - details::CNNNetworkImplPtr clonnedNetwork = cloneNet(network); - CNNNetworkInt8Normalizer cnnorm; - cnnorm.NormalizeNetwork(*clonnedNetwork, *pstats); - graph->CreateGraph(*clonnedNetwork, extensionManager); - } else { - graph->CreateGraph(network, extensionManager); + if (cfg.throughputStreams > 1) { + // special executor with as many threads as requested #streams, each with its own initialization task + _taskExecutor = std::make_shared<MultiWorkerTaskExecutor>(tasks); + } else { + if (cfg.exclusiveAsyncRequests) { + // special case when all InferRequests are muxed into a single queue + ExecutorManager *executorManager = ExecutorManager::getInstance(); + _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eCPU)); } - }); - - _taskExecutor->startTask(task); - Task::Status sts = task->wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); - - if (sts == Task::TS_ERROR) task->checkException(); + _taskExecutor->startTask(tasks[0]); + Task::Status sts = tasks[0]->wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); + } + for (auto t : tasks) + t->checkException(); } void MKLDNNExecNetwork::setProperty(const std::map<std::string, std::string> &properties) { - if (graph) // TODO: graph field cannot be empty - graph->setProperty(properties); + for (auto g : graphs) + g->setProperty(properties); } void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) { @@ -1362,13 +1119,10 @@ void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr & asyncRequestImpl->SetPointerToPublicInterface(asyncRequest); - auto mkldnnSyncRequest = dynamic_cast<MKLDNNInferRequest *>(syncRequestImpl.get()); - if (!mkldnnSyncRequest) - THROW_IE_EXCEPTION << " Cannot get mkldnn sync request."; - mkldnnSyncRequest->SetGraph(graph); -} - -MKLDNNExecNetwork::~MKLDNNExecNetwork() { - graph.reset(); - extensionManager.reset(); + if (graphs.size() == 1) { // single-stream (legacy/hetero) case - single graph for all requests + auto mkldnnSyncRequest = dynamic_cast<MKLDNNInferRequest *>(syncRequestImpl.get()); + if (!mkldnnSyncRequest) + THROW_IE_EXCEPTION << " Cannot get mkldnn sync request."; + mkldnnSyncRequest->SetGraph(graphs[0]); + } } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h index d1fdb0fe9..de026b5ad 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +10,7 @@ #include <memory> #include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp> +#include "ie_parallel.hpp" #include "mkldnn_memory.h" #include "config.h" #include "perf_count.h" @@ -19,6 +19,7 @@ #include "mkldnn_node.h" #include "mkldnn_edge.h" #include "mkldnn_extension_utils.h" +#include "mkldnn_streams.h" namespace MKLDNNPlugin { @@ -48,7 +49,7 @@ public: void getInputBlobs(InferenceEngine::BlobMap &in_map); void getOutputBlobs(InferenceEngine::BlobMap &out_map); - void CreateGraph(InferenceEngine::ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr); + void CreateGraph(const
InferenceEngine::ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr); bool hasMeanImageFor(const std::string& name) { return _meanImages.find(name) != _meanImages.end(); @@ -81,6 +82,35 @@ public: void RemoveDroppedEdges(); void DropNode(const MKLDNNNodePtr& node); + void CreateArena(int threads_per_stream) { + #if IE_THREAD == IE_THREAD_OMP + omp_set_num_threads(threads_per_stream); + #elif IE_THREAD == IE_THREAD_TBB + ptrArena = std::unique_ptr<tbb::task_arena>(new tbb::task_arena(threads_per_stream)); + #endif + } + + void CreateObserver(int _stream_id, int _threads_per_stream, int _pinning_step = 1) { + #if IE_THREAD == IE_THREAD_TBB + ptrObserver + = std::unique_ptr<tbb::task_scheduler_observer>( + new pinning_observer(*ptrArena.get(), _stream_id, _threads_per_stream, _pinning_step)); + #else + cpu_set_t *process_mask = nullptr; + int ncpus = 0; + get_process_mask(ncpus, process_mask); + #if IE_THREAD == IE_THREAD_OMP + #pragma omp parallel for + for (int thread_index = 0; thread_index < _threads_per_stream; thread_index++) { + pin_thread_to_vacant_core(_stream_id * _threads_per_stream + thread_index, 1, ncpus, process_mask); + } + #elif IE_THREAD == IE_THREAD_SEQ + pin_thread_to_vacant_core(_stream_id * _threads_per_stream, 1, ncpus, process_mask); + #endif + CPU_FREE(process_mask); + #endif + } + protected: MKLDNNNodePtr FindNodeWithName(const std::string& name) const; void VisitNode(MKLDNNNodePtr node, std::vector<MKLDNNNodePtr>& sortedNodes); @@ -108,6 +138,10 @@ protected: std::map<std::string, MeanImage> _meanImages; + #if IE_THREAD == IE_THREAD_TBB + std::unique_ptr<tbb::task_arena> ptrArena; + std::unique_ptr<tbb::task_scheduler_observer> ptrObserver; + #endif mkldnn::engine eng; void InitNodes(); @@ -116,13 +150,15 @@ protected: void AllocateWithReuse(); void CreatePrimitives(); - void BreakEdgeInsertScaleShift(MKLDNNPlugin::MKLDNNEdgePtr edgeToBreak, - InferenceEngine::CNNLayerPtr ssCnnLayer); - void AddScaleShiftBeforeAndAfterInt8(InferenceEngine::CNNNetwork& net); + void do_before(const std::string &dir, const MKLDNNNodePtr &node); + void do_after(const std::string &dir, const MKLDNNNodePtr &node); friend class MKLDNNInferRequest; + friend class MKLDNNGraphlessInferRequest; + friend std::shared_ptr<InferenceEngine::ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph); private: + void dumpToDotFile(std::string file) const; struct ParsedLayer { MKLDNNNodePtr parent; InferenceEngine::CNNLayerPtr cnnLayer; @@ -142,18 +178,21 @@ public: void CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) override; - MKLDNNExecNetwork(InferenceEngine::ICNNNetwork &network, const Config &cfg, + MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network, const Config &cfg, const MKLDNNExtensionManager::Ptr& extMgr); - ~MKLDNNExecNetwork() override; + ~MKLDNNExecNetwork() { + graphs.clear(); + extensionManager.reset(); + } void setProperty(const std::map<std::string, std::string> &properties); protected: - MKLDNNGraph::Ptr graph; + std::vector<MKLDNNGraph::Ptr> graphs; MKLDNNExtensionManager::Ptr extensionManager; - bool CanProcessDynBatch(InferenceEngine::ICNNNetwork &network) const; + bool CanProcessDynBatch(const InferenceEngine::ICNNNetwork &network) const; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp new file mode 100644 index 000000000..ae24579f6 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp @@ 
-0,0 +1,207 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_graph_dumper.h" +#include "cnn_network_impl.hpp" +#include "ie_util_internal.hpp" + +#include <vector> +#include <string> +#include <memory> +#include <map> + +using namespace InferenceEngine; + +namespace MKLDNNPlugin { + +static void copy_node_metadata(const MKLDNNNodePtr &, CNNLayer::Ptr &); +static void drawer_callback(const InferenceEngine::CNNLayerPtr, ordered_properties &, ordered_properties &); + +CNNLayer::Ptr convert_node(const MKLDNNNodePtr &node) { + CNNLayer::Ptr layer(new CNNLayer({"name", "type", Precision::FP32})); + copy_node_metadata(node, layer); + + auto &cfg = node->getSelectedPrimitiveDescriptor()->getConfig(); + layer->insData.resize(cfg.inConfs.size()); + layer->outData.resize(cfg.outConfs.size()); + + return layer; +} + +std::shared_ptr<ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph) { + auto net = std::make_shared<details::CNNNetworkImpl>(); + + net->setPrecision(Precision::FP32); + net->setName("internal_cpu_graph"); + std::map<MKLDNNNodePtr, CNNLayerPtr> node2layer; + + // Copy all nodes to network + for (auto &node : graph.graphNodes) { + auto layer = convert_node(node); + node2layer[node] = layer; + net->addLayer(layer); + } + + // Copy all edges to network + for (auto &node : graph.graphNodes) { + auto pr = node2layer[node]; + auto ch_edges = node->getChildEdges(); + + for (int i = 0; i < ch_edges.size(); i++) { + auto edge = node->getChildEdgeAt(i); + int out_port = edge->getInputNum(); + int in_port = edge->getOutputNum(); + auto ch_node = edge->getChild(); + auto ch = node2layer[ch_node]; + + DataPtr data; + if (i < pr->outData.size()) { + std::string data_name = node->getName() + "_out" + std::to_string(i); + pr->outData[i] = std::make_shared<Data>(data_name, edge->getDesc()); + data = pr->outData[i]; + data->creatorLayer = pr; + } else { + data = pr->outData[0]; + } + + data->inputTo[ch->name] = ch; + ch->insData[in_port] = data; + } + } + + // Specify inputs data + for (auto kvp : graph.inputNodes) { + auto in_node = kvp.second; + auto in_layer = node2layer[in_node]; + + auto in_info = std::make_shared<InputInfo>(); + in_info->setInputData(in_layer->outData[0]); + net->setInputInfo(in_info); + } + + return net; +} + +void dump_graph_as_dot(const MKLDNNGraph &graph, std::ostream &out) { + auto dump_net = dump_graph_as_ie_net(graph); + InferenceEngine::saveGraphToDot(*dump_net, out, drawer_callback); +} + +//********************************** +// Special converters of meta data +//********************************** + +static std::map<Type, std::string> type_n2l { + {Unknown, "Unknown"}, + {Generic, "Unknown"}, + {Reorder, "Reorder"}, + {Copy, "Reorder"}, + {Input, "Input"}, + {Output, "Output"}, + {Convolution, "Conv"}, + {Deconvolution, "Deconv"}, + {Convolution_Sum, "Conv_Eltw"}, + {Convolution_Activation, "Conv_Activ"}, + {Convolution_Sum_Activation, "Conv_Eltw_Activ"}, + {Activation, "Activation"}, + {Depthwise, "Depthwise"}, + {Lrn, "Lrn"}, + {Pooling, "Pool"}, + {FullyConnected, "FC"}, + {SoftMax, "SoftMax"}, + {Split, "Split"}, + {Concatenation, "Concat"}, + {Power, "Power"}, + {Eltwise, "Eltwise"}, + {Crop, "Crop"}, + {Reshape, "Reshape"}, + {Tile, "Tile"}, + {SimplerNMS, "Proposal"}, + {ROIPooling, "ROIPooling"}, + {BatchNormalization, "BatchNorm"}, + {Flatten, "Flatten"}, + {Permute, "Permute"}, + {MemoryOutput, "MemoryIn"}, + {MemoryInput, "MemoryOut"} +}; + +static const std::string ORIGIN_NAMES = "origin"; 
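dump_graph_as_ie_net above rebuilds the executable graph as a CNNNetwork so that dump_graph_as_dot can delegate to saveGraphToDot; the GraphViz text it ultimately produces amounts to one record per node plus one arrow per edge. A reduced standalone sketch of such an emitter; the Node struct and emit_dot function are illustrative only:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct Node { std::string name; std::string fillcolor; };

// Emits a minimal GraphViz digraph: a filled record per node (cf. the
// GREEN/BLUE coloring in drawer_callback) and a directed edge per link.
static void emit_dot(const std::vector<Node> &nodes,
                     const std::vector<std::pair<int, int>> &edges,
                     std::ostream &out) {
    out << "digraph internal_cpu_graph {\n";
    for (size_t i = 0; i < nodes.size(); i++)
        out << "  n" << i << " [label=\"" << nodes[i].name
            << "\", style=filled, fillcolor=\"" << nodes[i].fillcolor << "\"];\n";
    for (const auto &e : edges)
        out << "  n" << e.first << " -> n" << e.second << ";\n";
    out << "}\n";
}

int main() {
    std::vector<Node> nodes = {{"Input", "#D9EAD3"}, {"Conv", "#D8D9F1"}};
    emit_dot(nodes, {{0, 1}}, std::cout);
}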
+static const std::string IMPL_TYPE = "impl"; +static const std::string PRECISION = "prec"; +static const std::string PERF_COUNTER = "perf"; + +static const std::string BLUE = "#D8D9F1"; +static const std::string GREEN = "#D9EAD3"; + +void copy_node_metadata(const MKLDNNNodePtr &node, CNNLayer::Ptr &layer) { + layer->type = type_n2l[node->getType()]; + layer->name = node->getName(); // Is ID + + if (node->getCnnLayer()) { + // Original layer names + std::vector<MKLDNNNodePtr> internal = node->getFusedWith(); + auto &merged = node->getMergeWith(); + internal.insert(internal.end(), merged.begin(), merged.end()); + + std::string orig_names = node->getCnnLayer()->name; + for (auto &sub_node : internal) + orig_names += " " + sub_node->getCnnLayer()->name; + + layer->params[ORIGIN_NAMES] = orig_names; + } + + // Implementation type name + layer->params[IMPL_TYPE] = node->getPrimitiveDescriptorType(); + + // Precision + // TODO: That is not fully correct mapping type to precision. + std::string precision = "FP32"; + auto desc = node->getSelectedPrimitiveDescriptor(); + if (desc == nullptr) { + THROW_IE_EXCEPTION << "Internal error - descriptor is empty"; + } + impl_desc_type impl_type = desc->getImplementationType(); + + if (impl_type == gemm_blas && + node->getParentEdgeAt(0)->getDesc().getPrecision() == Precision::U8) precision = "INT8"; + + if (impl_type & jit && impl_type & avx512 && + node->getParentEdgeAt(0)->getDesc().getPrecision() == Precision::U8) precision = "INT8"; + + layer->params[PRECISION] = precision; + + // Performance + if (node->PerfCounter().avg() != 0) { + layer->params[PERF_COUNTER] = std::to_string(node->PerfCounter().avg())+ " mcs"; + } +} + +void drawer_callback(const InferenceEngine::CNNLayerPtr layer, + ordered_properties &printed_properties, + ordered_properties &node_properties) { + const auto ¶ms = layer->params; + + // Implementation + auto impl = params.find(IMPL_TYPE); + if (impl != params.end()) { + printed_properties.push_back({"impl", impl->second}); + } + + // Original names + auto orig = params.find(ORIGIN_NAMES); + if (orig != params.end()) { + printed_properties.push_back({"originals", orig->second}); + } + + // Precision + auto prec = params.find(PRECISION); + if (prec != params.end()) { + printed_properties.push_back({"precision", prec->second}); + } + + // Set color + node_properties.push_back({"fillcolor", prec->second == "FP32" ? 
GREEN : BLUE}); +} + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h new file mode 100644 index 000000000..6ec5ffc45 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h @@ -0,0 +1,18 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ie_icnn_network.hpp" +#include "mkldnn_graph.h" + +#include <memory> + +namespace MKLDNNPlugin { + + void dump_graph_as_dot(const MKLDNNGraph &graph, std::ostream &out); + + std::shared_ptr<InferenceEngine::ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph); + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp index 3be1fbf23..6c88ebd6f 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -144,20 +143,27 @@ void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) { } void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) { + auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) { + for (auto a : algs) { + if (alg == a) { + return true; + } + } + return false; + }; + auto& graphNodes = graph.GetNodes(); - auto isFusingSupported = [&](MKLDNNNodePtr node) { - if (!node->getCnnLayer()) + auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) { + if (!activation->getCnnLayer()) return false; - auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get()); + auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get()); return activationNode && - (activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_relu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_elu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_logistic || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_bounded_relu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_clamp); + (activationNode->getAlgorithm() == eltwise_relu || + (conv->getCnnLayer()->precision == Precision::FP32 && + isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp}))); }; for (int i = 0; i < graphNodes.size(); i++) { @@ -172,13 +178,13 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) { if (conv->getChildEdges().size() == 1) { auto ch1 = conv->getChildEdgeAt(0)->getChild(); - if (isFusingSupported(ch1)) { + if (isFusingSupported(conv, ch1)) { fuse(ch1); if (ch1->getChildEdges().size() == 1) { auto ch2 = ch1->getChildEdgeAt(0)->getChild(); - if (isFusingSupported(ch2)) { + if (isFusingSupported(conv, ch2)) { fuse(ch2); graph.DropNode(ch2); } @@ -193,7 +199,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) { if (is_max_pool && pool->getChildEdges().size() == 1) { auto ch2 = pool->getChildEdgeAt(0)->getChild(); - if (isFusingSupported(ch2)) { + if (isFusingSupported(conv, ch2)) { fuse(ch2); graph.DropNode(ch2); } @@ -274,8 +280,12 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) { auto isSutableChildConvolution = [](MKLDNNNodePtr node) { auto* layer = 
dynamic_cast<ConvolutionLayer*>(node->getCnnLayer().get()); - auto allPads = getConvPaddings(*layer); + auto allPads = getPaddings(*layer); bool isSupportedParams = layer->_out_depth == layer->_group && + + layer->_out_depth != 1 && + // Depthwise convolution output should be multiple of 8 + layer->_kernel[X_AXIS] == 3 && layer->_kernel[Y_AXIS] == 3 && allPads.begin[X_AXIS] == 1 && allPads.begin[Y_AXIS] == 1 && layer->_dilation[X_AXIS] == 1 && layer->_dilation[Y_AXIS] == 1 && @@ -379,18 +389,25 @@ static bool is_data_dependency(const std::shared_ptr<MKLDNNNode> &parent, void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph) { std::vector<MKLDNNNodePtr> &graphNodes = graph.GetNodes(); - auto isFusingSupported = [&](MKLDNNNodePtr node) { - if (!node->getCnnLayer()) + auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) { + for (auto a : algs) { + if (alg == a) { + return true; + } + } + return false; + }; + + auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) { + if (!activation->getCnnLayer()) return false; - auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get()); + auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get()); return activationNode && - (activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_relu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_elu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_logistic || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_bounded_relu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_clamp); + (activationNode->getAlgorithm() == eltwise_relu || + (conv->getCnnLayer()->precision == Precision::FP32 && + isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp}))); }; for (auto &graphNode : graphNodes) { @@ -411,6 +428,10 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG auto mergedConv = (parent1->getType() == Convolution) ? parent1 : parent2; auto peerNode = (parent1->getType() == Convolution) ? parent2 : parent1; + if (peerNode->getType() == Convolution && mergedConv->getChildEdges().size() != 1) { + mergedConv = parent2; + peerNode = parent1; + } auto sum = graphNode; auto lastNode = sum; @@ -431,7 +452,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG if (!fuse_allowed) continue; if (graphNode->getChildEdges().size() == 1 && - isFusingSupported(graphNode->getChildEdgeAt(0)->getChild())) { + isFusingSupported(graphNode, graphNode->getChildEdgeAt(0)->getChild())) { auto relu_shared = graphNode->getChildEdgeAt(0)->getChild(); lastNode = relu_shared; mergedConv->setType(Convolution_Sum_Activation); @@ -472,29 +493,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG } } -/** - * Convert LSTM layer format with combined state blob - */ -void MKLDNNGraphOptimizer::SLTMTransform(MKLDNNGraph& graph) { - auto &all_nodes = graph.GetNodes(); - - for (auto &lstm : all_nodes) { - if (lstm->getType() != RNN) - continue; - - auto layer = lstm->getCnnLayer(); - auto in_datas = layer->insData; - auto out_datas = layer->outData; - - if (in_datas.size() == 3) { - assert(lstm->getParentEdges().size() == 3); - // Concatenate 2 states into one blob - // TODO: TBD - } else if ((in_datas.size() != 1)) { - THROW_IE_EXCEPTION << "Unsupported mode for LSTM cell. 
Expected two state blobs"; - } - } -} void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) { for (MKLDNNNodePtr& node : graph.GetNodes()) { @@ -520,8 +518,11 @@ void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) { void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) { std::set<MKLDNNNodePtr> processed; + std::vector<MKLDNNNodePtr> newNodes; for (MKLDNNNodePtr& node : graph.GetNodes()) { - if (processed.find(node) == processed.end() && node->getType() == Reorder && node->getChildEdgeAt(0)->getChild()->getType() == Reorder) { + if (processed.find(node) == processed.end() && node->getType() == Reorder + && node->getChildEdges().size() == 1 + && node->getChildEdgeAt(0)->getChild()->getType() == Reorder ) { auto nextNode = node->getChildEdgeAt(0)->getChild(); MKLDNNReorderNode* n = dynamic_cast<MKLDNNReorderNode*>(node.get()); MKLDNNReorderNode* nn = dynamic_cast<MKLDNNReorderNode*>(nextNode.get()); @@ -590,10 +591,13 @@ void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) { afterNode->getDesc(); graph.GetEdges().push_back(afterNode); - graph.GetNodes().push_back(newReorder); + newNodes.push_back(newReorder); graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), edge), graph.GetEdges().end()); } } + for (MKLDNNNodePtr& node : newNodes) { + graph.GetNodes().push_back(node); + } } void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) { @@ -603,7 +607,7 @@ void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) { auto cur = l->insData[0].lock(); if (cur == nullptr) { - THROW_IE_EXCEPTION << "[MKLDNN] shared_ptr l->insData[0].lock() returned nullptr"; + THROW_IE_EXCEPTION << "[MKLDNN] error - invalid input data"; } if (cur->precision != l->outData[0]->precision) { if (node->name.find("_iScaleShift_") != std::string::npos) { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h index d6fa323da..6818cc9ae 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp index 338ed7274..95e803925 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp @@ -1,10 +1,10 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // #include "mkldnn_infer_request.h" #include "mkldnn_extension_utils.h" +#include "mkldnn_streams.h" #include <vector> #include <string> #include <map> @@ -36,83 +36,97 @@ void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() { if (!graph || !graph->IsReady()) { THROW_IE_EXCEPTION << "Network not loaded."; } - - // execute input pre-processing. 
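A note on the DropDoubleReorders change above: the replacement reorder nodes are now collected in a separate newNodes vector and appended to graph.GetNodes() only after the traversal finishes, instead of growing the container that is being iterated. The hazard this avoids, in miniature; the values here are placeholders:

#include <iostream>
#include <vector>

int main() {
    std::vector<int> nodes = {1, 2, 3};
    std::vector<int> newNodes;

    // Pushing into 'nodes' inside this loop could reallocate its storage and
    // invalidate the reference 'n'; collecting into a side vector defers the
    // mutation until iteration is complete.
    for (int &n : nodes) {
        if (n % 2 == 1)           // stand-in for "this node needs a replacement"
            newNodes.push_back(n * 10);
    }
    for (int n : newNodes)        // safe: the first loop has finished
        nodes.push_back(n);

    for (int n : nodes) std::cout << n << " ";  // prints: 1 2 3 10 30
    std::cout << std::endl;
}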
- execDataPreprocessing(_inputs); - - changeDefaultPtr(); - // need to retain converted blobs until infer finish - std::vector<InferenceEngine::Blob::Ptr> convertedInputs; - for (auto input : _inputs) { - if (!_networkInputs[input.first]) { - THROW_IE_EXCEPTION << - "input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " - << input.first; - } - /*if (_networkInputs[input.first]->getInputPrecision() != input.second->precision()) { - THROW_IE_EXCEPTION << "Different input precision for input " << input.first - << " registered in IInferencePlugin::LoadNetwork network and IInferencePlugin::Infer. " - << _networkInputs[input.first]->getInputPrecision() << " vs " - << input.second->precision(); - }*/ + auto infer = [this] { + // execute input pre-processing. + execDataPreprocessing(_inputs); + + changeDefaultPtr(); + // need to retain converted blobs until infer finish + std::vector<InferenceEngine::Blob::Ptr> convertedInputs; + for (auto input : _inputs) { + if (!_networkInputs[input.first]) { + THROW_IE_EXCEPTION << + "input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " + << input.first; + } + /*if (_networkInputs[input.first]->getInputPrecision() != input.second->precision()) { + THROW_IE_EXCEPTION << "Different input precision for input " << input.first + << " registered in IInferencePlugin::LoadNetwork network and IInferencePlugin::Infer. " + << _networkInputs[input.first]->getInputPrecision() << " vs " + << input.second->precision(); + }*/ - InferenceEngine::Blob::Ptr iconv; - InferenceEngine::TBlob<float> *in_f = nullptr; - switch (input.second->precision()) { - case InferenceEngine::Precision::FP32: - pushInput<float>(input.first, input.second); - break; - case InferenceEngine::Precision::U16: - // U16 is unsupported by mkldnn, so here we convert the blob and send FP32 - iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( - InferenceEngine::Precision::FP32, - input.second->getTensorDesc().getLayout(), input.second->dims()); - convertedInputs.push_back(iconv); - iconv->allocate(); - in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); - InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get()); - pushInput<float>(input.first, iconv); - break; - case InferenceEngine::Precision::I16: - if (graph->hasMeanImageFor(input.first)) { - // If a mean image exists, we convert the blob and send FP32 - iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( - InferenceEngine::Precision::FP32, - input.second->getTensorDesc().getLayout(), input.second->dims()); - convertedInputs.push_back(iconv); - iconv->allocate(); - in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); - InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get()); - pushInput<float>(input.first, iconv); - } else { - // Instead we can send I16 directly - pushInput<int16_t>(input.first, input.second); - } - break; - case InferenceEngine::Precision::U8: - if (graph->hasMeanImageFor(input.first)) { - // If a mean image exists, we convert the blob and send FP32 + InferenceEngine::Blob::Ptr iconv; + InferenceEngine::TBlob<float> *in_f = nullptr; + switch (input.second->precision()) { + case InferenceEngine::Precision::FP32: + pushInput<float>(input.first, input.second); + break; + case InferenceEngine::Precision::I32: + pushInput<int32_t>(input.first, input.second); + break; + case InferenceEngine::Precision::I8: + pushInput<int8_t>(input.first, 
input.second); + break; + case InferenceEngine::Precision::U16: + // U16 is unsupported by mkldnn, so here we convert the blob and send FP32 iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( InferenceEngine::Precision::FP32, input.second->getTensorDesc().getLayout(), input.second->dims()); convertedInputs.push_back(iconv); iconv->allocate(); in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); - InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get()); + InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get()); pushInput<float>(input.first, iconv); - } else { - // Instead we can send I8 directly - pushInput<uint8_t>(input.first, input.second); - } - break; - default: - THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision(); + break; + case InferenceEngine::Precision::I16: + if (graph->hasMeanImageFor(input.first)) { + // If a mean image exists, we convert the blob and send FP32 + iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( + InferenceEngine::Precision::FP32, + input.second->getTensorDesc().getLayout(), input.second->dims()); + convertedInputs.push_back(iconv); + iconv->allocate(); + in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); + InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get()); + pushInput<float>(input.first, iconv); + } else { + // Instead we can send I16 directly + pushInput<int16_t>(input.first, input.second); + } + break; + case InferenceEngine::Precision::U8: + if (graph->hasMeanImageFor(input.first)) { + // If a mean image exists, we convert the blob and send FP32 + iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( + InferenceEngine::Precision::FP32, + input.second->getTensorDesc().getLayout(), input.second->dims()); + convertedInputs.push_back(iconv); + iconv->allocate(); + in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); + InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get()); + pushInput<float>(input.first, iconv); + } else { + // Instead we can send I8 directly + pushInput<uint8_t>(input.first, input.second); + } + break; + default: + THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision(); + } } - } - graph->Infer(m_curBatch); - graph->PullOutputData(_outputs); + graph->Infer(m_curBatch); + graph->PullOutputData(_outputs); + }; +#if IE_THREAD == IE_THREAD_TBB + auto_scope_observing observer(graph->ptrObserver); + // a TBB arena is made "this" for Infer call via executing lambda for the arena + graph->ptrArena->execute([&] { infer(); }); +#else + infer(); +#endif } void MKLDNNPlugin::MKLDNNInferRequest::GetPerformanceCounts( diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h index 313c4e045..6d88bc8d2 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp index ebbd86422..1821b88f8 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -169,8 
+168,22 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) { case f::OhIw16o4i: case f::OIhw4i16o4i: ndims = 4; break; - case f::goihw: + // DHW + case f::ncdhw: + case f::ndhwc: + case f::nCdhw8c: + case f::nCdhw16c: + case f::oidhw: + case f::OIdhw8i8o: + case f::OIdhw16i16o: + case f::OIdhw8o8i: + case f::OIdhw16o16i: + case f::OIdhw8i16o2i: + case f::Odhwi8o: + case f::Odhwi16o: + // Group HW case f::hwigo: + case f::goihw: case f::gOIhw8i8o: case f::gOIhw16i16o: case f::gOIhw8i16o2i: @@ -183,6 +196,15 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) { case f::Goihw8g: case f::Goihw16g: ndims = 5; break; + case f::goidhw: + case f::gOIdhw8i8o: + case f::gOIdhw16i16o: + case f::gOIdhw8i16o2i: + case f::gOdhwi8o: + case f::gOdhwi16o: + case f::gOIdhw8o8i: + case f::gOIdhw16o16i: + ndims = 6; break; case f::format_undef: ndims = 0; break; case f::any: @@ -197,8 +219,8 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) { } bool MKLDNNMemory::IsPlainFormat(memory::format format) { - std::vector<memory::format> plains = {memory::nc, memory::nchw, memory::nhwc, memory::chwn, - memory::oi, memory::io, memory::oihw, memory::ihwo, + std::vector<memory::format> plains = {memory::nc, memory::nchw, memory::ncdhw, memory::nhwc, memory::ndhwc, memory::chwn, + memory::oi, memory::io, memory::oihw, memory::oidhw, memory::ihwo, memory::goihw, memory::blocked}; @@ -217,13 +239,28 @@ memory::format MKLDNNMemory::GetPlainFormat(memory::dims dims) { return memory::x; case 2: return memory::nc; + case 3: + return memory::tnc; case 4: return memory::nchw; + case 5: + return memory::ncdhw; default: return memory::blocked; } } +InferenceEngine::Layout MKLDNNMemory::GetPlainLayout(memory::dims dims) { + switch (dims.size()) { + case 1: return Layout::C; + case 2: return Layout::NC; + case 3: return Layout::CHW; + case 4: return Layout::NCHW; + default: + return Layout::BLOCKED; + } +} + void MKLDNNMemory::CreateBlockingDesc(memory::desc &desc) { auto dims = desc.data.dims; int ndims = desc.data.ndims; @@ -262,6 +299,10 @@ memory::format MKLDNNMemory::Convert(const InferenceEngine::Layout layout) { return memory::nchw; case NHWC: return memory::nhwc; + case NCDHW: + return memory::ncdhw; + case NDHWC: + return memory::ndhwc; case CHW: return memory::tnc; case NC: @@ -294,6 +335,11 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) { case memory::nChw8c: return "nChw8c"; case memory::nChw16c: return "nChw16c"; + case memory::ncdhw: return "ncdhw"; + case memory::ndhwc: return "ndhwc"; + case memory::nCdhw8c: return "nCdhw8c"; + case memory::nCdhw16c: return "nCdhw16c"; + case memory::oihw: return "oihw"; case memory::ihwo: return "ihwo"; case memory::OIhw8i8o: return "OIhw8i8o"; @@ -306,8 +352,18 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) { case memory::Ohwi16o: return "Ohwi16o"; case memory::OhIw16o4i: return "OhIw16o4i"; + case memory::oidhw: return "oidhw"; + case memory::OIdhw8i8o: return "OIdhw8i8o"; + case memory::OIdhw16i16o: return "OIdhw16i16o"; + case memory::OIdhw8o8i: return "OIdhw8o8i"; + case memory::OIdhw16o16i: return "OIdhw16o16i"; + case memory::OIdhw8i16o2i: return "OIdhw8i16o2i"; + case memory::Odhwi8o: return "Odhwi8o"; + case memory::Odhwi16o: return "Odhwi16o"; + case memory::goihw: return "goihw"; case memory::hwigo: return "hwigo"; + case memory::hwio: return "hwio"; case memory::gOIhw8i8o: return "gOIhw8i8o"; case memory::gOIhw16i16o: return "gOIhw16i16o"; case 
memory::gOIhw8i16o2i: return "gOIhw8i16o2i"; @@ -317,6 +373,16 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) { case memory::gOIhw8o8i: return "gOIhw8o8i"; case memory::gOIhw16o16i: return "gOIhw16o16i"; case memory::gOhIw16o4i: return "gOhIw16o4i"; + + case memory::goidhw: return "goidhw"; + case memory::gOIdhw8i8o: return "gOIdhw8i8o"; + case memory::gOIdhw16i16o: return "gOIdhw16i16o"; + case memory::gOIdhw8i16o2i: return "gOIdhw8i16o2i"; + case memory::gOdhwi8o: return "gOdhwi8o"; + case memory::gOdhwi16o: return "gOdhwi16o"; + case memory::gOIdhw8o8i: return "gOIdhw8o8i"; + case memory::gOIdhw16o16i: return "gOIdhw16o16i"; + default: { THROW_IE_EXCEPTION << "Unknown data format."; } @@ -400,66 +466,96 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const { auto blkInfo = desc.data.layout_desc.blocking; auto offset = static_cast<size_t>(blkInfo.offset_padding); SizeVector offsetsForDims; + SizeVector dims = getDims().ToSizeVector(); switch (getFormat()) { case memory::format_undef: THROW_IE_EXCEPTION << "Cannot cast to tensor desc. Format is undefined!"; case memory::any: layout = Layout::ANY; - return TensorDesc(precision, getDims().ToSizeVector(), layout); + return TensorDesc(precision, dims, layout); case memory::x: layout = Layout::C; order = {0}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; break; case memory::oi: case memory::nc: layout = Layout::NC; order = {0, 1}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; break; case memory::tnc: layout = Layout::CHW; order = {0, 1, 2}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; break; case memory::ntc: layout = Layout::CHW; order = {1, 0, 2}; - blkDims = {static_cast<size_t>(getDims()[1]), - static_cast<size_t>(getDims()[0]), - static_cast<size_t>(getDims()[2])}; + blkDims = {static_cast<size_t>(dims[1]), + static_cast<size_t>(dims[0]), + static_cast<size_t>(dims[2])}; break; case memory::oihw: case memory::nchw: layout = Layout::NCHW; order = {0, 1, 2, 3}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; + break; + case memory::ncdhw: + layout = Layout::NCDHW; + order = {0, 1, 2, 3, 4}; + blkDims = dims; break; case memory::nhwc: layout = Layout::NHWC; order = {0, 2, 3, 1}; - blkDims = {static_cast<size_t>(getDims()[0]), - static_cast<size_t>(getDims()[2]), - static_cast<size_t>(getDims()[3]), - static_cast<size_t>(getDims()[1])}; + blkDims = {static_cast<size_t>(dims[0]), + static_cast<size_t>(dims[2]), + static_cast<size_t>(dims[3]), + static_cast<size_t>(dims[1])}; break; + case memory::ndhwc: + layout = Layout::NDHWC; + order = {0, 2, 3, 4, 1}; + blkDims = {static_cast<size_t>(dims[0]), + static_cast<size_t>(dims[2]), + static_cast<size_t>(dims[3]), + static_cast<size_t>(dims[4]), + static_cast<size_t>(dims[1])}; + break; + case memory::oIhw8i: case memory::nChw8c: order = {0, 1, 2, 3, 1}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; + blkDims[1] = blkDims[1] / 8 + (blkDims[1] % 8 ? 1 : 0); + blkDims.push_back(8); + layout = Layout::BLOCKED; + break; + case memory::nCdhw8c: + order = {0, 1, 2, 3, 4, 1}; + blkDims = dims; blkDims[1] = blkDims[1] / 8 + (blkDims[1] % 8 ? 1 : 0); blkDims.push_back(8); layout = Layout::BLOCKED; break; case memory::nChw16c: order = {0, 1, 2, 3, 1}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; + blkDims[1] = blkDims[1] / 16 + (blkDims[1] % 16 ? 
1 : 0); + blkDims.push_back(16); + layout = Layout::BLOCKED; + break; + case memory::nCdhw16c: + order = {0, 1, 2, 3, 4, 1}; + blkDims = dims; blkDims[1] = blkDims[1] / 16 + (blkDims[1] % 16 ? 1 : 0); blkDims.push_back(16); layout = Layout::BLOCKED; break; case memory::blocked: order.clear(); - blkDims = getDims().ToSizeVector(); + blkDims = dims; for (size_t i = 0; i < blkDims.size(); i++) { order.push_back(i); if ((i && blkInfo.strides[0][i - 1] < blkInfo.strides[0][i]) || blkInfo.block_dims[i] != 1) { @@ -478,14 +574,14 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const { SizeVector strides(blkDims.size()); - if (layout == Layout::NHWC || layout == Layout::CHW) { + if (layout == Layout::NHWC || layout == Layout::NDHWC || layout == Layout::CHW) { for (size_t i = 0; i < order.size(); i++) { strides[i] = static_cast<size_t>(blkInfo.strides[0][order[i]]); } } else { strides[blkDims.size() - 1] = 1; for (size_t i = 2; i <= order.size(); i++) { - if (blkDims.size() - i < getDims().ndims()) { + if (blkDims.size() - i < dims.size()) { strides[blkDims.size() - i] = static_cast<size_t>(blkInfo.strides[0][order[blkDims.size() - i]]); } else { strides[blkDims.size() - i] = strides[blkDims.size() - i + 1] * blkDims[blkDims.size() - i + 1]; @@ -494,13 +590,13 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const { } for (size_t i = 0; i < blkDims.size() && i < TENSOR_MAX_DIMS; i++) { - if (i < getDims().ndims()) + if (i < dims.size()) offsetsForDims.push_back(blkInfo.offset_padding_to_data[i]); else offsetsForDims.push_back(0); } - TensorDesc tensorDesc(precision, getDims().ToSizeVector(), {blkDims, order, offset, offsetsForDims, strides}); + TensorDesc tensorDesc(precision, dims, {blkDims, order, offset, offsetsForDims, strides}); tensorDesc.setLayout(layout); return tensorDesc; @@ -543,9 +639,15 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc): case NCHW: mkldnnFormat = memory::format::nchw; break; + case NCDHW: + mkldnnFormat = memory::format::ncdhw; + break; case NHWC: mkldnnFormat = memory::format::nhwc; break; + case NDHWC: + mkldnnFormat = memory::format::ndhwc; + break; case OIHW: mkldnnFormat = memory::format::oihw; break; @@ -553,6 +655,11 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc): mkldnnFormat = memory::format::x; break; case CHW: + if (order == SizeVector{0, 1, 2}) + mkldnnFormat = memory::format::tnc; + else if (order == SizeVector{1, 0, 2}) + mkldnnFormat = memory::format::ntc; + else mkldnnFormat = memory::format::blocked; break; case HW: @@ -560,32 +667,41 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc): mkldnnFormat = memory::format::nc; break; case BLOCKED: + mkldnnFormat = memory::format::blocked; if (realDims.ndims() == 1) { mkldnnFormat = memory::format::x; - break; } else if (realDims.ndims() == 2) { mkldnnFormat = memory::format::nc; - break; } else if (realDims.ndims() == 4) { if (order.size() == 5 && order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 1) { if (blkdDims[4] == 8) { mkldnnFormat = memory::format::nChw8c; - break; } else if (blkdDims[4] == 16) { mkldnnFormat = memory::format::nChw16c; - break; } } else if (order.size() == 4) { if (order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3) { mkldnnFormat = memory::format::nchw; - break; } else if (order[0] == 0 && order[1] == 2 && order[2] == 3 && order[3] == 1) { mkldnnFormat = memory::format::nhwc; - break; + } + } + } else if (realDims.ndims() == 5) { + if (order.size() == 6 && + order[0] == 0 && 
order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 4 && order[5] == 1) { + if (blkdDims[5] == 8) { + mkldnnFormat = memory::format::nCdhw8c; + } else if (blkdDims[5] == 16) { + mkldnnFormat = memory::format::nCdhw16c; + } + } else if (order.size() == 5) { + if (order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 4) { + mkldnnFormat = memory::format::ncdhw; + } else if (order[0] == 0 && order[1] == 2 && order[2] == 3 && order[3] == 4 && order[4] == 1) { + mkldnnFormat = memory::format::ndhwc; } } } - mkldnnFormat = memory::format::blocked; break; case CN: mkldnnFormat = memory::format::blocked; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_memory.h b/inference-engine/src/mkldnn_plugin/mkldnn_memory.h index a5329ee5f..37578e5ff 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_memory.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_memory.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -108,6 +107,7 @@ public: static bool IsPlainFormat(mkldnn::memory::format format); static mkldnn::memory::format GetPlainFormat(mkldnn::memory::dims dims); + static InferenceEngine::Layout GetPlainLayout(mkldnn::memory::dims dims); static bool isConsistant(mkldnn::memory::dims dims, mkldnn::memory::format format); static mkldnn::memory::format Convert(const InferenceEngine::Layout layout); diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index 7bda59d0d..73975b71e 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +9,8 @@ #include <vector> #include <string> #include <limits> +#include <cstdint> +#include <unordered_map> #include <nodes/mkldnn_batchnorm_node.h> #include <nodes/mkldnn_concat_node.h> @@ -17,6 +18,7 @@ #include <nodes/mkldnn_crop_node.h> #include <nodes/mkldnn_deconv_node.h> #include <nodes/mkldnn_eltwise_node.h> +#include <nodes/mkldnn_gemm_node.h> #include <nodes/mkldnn_fullyconnected_node.h> #include <nodes/mkldnn_generic_node.h> #include <nodes/mkldnn_input_node.h> @@ -35,8 +37,9 @@ #include <nodes/mkldnn_memory_node.hpp> #include <nodes/mkldnn_rnn.h> #include <mkldnn_types.h> - #include "mkldnn_extension_utils.h" +#include "mkldnn_plugin.h" +#include "ie_memcpy.h" using namespace mkldnn; using namespace MKLDNNPlugin; @@ -52,6 +55,7 @@ MKLDNNNode::Register<MKLDNNConvolutionNode> MKLDNNConvolutionNode::reg; MKLDNNNode::Register<MKLDNNCropNode> MKLDNNCropNode::reg; MKLDNNNode::Register<MKLDNNDeconvolutionNode> MKLDNNDeconvolutionNode::reg; MKLDNNNode::Register<MKLDNNEltwiseNode> MKLDNNEltwiseNode::reg; +MKLDNNNode::Register<MKLDNNGemmNode> MKLDNNGemmNode::reg; MKLDNNNode::Register<MKLDNNFullyConnectedNode> MKLDNNFullyConnectedNode::reg; MKLDNNNode::Register<MKLDNNInputNode> MKLDNNInputNode::reg; MKLDNNNode::Register<MKLDNNLrnNode> MKLDNNLrnNode::reg; @@ -358,6 +362,8 @@ std::vector<memory::format> MKLDNNNode::getAvailableFormatsForDims(const MKLDNND return {memory::format::nc}; else if (dims.ndims() == 4) return {memory::format::nchw, memory::format::nChw8c, memory::format::nChw16c}; + else if (dims.ndims() == 5) + return {memory::format::ncdhw, memory::format::nCdhw8c, memory::format::nCdhw16c}; return {memory::format::any}; } @@ -506,7 +512,7 @@ InferenceEngine::Blob::Ptr MKLDNNNode::createInternalBlob(InferenceEngine::SizeV size_t 
offset = blb->byteSize(); checkSize(intBuffSize, offset); - memcpy(data, blb->buffer(), blb->byteSize()); + ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), blb->byteSize()); data += blb->byteSize(); for (const auto &merged : getMergeWith()) { wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(merged->getCnnLayer().get()); @@ -519,7 +525,7 @@ InferenceEngine::Blob::Ptr MKLDNNNode::createInternalBlob(InferenceEngine::SizeV THROW_IE_EXCEPTION << "Cannot get internal blob layer for node " << getName() << "."; offset += blb->byteSize(); checkSize(intBuffSize, offset); - memcpy(data, blb->buffer(), blb->byteSize()); + ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), blb->byteSize()); data += blb->byteSize(); } @@ -545,13 +551,32 @@ void MKLDNNNode::prepareMemory(const PrimitiveDescInfo *selected_pd, mkldnn::pri internalBlobMemory.clear(); for (size_t i = 0; i < internalBlobs.size(); i++) { - auto& internalBlob = internalBlobs[i]; - internalBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(engine))); - - internalBlobMemory[i]->Create(intDescs[i]); - MKLDNNMemory memory(engine); - memory.Create(MKLDNNMemoryDesc(internalBlob->getTensorDesc()), internalBlob->buffer()); - internalBlobMemory[i]->SetData(memory); + const auto &internalBlob = internalBlobs[i]; + + const uint64_t data_hash = Engine::GetWeightsSharing().GetHashFunc().hash(internalBlob->buffer(), internalBlob->byteSize()); + const std::string string_hash = name + "_" + std::to_string(i) + + "_" + std::to_string(internalBlob->byteSize()) + + "_" + std::to_string(data_hash); + MKLDNNMemoryPtr ptr = + Engine::GetWeightsSharing().findOrCreate(string_hash, [&] () { + MKLDNNMemoryPtr _ptr = MKLDNNMemoryPtr(new MKLDNNMemory(engine)); + _ptr->Create(intDescs[i]); + MKLDNNMemory memory(engine); + + auto newDesc = MKLDNNMemoryDesc(internalBlob->getTensorDesc()); + auto newFormat = newDesc.getFormat(); + if (newFormat == mkldnn::memory::ncdhw) { + newFormat = mkldnn::memory::goihw; + } + if (newFormat == mkldnn::memory::nchw) { + newFormat = mkldnn::memory::oihw; + } + memory.Create(MKLDNNMemoryDesc(newDesc.getDims(), newDesc.getDataType(), newFormat), internalBlob->buffer()); + auto aformat = memory.GetFormat(); + _ptr->SetData(memory); + return _ptr; + }); + internalBlobMemory.push_back(ptr); } } @@ -648,6 +673,8 @@ std::string MKLDNNNode::typeToStr(Type type) { return "Pooling"; case FullyConnected: return "FullyConnected"; + case Gemm: + return "Gemm"; case SoftMax: return "SoftMax"; case Split: @@ -682,6 +709,9 @@ std::string MKLDNNNode::typeToStr(Type type) { return "MemoryInput"; case RNN: return "RNN"; + case LSTMCell: + return "LSTMCell"; + default: return "Unknown"; } @@ -838,17 +868,18 @@ InferenceEngine::TensorDesc MKLDNNNode::getConfiguredOutputDesc(const InferenceE void MKLDNNNode::initOptimalPrimitiveDescriptor() { auto config = getSelectedPrimitiveDescriptor()->getConfig(); - if (isInitConfig(config)) - return; - - for (size_t i = 0; i < config.inConfs.size(); i++) { - config.inConfs[i].desc = getConfiguredInputDesc(config, i); - } + if (!isInitConfig(config)) { + for (size_t i = 0; i < config.inConfs.size(); i++) { + config.inConfs[i].desc = getConfiguredInputDesc(config, i); + } - for (size_t i = 0; i < config.outConfs.size(); i++) { - config.outConfs[i].desc = getConfiguredOutputDesc(config, i); + for (size_t i = 0; i < config.outConfs.size(); i++) { + config.outConfs[i].desc = getConfiguredOutputDesc(config, i); + } + initDescriptor(config); + } else if (getType() != RNN && getType() != LSTMCell) { + 
initDescriptor(config); } - initDescriptor(config); } bool MKLDNNNode::isInitConfig(const InferenceEngine::LayerConfig& config) const { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index acfe8e167..fe71c665f 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -49,6 +48,7 @@ enum Type { Concatenation, Power, Eltwise, + Gemm, Crop, Reshape, Tile, @@ -60,6 +60,7 @@ enum Type { Copy, MemoryOutput, MemoryInput, + LSTMCell, RNN }; @@ -86,6 +87,7 @@ static Type TypeFromName(const std::string type) { { "Pooling", Pooling }, { "FullyConnected", FullyConnected }, { "InnerProduct", FullyConnected }, + { "Gemm", Gemm }, { "Softmax", SoftMax }, { "SoftMax", SoftMax }, { "Split", Split }, @@ -103,6 +105,7 @@ static Type TypeFromName(const std::string type) { { "Flatten", Flatten }, { "Permute", Permute }, { "Copy", Copy }, + { "LSTMCell", LSTMCell }, { "RNN", RNN }, { "MemoryInput", MemoryInput}, // for construction from name ctor, arbitrary name is used { "Memory", MemoryOutput }, // for construction from layer ctor @@ -191,6 +194,10 @@ public: return mergedWith; } + const std::vector <MKLDNNNodePtr> &getFusedWith() { + return fusedWith; + } + const std::string getName() const { return name; } @@ -317,7 +324,7 @@ protected: this->type = type; } - int getMaxBatch(); + virtual int getMaxBatch(); virtual InferenceEngine::TensorDesc getConfiguredInputDesc(const InferenceEngine::LayerConfig& config, size_t idx) const; virtual InferenceEngine::TensorDesc getConfiguredOutputDesc(const InferenceEngine::LayerConfig& config, size_t idx) const; @@ -350,6 +357,8 @@ protected: MKLDNNPrimitive prim; std::vector<MKLDNNDescriptor> descs; + InferenceEngine::Blob::Ptr ext_scales; + friend class MKLDNNEdge; friend class MKLDNNGraph; friend class MKLDNNGraphOptimizer; @@ -371,8 +380,9 @@ protected: public: Register() { Registry::RegisterNode( - Registry::CreatorByLayerFunction([](const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) -> MKLDNNNode * { - return new To(layer, eng); } ) ); + Registry::CreatorByLayerFunction( + [](const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) + -> MKLDNNNode* { return new To(layer, eng); } ) ); } }; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp index 3b51c974f..35a965afa 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +10,9 @@ using namespace MKLDNNPlugin; using namespace InferenceEngine; +MKLDNNWeightsSharing Engine::weightsSharing; +const SimpleDataHash MKLDNNWeightsSharing::simpleCRC; + InferenceEngine::ExecutableNetworkInternal::Ptr Engine::LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, const std::map<std::string, std::string> &config) { auto specifiedDevice = network.getTargetDevice(); @@ -25,8 +27,12 @@ Engine::LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, const std::map network.getInputsInfo(_networkInputs); for (auto ii : _networkInputs) { auto input_precision = ii.second->getInputPrecision(); - if (input_precision != InferenceEngine::Precision::U16 && input_precision != InferenceEngine::Precision::I16 - && input_precision != 
InferenceEngine::Precision::FP32 && input_precision != InferenceEngine::Precision::U8) { + if (input_precision != InferenceEngine::Precision::FP32 && + input_precision != InferenceEngine::Precision::I32 && + input_precision != InferenceEngine::Precision::U16 && + input_precision != InferenceEngine::Precision::I16 && + input_precision != InferenceEngine::Precision::I8 && + input_precision != InferenceEngine::Precision::U8) { THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str << "Input image format " << input_precision << " is not supported yet..."; } @@ -86,7 +92,7 @@ void Engine::QueryNetwork(const ICNNNetwork& network, const std::map<std::string INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(IInferencePlugin*& plugin, ResponseDesc *resp) noexcept { try { plugin = make_ie_compatible_plugin( - {{1, 4}, + {{1, 5}, #ifdef MKL_VERSION MKL_VERSION, #else diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h index 482405a16..383feaa21 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -8,11 +7,59 @@ #include "mkldnn_graph.h" #include <string> #include <map> +#include <unordered_map> #include <memory> +#include <functional> #include <cpp_interfaces/impl/ie_plugin_internal.hpp> namespace MKLDNNPlugin { +class SimpleDataHash { +public: + SimpleDataHash() { + for (int i = 0; i < kTableSize; i++) { + uint64_t c = i; + for (int j = 0; j < 8; j++) + c = ((c & 1) ? 0xc96c5795d7870f42 : 0) ^ (c >> 1); + table[i] = c; + } + } + // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182 + uint64_t hash(const unsigned char* data, size_t size) const { + uint64_t crc = 0; + for (size_t idx = 0; idx < size; idx++) + crc = table[(unsigned char)crc ^ data[idx]] ^ (crc >> 8); + + return ~crc; + } + +protected: + static const int kTableSize = 256; + uint64_t table[kTableSize]; +}; + +class MKLDNNWeightsSharing { +public: + MKLDNNMemoryPtr findOrCreate(const std::string& name_hash, + std::function<MKLDNNMemoryPtr(void)> create) { + std::unique_lock<std::mutex> lock(guard); + auto found = sharedWeights.find(name_hash); + + MKLDNNMemoryPtr ptr; + if (found == sharedWeights.end() || !(ptr = found->second.lock())) { + ptr = create(); + sharedWeights[name_hash] = ptr; + } + return ptr; + } + static const SimpleDataHash& GetHashFunc () { return simpleCRC; } + +protected: + std::unordered_map<std::string, std::weak_ptr<MKLDNNMemory>> sharedWeights; + std::mutex guard; + static const SimpleDataHash simpleCRC; +}; + class Engine : public InferenceEngine::InferencePluginInternal { public: Engine() = default; @@ -30,16 +77,20 @@ public: void SetConfig(const std::map<std::string, std::string> &config) override; /** - * @depricated Use the version with config parameter + * @deprecated Use the version with config parameter */ void QueryNetwork(const InferenceEngine::ICNNNetwork& network, InferenceEngine::QueryNetworkResult& res) const override; void QueryNetwork(const InferenceEngine::ICNNNetwork& network, const std::map<std::string, std::string>& config, InferenceEngine::QueryNetworkResult& res) const override; + static MKLDNNWeightsSharing& GetWeightsSharing() { return weightsSharing; } private: Config engConfig; MKLDNNExtensionManager::Ptr extensionManager = std::make_shared<MKLDNNExtensionManager>(); + +protected: + static MKLDNNWeightsSharing weightsSharing; }; } // namespace 
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp index 6fa73c52f..f9e59f2cc 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h index 5bf983496..075afff9e 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp new file mode 100644 index 000000000..a5198377c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp @@ -0,0 +1,372 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <string> +#include <map> +#include <vector> +#include <limits> +#include <chrono> +#include <climits> +#include <memory> + +#include "mkldnn_graph.h" +#include "ie_parallel.hpp" +#include "mkldnn_streams.h" + +using namespace mkldnn; +using namespace MKLDNNPlugin; +using namespace InferenceEngine; +using namespace InferenceEngine::details; + +namespace MKLDNNPlugin { + +thread_local MultiWorkerTaskContext MultiWorkerTaskExecutor::ptrContext; + +bool check_env_variables() { +#if IE_THREAD == IE_THREAD_OMP + return MKLDNNPlugin::cpu::checkOpenMpEnvVars(false); +#else + return false; +#endif +} + +#if !(defined(__APPLE__) || defined(_WIN32)) +/* Get the cores affinity mask for the current process */ +bool get_process_mask(int& ncpus, cpu_set_t*& mask) { + for (ncpus = sizeof(cpu_set_t) / CHAR_BIT; ncpus < 1024 /* reasonable limit of #cores*/; ncpus <<= 1) { + mask = CPU_ALLOC(ncpus); + if (!mask) return false; + + const size_t size = CPU_ALLOC_SIZE(ncpus); + CPU_ZERO_S(size, mask); + const int err = sched_getaffinity(getpid(), size, mask); + // the result fits the mask + if (!err) break; + // mask size is not enough + CPU_FREE(mask); + mask = NULL; + // other error + if (errno != EINVAL) break; + } + if (!mask) { + return false; + } + return true; +} +/* Pin current thread to a set of cores determined by the mask. */ +bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask) { + return 0 == sched_setaffinity(0, ncores, proc_mask); +} +/* Pin thread to a spare core in the round-robin scheme, while respecting the given process mask.
+ * The function can also handle the hyper-threading (by populating the physical cores first) */ +bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask) { + const size_t size = CPU_ALLOC_SIZE(ncores); + const int num_cpus = CPU_COUNT_S(size, proc_mask); + thr_idx %= num_cpus; // To limit the unique number to the [0; num_cpus-1] range + + // Place threads with specified step + int cpu_idx = 0; + for (int i = 0, offset = 0; i < thr_idx; ++i) { + cpu_idx += hyperthreads; + if (cpu_idx >= num_cpus) + cpu_idx = ++offset; + } + + // Find index of 'cpu_idx'-th bit that equals to 1 + int mapped_idx = -1; + while (cpu_idx >= 0) { + if (CPU_ISSET_S(++mapped_idx, size, proc_mask)) + --cpu_idx; + } + + cpu_set_t *target_mask = CPU_ALLOC(ncores); + CPU_ZERO_S(size, target_mask); + CPU_SET_S(mapped_idx, size, target_mask); + bool res = pin_current_thread_by_mask(size, target_mask); + CPU_FREE(target_mask); + return res; +} +#else // no threads pinning/binding on Win/MacOS +bool get_process_mask(int& ncpus, cpu_set_t*& mask) { + ncpus = 0; + mask = nullptr; + return false; +} +bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask) { + return false; +} +bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask) { + return false; +} +#endif // !(defined(__APPLE__) || defined(_WIN32)) + +MultiWorkerTaskExecutor::MultiWorkerTaskExecutor(const std::vector<Task::Ptr>& init_tasks, std::string name) : + _isStopped(false), _name(name), _initCount(0) { + for (auto t : init_tasks) { + _threads.push_back(std::thread([&, t] { + // initialization (no contention, every worker thread is doing its own task) + t->runNoThrowNoBusyCheck(); + _initCount++; + + while (!_isStopped) { + bool isQueueEmpty; + Task::Ptr currentTask = nullptr; + { // waiting for the new task or for stop signal + std::unique_lock<std::mutex> lock(_queueMutex); + _queueCondVar.wait(lock, [&]() { return !_taskQueue.empty() || _isStopped; }); + isQueueEmpty = _taskQueue.empty(); + if (!isQueueEmpty) { + currentTask = _taskQueue.front(); + _taskQueue.pop(); + isQueueEmpty = _taskQueue.empty(); + } + } + if (currentTask) + currentTask->runNoThrowNoBusyCheck(); + if (_isStopped) + break; + if (isQueueEmpty) // notify dtor, that all tasks were completed + _queueCondVar.notify_all(); + } + })); + } + while (_initCount != init_tasks.size()) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } +} + +MultiWorkerTaskExecutor::~MultiWorkerTaskExecutor() { + { + std::unique_lock<std::mutex> lock(_queueMutex); + if (!_taskQueue.empty()) { + _queueCondVar.wait(lock, [this]() { return _taskQueue.empty(); }); + } + _isStopped = true; + _queueCondVar.notify_all(); + } + for (auto& thread : _threads) { + if (thread.joinable()) { + thread.join(); + } + } +} + +bool MultiWorkerTaskExecutor::startTask(Task::Ptr task) { + if (!task->occupy()) return false; + std::unique_lock<std::mutex> lock(_queueMutex); + _taskQueue.push(task); + _queueCondVar.notify_one(); + return true; +} + +MKLDNNPlugin::MKLDNNGraphlessInferRequest::MKLDNNGraphlessInferRequest(InferenceEngine::InputsDataMap networkInputs, + InferenceEngine::OutputsDataMap networkOutputs) + : InferRequestInternal(networkInputs, networkOutputs), m_curBatch(-1) { + // Allocate all input blobs + for (const auto& it : networkInputs) { + InferenceEngine::Blob::Ptr blob; + GetBlob(it.first.c_str(), blob); + } + // Allocate all output blobs + for (const auto& it : networkOutputs) { + InferenceEngine::Blob::Ptr blob; + GetBlob(it.first.c_str(), blob); + } +}
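[Editor's note] The pin_thread_to_vacant_core() logic above first strides across physical cores (step = hyperthreads) and only wraps onto the hyper-thread siblings once the physical cores are used up. A pure-arithmetic sketch of that index mapping (hypothetical helper, no affinity syscalls), with a worked example in the comments:

    // Map a worker index to a slot in [0; num_cpus-1], striding by `step` so
    // that physical cores are populated before their hyper-thread siblings.
    // Example: num_cpus = 8, step = 2 -> workers 0..7 land on 0,2,4,6,1,3,5,7.
    int vacant_slot(int thr_idx, int step, int num_cpus) {
        thr_idx %= num_cpus;          // keep the index in [0; num_cpus-1]
        int slot = 0;
        for (int i = 0, offset = 0; i < thr_idx; ++i) {
            slot += step;
            if (slot >= num_cpus)     // wrapped: start the next sibling lane
                slot = ++offset;
        }
        return slot;                  // the real code then translates this to
    }                                 // the slot-th set bit of the process mask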
+ + +void MKLDNNPlugin::MKLDNNGraphlessInferRequest::InferImpl() { + IE_PROFILING_AUTO_SCOPE(MKLDNN_INFER) + + auto infer = [this] { + IE_ASSERT(MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph != nullptr); + MKLDNNGraph::Ptr graph = MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph; + if (!graph->IsReady()) + THROW_IE_EXCEPTION << "Network not loaded."; + if (m_curBatch > 0 && !graph->getProperty().enableDynamicBatch) + THROW_IE_EXCEPTION << "Dynamic batch is not enabled."; + + if (m_curBatch > graph->getProperty().batchLimit) + THROW_IE_EXCEPTION << "Invalid dynamic batch size " << m_curBatch << + " for this request."; + + // execute input pre-processing. + execDataPreprocessing(_inputs); + + // need to retain converted blobs until infer finish + std::vector<InferenceEngine::Blob::Ptr> convertedInputs; + for (auto input : _inputs) { + if (!_networkInputs[input.first]) { + THROW_IE_EXCEPTION << + "input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " + << input.first; + } + InferenceEngine::Blob::Ptr iconv; + InferenceEngine::TBlob<float> *in_f = nullptr; + switch (input.second->precision()) { + case InferenceEngine::Precision::FP32: + graph->PushInputData(input.first, input.second); + break; + case InferenceEngine::Precision::U16: + // U16 is unsupported by mkldnn, so here we convert the blob and send FP32 + iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( + InferenceEngine::Precision::FP32, + input.second->getTensorDesc().getLayout(), input.second->dims()); + convertedInputs.push_back(iconv); + iconv->allocate(); + in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); + InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get()); + graph->PushInputData(input.first, iconv); + break; + case InferenceEngine::Precision::I16: + if (graph->hasMeanImageFor(input.first)) { + // If a mean image exists, we convert the blob and send FP32 + iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( + InferenceEngine::Precision::FP32, + input.second->getTensorDesc().getLayout(), input.second->dims()); + convertedInputs.push_back(iconv); + iconv->allocate(); + in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); + InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get()); + graph->PushInputData(input.first, iconv); + } else { + // Instead we can send I16 directly + graph->PushInputData(input.first, input.second); + } + break; + case InferenceEngine::Precision::U8: + if (graph->hasMeanImageFor(input.first)) { + // If a mean image exists, we convert the blob and send FP32 + iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( + InferenceEngine::Precision::FP32, + input.second->getTensorDesc().getLayout(), input.second->dims()); + convertedInputs.push_back(iconv); + iconv->allocate(); + in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); + InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get()); + graph->PushInputData(input.first, iconv); + } else { + // Instead we can send U8 directly + graph->PushInputData(input.first, input.second); + } + break; + default: + THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision(); + } + } + graph->Infer(m_curBatch); + graph->PullOutputData(_outputs); + if (graph->getProperty().collectPerfCounters) { + m_perfMap.clear(); + graph->GetPerfData(m_perfMap); + } + };
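[Editor's note] The precision switch above either forwards a blob as-is or materializes a temporary FP32 copy (retained in convertedInputs until the inference finishes, since mkldnn reads the buffer during execution). A minimal sketch of that widen-to-float pattern under the same idea, using plain vectors instead of IE blobs (hypothetical helper):

    #include <cstdint>
    #include <vector>

    // U16 -> FP32 widening analogous to the copyToFloat<uint16_t> path above;
    // the returned buffer must outlive the inference call that consumes it.
    std::vector<float> widen_to_fp32(const std::vector<uint16_t>& src) {
        std::vector<float> dst(src.size());
        for (size_t i = 0; i < src.size(); ++i)
            dst[i] = static_cast<float>(src[i]);
        return dst;
    }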
+#if IE_THREAD == IE_THREAD_TBB + auto_scope_observing observer(MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph->ptrObserver); + // a TBB arena is made "this" for Infer call via executing lambda for the arena + MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph->ptrArena->execute([&] { infer(); }); +#else + infer(); +#endif +} + +void MKLDNNPlugin::MKLDNNGraphlessInferRequest::GetPerformanceCounts( + std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const { + perfMap = m_perfMap; +} + +void MKLDNNPlugin::MKLDNNGraphlessInferRequest::GetBlob(const char *name, InferenceEngine::Blob::Ptr &data) { + // ROI blob is returned only if it was set previously. + auto it = _preProcData.find(name); + if (it != _preProcData.end()) { + data = it->second.getRoiBlob(); + return; + } + + if (_inputs.find(name) != _inputs.end()) { + data = _inputs[name]; + checkBlob(data, name, true); + return; + } else if (_networkInputs.find(name) != _networkInputs.end()) { + InferenceEngine::Layout l = _networkInputs[name]->getLayout(); + InferenceEngine::Precision p = _networkInputs[name]->getPrecision(); + InferenceEngine::SizeVector dims = _networkInputs[name]->getTensorDesc().getDims(); + + InferenceEngine::TensorDesc desc = InferenceEngine::TensorDesc(p, dims, l); + _inputs[name] = data = make_blob_with_precision(desc); + _inputs[name]->allocate(); + checkBlob(data, name, true); + return; + } + + if (_outputs.find(name) != _outputs.end()) { + data = _outputs[name]; + checkBlob(data, name, false); + return; + } else if (_networkOutputs.find(name) != _networkOutputs.end()) { + InferenceEngine::Layout l = _networkOutputs[name]->getLayout(); + InferenceEngine::Precision p = _networkOutputs[name]->getPrecision(); + InferenceEngine::SizeVector dims = _networkOutputs[name]->getTensorDesc().getDims(); + + InferenceEngine::TensorDesc desc = InferenceEngine::TensorDesc(p, dims, l); + _outputs[name] = data = make_blob_with_precision(desc); + _outputs[name]->allocate(); + checkBlob(data, name, false); + return; + } + + THROW_IE_EXCEPTION << "Cannot find blob with name: " << name; +} + +void MKLDNNPlugin::MKLDNNGraphlessInferRequest::SetBlob(const char *name, const InferenceEngine::Blob::Ptr &data) { + if (!data) + THROW_IE_EXCEPTION << NOT_ALLOCATED_str << "Failed to set empty blob with name: \'" << name << "\'"; + if (data->buffer() == nullptr) + THROW_IE_EXCEPTION << "Input data was not allocated. Input name: \'" << name << "\'"; + if (name == nullptr) { + THROW_IE_EXCEPTION << NOT_FOUND_str + "Failed to set blob with empty name"; + } + InferenceEngine::InputInfo::Ptr foundInput; + InferenceEngine::DataPtr foundOutput; + size_t dataSize = data->size(); + if (findInputAndOutputBlobByName(name, foundInput, foundOutput)) { + if (foundInput->getInputPrecision() != data->precision()) { + THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Failed to set Blob with precision " + << data->precision(); + } + + if (foundInput->getPreProcess().getResizeAlgorithm() != InferenceEngine::ResizeAlgorithm::NO_RESIZE) { + // Stores the given blob as ROI blob. It will be used to fill in network input during pre-processing. 
+ _preProcData[name].setRoiBlob(data); + } else { + size_t inputSize = InferenceEngine::details::product(foundInput->getDims()); + if (dataSize != inputSize) { + THROW_IE_EXCEPTION << "Input blob size is not equal to the network input size (" + << dataSize << "!=" << inputSize << ")."; + } + _inputs[name] = data; + } + } else { + size_t outputSize = InferenceEngine::details::product(foundOutput->getDims()); + if (dataSize != outputSize) { + THROW_IE_EXCEPTION << "Output blob size is not equal to the network output size (" + << dataSize << "!=" << outputSize << ")."; + } + if (foundOutput->getPrecision() != data->precision()) { + THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str + << "Failed to set Blob with precision not corresponding to user output precision"; + } + _outputs[name] = data; + } +} + +void MKLDNNPlugin::MKLDNNGraphlessInferRequest::SetBatch(int new_batch) { + if (new_batch < 1) { + THROW_IE_EXCEPTION << "Invalid dynamic batch size " << new_batch << + " for this request."; + } + m_curBatch = new_batch; +} + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_streams.h b/inference-engine/src/mkldnn_plugin/mkldnn_streams.h new file mode 100644 index 000000000..31558fee2 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/mkldnn_streams.h @@ -0,0 +1,177 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include <string> +#include <vector> +#include <atomic> +#include <map> +#include <queue> +#include <memory> +#include <climits> +#include <cpp_interfaces/impl/ie_infer_request_internal.hpp> +#include <cpp_interfaces/ie_task_executor.hpp> +#include "ie_parallel.hpp" +#include "mkldnn/omp_manager.h" + +/* CPU "streams" implement a feature that allows multiple Infer Requests to be efficiently run simultaneously. + * To avoid potential oversubscription the CPU execution resources are divided accordingly. + * The feature enables much better performance for networks that originally do not scale well with #threads, + * even for large batches. Examples are lightweight topologies or topologies with many sequential/mem-bound/etc. or + * otherwise non-scalable layers. This is especially pronounced for many-core (e.g. server) machines. + * This is rather a throughput-oriented feature, because running multiple requests in parallel might increase the latency + * of each request. + * Additionally, the streams relax the need for a large batch to improve the throughput and simplify the + * application logic, helping to saturate the CPU with multiple requests instead. + * Implementation-wise, the "streams" constitute the following: + * - Pure "graph-less" Infer Requests that are not connected to a specific MKLDNNGraph (which is the regular/legacy approach) + * - Just like regular requests, the graph-less requests go to the common (per-ExecutableNetwork) queue + * - But unlike the conventional case, there are multiple threads that grab the requests (see MultiWorkerTaskExecutor) + * - So every stream is in fact an independent "worker" thread that monitors the queue. + * - Every worker thread (stream) has its own copy of the graph (which handles the intermediate data required for execution) + * - While the Infer Requests keep only input/output data +*/
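[Editor's note] The block comment above is the design summary of CPU streams: N identical workers, one shared FIFO of requests, one graph copy per worker. A compact sketch of that shape (hypothetical types; the real executor additionally handles per-thread initialization tasks, core pinning, and occupancy checks):

    #include <condition_variable>
    #include <functional>
    #include <mutex>
    #include <queue>
    #include <thread>
    #include <vector>

    class StreamPool {                                  // hypothetical, mirrors MultiWorkerTaskExecutor
    public:
        explicit StreamPool(size_t nstreams) {
            for (size_t i = 0; i < nstreams; ++i)       // one graph copy per worker in the real code
                workers_.emplace_back([this] { run(); });
        }
        ~StreamPool() {
            { std::lock_guard<std::mutex> lk(m_); stop_ = true; }
            cv_.notify_all();
            for (auto& t : workers_) t.join();
        }
        void submit(std::function<void()> request) {    // analogous to startTask()
            { std::lock_guard<std::mutex> lk(m_); q_.push(std::move(request)); }
            cv_.notify_one();
        }
    private:
        void run() {
            for (;;) {
                std::function<void()> task;
                {
                    std::unique_lock<std::mutex> lk(m_);
                    cv_.wait(lk, [this] { return stop_ || !q_.empty(); });
                    if (stop_ && q_.empty()) return;
                    task = std::move(q_.front());
                    q_.pop();
                }
                task();  // runs the infer request on this stream's graph
            }
        }
        std::vector<std::thread> workers_;
        std::queue<std::function<void()>> q_;
        std::mutex m_;
        std::condition_variable cv_;
        bool stop_ = false;
    };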
+namespace MKLDNNPlugin { + +using namespace InferenceEngine; +class MKLDNNGraph; +class pinning_observer; + +/* This structure handles an "execution context" - data required to execute an Infer Request. + * This includes graph (which handles the intermediate data) and arena/observer for the TBB */ +struct MultiWorkerTaskContext { + std::shared_ptr<MKLDNNGraph> ptrGraph; +}; + +#if defined(__APPLE__) || defined(_WIN32) +typedef void cpu_set_t; +#define CPU_FREE(cpuset) +// notice that functions below are just stubs for OSs other than Linux +#endif +/* Check whether any affinity-related env variables are set (relevant for the OpenMP) */ +bool check_env_variables(); +/* Get the cores affinity mask for the current process */ +bool get_process_mask(int& ncpus, cpu_set_t*& mask); +/* Pin current thread to a set of cores determined by the mask. */ +bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask); +/* Pin thread to a spare core in the round-robin scheme, while respecting the given process mask. + * The function can also handle the hyper-threading (by populating the physical cores first) */ +bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask); + +#if IE_THREAD == IE_THREAD_TBB +/* Simple observer that handles pinning threads to the cores; it serves as a callback for threads entering the arena. */ +class pinning_observer: public tbb::task_scheduler_observer { + cpu_set_t *mask; + int ncpus; + int stream_id, threads_per_stream; + const int pinning_step; + +public: + pinning_observer(tbb::task_arena& _arena, int _stream_id, int _threads_per_stream, int _pinning_step = 1) : + tbb::task_scheduler_observer(_arena), + stream_id(_stream_id), threads_per_stream(_threads_per_stream), pinning_step(_pinning_step) { + get_process_mask(ncpus, mask); + } + + void on_scheduler_entry(bool) override { + if (!mask) return; + int thread_idx = tbb::task_arena::current_thread_index(); + int thr_idx = stream_id * threads_per_stream + thread_idx; + // pin thread to the vacant slot + pin_thread_to_vacant_core(thr_idx, pinning_step, ncpus, mask); + } + + void on_scheduler_exit(bool) override { + if (!mask) return; + // reset the thread's mask (to the original process mask) + pin_current_thread_by_mask(ncpus, mask); + } + + ~pinning_observer() { + if (mask) + CPU_FREE(mask); + } +}; + +class auto_scope_observing { +public: + explicit auto_scope_observing(std::unique_ptr<tbb::task_scheduler_observer>& _p) : p(_p) { + if (p) + p->observe(true); + } + ~auto_scope_observing() { + if (p) + p->observe(false); + } + +protected: + std::unique_ptr<tbb::task_scheduler_observer>& p; +}; +#endif // IE_THREAD == IE_THREAD_TBB + +/* Class wrapping multiple worker threads that monitor the same queue with Infer Requests. */ +class MultiWorkerTaskExecutor : public ITaskExecutor { +public: + typedef std::shared_ptr<MultiWorkerTaskExecutor> Ptr; + + explicit MultiWorkerTaskExecutor(const std::vector<Task::Ptr>&, std::string name = "Default"); + + ~MultiWorkerTaskExecutor(); + + /** + * @brief Adds task for execution and notifies one of the working threads about the new task. + * @note can be called from multiple threads - tasks will be added to the queue and executed one-by-one in FIFO mode.
+ * @param task - shared pointer to the task + * @return true if the task was added successfully, otherwise false + */ + bool startTask(Task::Ptr task) override; + + static thread_local MultiWorkerTaskContext ptrContext; + +private: + std::vector<std::thread> _threads; + std::mutex _queueMutex; + std::condition_variable _queueCondVar; + std::queue<Task::Ptr> _taskQueue; + std::atomic<bool> _isStopped; + std::string _name; + std::atomic<int> _initCount; +}; + +/* Pure Infer Requests - just input and output data. */ +class MKLDNNGraphlessInferRequest : public InferenceEngine::InferRequestInternal { +public: + typedef std::shared_ptr<MKLDNNGraphlessInferRequest> Ptr; + explicit MKLDNNGraphlessInferRequest(InferenceEngine::InputsDataMap networkInputs, + InferenceEngine::OutputsDataMap networkOutputs); + + void InferImpl() override; + + void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const override; + + /** + * @brief Given optional implementation of setting blob to avoid need for it to be implemented by plugin + * @param name - a name of input or output blob. + * @param data - a reference to input or output blob. The type of Blob must correspond to the network input precision and size. + */ + void SetBlob(const char *name, const InferenceEngine::Blob::Ptr &data) override; + + /** + * @brief Given optional implementation of getting blob to avoid need for it to be implemented by plugin + * @param name - a name of input or output blob. + * @param data - a reference to input or output blob. The type of Blob must correspond to the network input precision and size. + */ + void GetBlob(const char *name, InferenceEngine::Blob::Ptr &data) override; + + + void SetBatch(int batch = -1) override; + +private: + int m_curBatch; + std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> m_perfMap; +}; + + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp index f48ada45e..d23b12e3b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -16,6 +15,7 @@ using namespace MKLDNNPlugin; using namespace InferenceEngine; using namespace InferenceEngine::details; +// TODO: (ichuraev) I am not fully sure that the names of types and parameters are correct for square, abs, sqrt, linear, bounded_relu and soft_relu caseless_map<std::string, std::function<void(GenericLayer*, mkldnn::algorithm&, float&, float&)>> MKLDNNActivationNode::initializers = { {"relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h index 508d8c796..9dac1507c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp index 502a804d3..173df1c24 100644 ---
a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp @@ -1,11 +1,11 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // #include "mkldnn_batchnorm_node.h" #include "mkldnn_depthwise_node.h" #include <mkldnn_extension_utils.h> +#include "ie_memcpy.h" using namespace mkldnn; using namespace MKLDNNPlugin; @@ -77,7 +77,7 @@ void MKLDNNBatchNormalizationNode::getSupportedDescriptors() { THROW_IE_EXCEPTION << "Cannot get weights blob for node " << getName() << "."; size_t weightsByteSize = blb->byteSize(); - memcpy(data, blb->buffer(), weightsByteSize); + ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), weightsByteSize); data += blb->size(); blb = scshLayer->_biases; @@ -86,7 +86,7 @@ void MKLDNNBatchNormalizationNode::getSupportedDescriptors() { } else { if (weightsByteSize != blb->byteSize()) THROW_IE_EXCEPTION << "ScaleShift has incorrect weights!"; - memcpy(data, blb->buffer(), weightsByteSize); + ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), weightsByteSize); } internalBlobs.push_back(internalBlob); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h index ef948b70f..c7d9d3e17 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp index 1da5d57f5..fd2893e95 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -59,16 +58,11 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); - auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - precision = getCnnLayer()->outData[0]->getPrecision(); + InferenceEngine::Precision iIEPrecision = getCnnLayer()->insData[0].lock()->getPrecision(); + auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(iIEPrecision); + InferenceEngine::Precision precision = getCnnLayer()->outData[0]->getPrecision(); auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - if (getCnnLayer()->precision == Precision::I8) { - inputDataType = memory::data_type::u8; - outputDataType = memory::data_type::u8; - } - MKLDNNDims dstDims = getChildEdgeAt(0)->getDims(); InferenceEngine::LayerConfig config; config.dynBatchSupport = true; @@ -103,6 +97,16 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref); } } + } else if (dims.ndims() == 5) { + if (dims[1] % 8 == 0) { + config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw8c)); + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref); + + if (dims[1] % 16 == 0) { + config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw16c)); 
+ supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref); + } + } } if (axis != 1 || hasEltwise) @@ -110,12 +114,11 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { auto numOfDim = static_cast<size_t>(dstDims.ndims()); - SizeVector order; - SizeVector offsets; + SizeVector order(numOfDim); + SizeVector offsets(numOfDim, 0lu); size_t offset = std::numeric_limits<size_t>::max(); for (size_t i = 0; i < numOfDim; i++) { - order.push_back(i); - offsets.push_back(0); + order[i] = i; } if (this->getCnnLayer()->precision == Precision::I8) { @@ -135,7 +138,9 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { strides[i] = std::numeric_limits<size_t>::max(); } - config.outConfs[0].desc = TensorDesc(Precision::U8, dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides}); + config.outConfs[0].desc = TensorDesc(this->getCnnLayer()->outData[0]->getPrecision(), + dstDims.ToSizeVector(), + { blkDims, order, offset, offsets, strides }); for (size_t i = 0; i < getParentEdges().size(); i++) { auto parentEdge = getParentEdgeAt(i); @@ -144,7 +149,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NHWC in mkldnn - config.inConfs[i].desc = TensorDesc(Precision::U8, parentEdge->getDims().ToSizeVector(), + config.inConfs[i].desc = TensorDesc(iIEPrecision, parentEdge->getDims().ToSizeVector(), {blkDims, order, offset, offsets, strides}); } @@ -174,26 +179,30 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); - if (numOfDim == 4) { - order = {0, 1, 2, 3, 1}; - offsets = {0, 0, 0, 0, 0}; - numOfDim = 5; + if (numOfDim == 4lu || numOfDim == 5lu) { + size_t blkDimsLen = numOfDim + 1; + order.resize(blkDimsLen); + for (size_t i = 0; i < numOfDim; i++) { + order[i] = i; + } + order[numOfDim] = 1lu; + offsets = SizeVector(blkDimsLen, 0lu); - // nChw8c and nChw16c - for (int sizeS : {8, 16}) { + // nChw8c, nChw16c, nCdhw8c, nCdhw16c + for (size_t sizeS : {8lu, 16lu}) { SizeVector blkDims = dstDims.ToSizeVector(); if (blkDims[1] % sizeS) continue; - blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0); + blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu); blkDims.push_back(sizeS); - strides.resize(numOfDim); - strides[numOfDim - 1] = 1; - for (size_t i = 2; i <= numOfDim; i++) { - if (numOfDim - i < axis) { - strides[numOfDim - i] = std::numeric_limits<size_t>::max(); + strides.resize(blkDimsLen); + strides[blkDimsLen - 1] = 1; + for (size_t i = 2lu; i <= blkDimsLen; i++) { + if (blkDimsLen - i < axis) { + strides[blkDimsLen - i] = std::numeric_limits<size_t>::max(); } else { - strides[numOfDim - i] = strides[numOfDim - i + 1] * blkDims[numOfDim - i + 1]; + strides[blkDimsLen - i] = strides[blkDimsLen - i + 1] * blkDims[blkDimsLen - i + 1]; } } config.outConfs[0].desc = TensorDesc( @@ -201,13 +210,13 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides}); bool canInplace = true; - for (size_t i = 0; canInplace && i < getParentEdges().size(); i++) { + for (size_t i = 0lu; canInplace && i < getParentEdges().size(); i++) { auto parentEdge = getParentEdgeAt(i); blkDims = parentEdge->getDims().ToSizeVector(); if (blkDims[1] % sizeS) canInplace = false; - blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0); + blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 
1lu : 0lu); blkDims.push_back(sizeS); config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(), {blkDims, order, offset, offsets, strides}); @@ -225,11 +234,6 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() { precision = getCnnLayer()->outData[0]->getPrecision(); auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - if (getCnnLayer()->precision == Precision::I8) { - inputDataType = memory::data_type::u8; - outputDataType = memory::data_type::u8; - } - bool hasUnknown = false; std::vector<size_t> canSelectPrimitive; for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h index 2b5fa898c..9aa51d7cd 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp index 109a87fe5..ea1aee821 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -37,18 +36,18 @@ MKLDNNConvolutionNode::MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr& wScale = ws->second; } - // Trying to find oi-scale - lastInInt8Chain = true; - auto ois = layer->blobs.find("oi-scale"); - if (ois != layer->blobs.end()) { - // If we can find an o-scale, then the next layer has to be an INT8. - lastInInt8Chain = false; - oScale = ois->second; - } else { - // If we can't find an oi-scale then the next layer has to be - // an FP32, so we are the last layer in the INT8-chain - lastInInt8Chain = true; + if (getCnnLayer()->type == "Convolution" && getCnnLayer()->precision == Precision::I8) { + auto ois = layer->blobs.find("oi-scale"); + if ((getCnnLayer()->outData[0]->getPrecision() == Precision::I8 || getCnnLayer()->outData[0]->getPrecision() == Precision::U8) + && ois == layer->blobs.end()) { + THROW_IE_EXCEPTION << "Internal error of graph quantization - mismatch of intermediate scales and next layer type for convolution " + << getCnnLayer()->name; + } + if (ois != layer->blobs.end()) { + // If we can find an oi-scale, then the next layer has to be an INT8. + oScale = ois->second; + } } } @@ -99,6 +98,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() { groupOC /= groupNum; } + weightDims.clear(); weightDims.push_back(groupOC); weightDims.push_back(groupIC); for (int i = 1; i <= convLayer->_kernel.size(); i++) { @@ -141,13 +141,13 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() { dilation.push_back(static_cast<int>(convLayer->_dilation[convLayer->_dilation.size() - i]) - 1); } - auto allPads = getConvPaddings(*convLayer); + auto allPads = getPaddings(*convLayer); invertVectorCopyUtoI(allPads.begin, paddingL); invertVectorCopyUtoI(allPads.end, paddingR); MKLDNNDims weightsDims = MKLDNNDims(weightDims); - for (int i = 0; i < 2; i++) { + for (int i = 0; i < paddingR.size(); i++) { int with_group = (isGrouped || isMerged) ?
1 : 0; int krn = weightsDims[with_group + 2 + i]; int src = getParentEdgeAt(0)->getDims()[2 + i]; @@ -176,26 +176,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() { } } - if (weights->precision() == Precision::I8) { - inputDataType = memory::u8; - if (lastInInt8Chain) { - outputDataType = memory::f32; - } else { - // Searching for the last fused node and taking the precision from there - Precision p = getCnnLayer()->precision; - if (fusedWith.size() > 0 && fusedWith[fusedWith.size() - 1]->getCnnLayer()->type == "ReLU") { - p = fusedWith[fusedWith.size() - 1]->getCnnLayer()->precision; - } - - if (p == Precision::I8) { - outputDataType = memory::s8; - } else if (p == Precision::U8) { - outputDataType = memory::u8; - } else { - THROW_IE_EXCEPTION << "Invalid layer precision for " << getName(); - } - } - + if (this->getCnnLayer()->precision == Precision::I8) { MKLDNNMemoryDesc in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nhwc); MKLDNNMemoryDesc out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nhwc); createDescriptor({in_candidate}, {out_candidate}); @@ -204,22 +185,48 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() { inputDataType = memory::f32; outputDataType = memory::f32; - MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, memory::nchw); - MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, memory::nchw); - createDescriptor({in_candidate}, {out_candidate}); + Layout layout = convLayer->input()->getLayout(); - if (IC == 3 || IC == 1) { - out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c); - createDescriptor({in_candidate}, {out_candidate}); - out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c); + if (layout == NCHW || layout == NHWC) { + MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, + layout == NCHW ? memory::nchw : memory::nhwc); + MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, + layout == NCHW ? 
memory::nchw : memory::nhwc); createDescriptor({in_candidate}, {out_candidate}); + + if (IC == 3 || IC == 1) { + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c); + createDescriptor({in_candidate}, {out_candidate}); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c); + createDescriptor({in_candidate}, {out_candidate}); + } else { + in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c); + createDescriptor({in_candidate}, {out_candidate}); + in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw8c); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c); + createDescriptor({in_candidate}, {out_candidate}); + } + } else if (layout == NCDHW || layout == NDHWC) { + MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, + layout == NCDHW ? memory::ncdhw : memory::ndhwc); + MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, + layout == NCDHW ? memory::ncdhw : memory::ndhwc); createDescriptor({in_candidate}, {out_candidate}); + + if (IC == 3 || IC == 1) { + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c); + createDescriptor({in_candidate}, {out_candidate}); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c); + createDescriptor({in_candidate}, {out_candidate}); + } else { + in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c); + createDescriptor({in_candidate}, {out_candidate}); + in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c); + createDescriptor({in_candidate}, {out_candidate}); + } } } } @@ -231,7 +238,15 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe for (auto &node : fusedWith) { auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get()); if (eltwiseNode) { - ops.append_sum(1.0); + if (eltwiseNode->getCnnLayer()->precision == Precision::I8) { + auto it = eltwiseNode->getCnnLayer()->blobs.find("eltwise-sum-scale"); + if (it != eltwiseNode->getCnnLayer()->blobs.end()) { + // currently there is only one scale, while we need a per-channel scale :( + ops.append_sum(it->second->buffer().as<float*>()[0]); + } + } else { + ops.append_sum(1.0); + } continue; } @@ -252,11 +267,10 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x); - int bufferSize = depthwiseNode->isBroadcast() ? 1 : depthwiseDims[0];
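[Editor's note] setPostOps() above chains fused operations onto the convolution through mkldnn's primitive_attr: a sum post-op (optionally scaled by the quantization blob "eltwise-sum-scale") accumulates into the destination, and eltwise/depthwise post-ops follow. A hedged sketch of building such an attribute with the mkldnn 0.x C++ API, with placeholder values:

    #include "mkldnn.hpp"

    // Hypothetical attribute setup mirroring the fusing above:
    //   dst = relu(conv(src, w) + sum_scale * dst_prev)
    mkldnn::primitive_attr make_fused_attr(float sum_scale) {
        mkldnn::post_ops ops;
        ops.append_sum(sum_scale);  // fused eltwise-add into the destination
        ops.append_eltwise(1.0f, mkldnn::algorithm::eltwise_relu,
                           0.0f /* alpha: negative slope */, 0.0f /* beta: unused */);
        mkldnn::primitive_attr attr;
        attr.set_post_ops(ops);
        return attr;
    }

The attribute is then passed when the convolution primitive descriptor is created, so the fused ops run inside the convolution kernel instead of as separate graph nodes.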
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x, depthwiseLayer->_weights->buffer(), - bufferSize * MKLDNNExtensionUtils::sizeOfDataType( - memory::data_type::f32)); + depthwiseLayer->_weights->size() * + MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); if (depthwiseNode->isBroadcast()) { float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0]; @@ -271,9 +285,8 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe memory::format::x); PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, depthwiseLayer->_biases->buffer(), - bufferSize * - MKLDNNExtensionUtils::sizeOfDataType( - memory::data_type::f32)); + depthwiseLayer->_biases->size() * + MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); if (depthwiseNode->isBroadcast()) { float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0]; @@ -450,14 +463,15 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine:: bdt = memory::s32; Precision outPrec; - if (lastInInt8Chain) { + if (getCnnLayer()->outData[0]->getPrecision() == Precision::FP32) { outPrec = Precision::FP32; } else { // define precision according to the normalizer + // TODO(amalyshe) do we need to have separate flow for last in int8 chain or not? outPrec = outDesc.getPrecision(); } - inDesc = TensorDesc(Precision::U8, inputDesc[0].getDims(), inputDesc[0].getBlockingDesc()); + inDesc = TensorDesc(inDesc.getPrecision() , inputDesc[0].getDims(), inputDesc[0].getBlockingDesc()); outDesc = TensorDesc(outPrec, outputDesc[0].getDims(), outputDesc[0].getBlockingDesc()); } @@ -502,8 +516,8 @@ void MKLDNNConvolutionNode::addScaleToPrimitiveAttr(mkldnn::primitive_attr attr) float* wScaleData = static_cast<float*>(wScale->buffer()); std::vector<float> oScaleDataVector; - if (!lastInInt8Chain) { - float* oScaleData = static_cast<float*>(oScale->buffer()); + if (getCnnLayer()->precision == Precision::I8 && getCnnLayer()->outData[0]->getPrecision() != Precision::FP32) { + float *oScaleData = static_cast<float *>(oScale->buffer()); for (size_t c = 0; c < wScale->size(); c++) { oScaleDataVector.push_back(wScaleData[c] / oScaleData[c]); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h index aa2424194..19191ee45 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -57,8 +56,6 @@ private: InferenceEngine::ConvolutionLayer* convLayer; InferenceEngine::Blob::Ptr wScale, oScale; - - bool lastInInt8Chain; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp index aafa4aec0..8b11c296f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h index 2895e81e8..f74ab297e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h +++
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp index 1295e05a5..38ca06ce8 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +10,7 @@ #include <vector> #include <mkldnn_types.h> #include <mkldnn_extension_utils.h> +#include <ie_layers_internal.hpp> #include "ie_parallel.hpp" using namespace mkldnn; @@ -67,18 +67,17 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() { deconvLayer->_group, deconvLayer->input()->getTensorDesc().getDims()[1] / deconvLayer->_group, deconvLayer->_out_depth / deconvLayer->_group, - deconvLayer->_kernel[Y_AXIS], - deconvLayer->_kernel[X_AXIS] }; groupNum = deconvLayer->_group; } else { weightDims = { deconvLayer->input()->getTensorDesc().getDims()[1], - deconvLayer->_out_depth, - deconvLayer->_kernel[Y_AXIS], - deconvLayer->_kernel[X_AXIS] + deconvLayer->_out_depth }; } + for (int i = 1; i <= deconvLayer->_kernel.size(); i++) { + weightDims.push_back(deconvLayer->_kernel[deconvLayer->_kernel.size() - i]); + } internalBlobs.push_back(createInternalBlob(weightDims, true)); @@ -86,12 +85,13 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() { for (int i = 1; i <= deconvLayer->_dilation.size(); i++) { dilation.push_back(static_cast<int>(deconvLayer->_dilation[deconvLayer->_dilation.size() - i]) - 1); } - invertVectorCopyUtoI(deconvLayer->_padding, paddingL); - invertVectorCopyUtoI(deconvLayer->_pads_end, paddingR); + auto allPads = getPaddings(*deconvLayer); + invertVectorCopyUtoI(allPads.begin, paddingL); + invertVectorCopyUtoI(allPads.end, paddingR); weightsDims = MKLDNNDims(weightDims); - for (int i = 0; i < 2; i++) { + for (int i = 0; i < paddingR.size(); i++) { int with_group = (withGroups) ? 1 : 0; int krn = weightsDims[with_group + 2 + i]; int src = getChildEdgeAt(0)->getDims()[2 + i]; @@ -115,28 +115,46 @@ void MKLDNNDeconvolutionNode::execute(mkldnn::stream strm) { } if (withBiases) { const auto *bias = biases->buffer().as<const float*>(); + auto biasSize = biases->size(); auto dst = getChildEdgeAt(0)->getBlob(); float *output = dst->buffer().as<float *>() + dst->getTensorDesc().getBlockingDesc().getOffsetPadding(); + auto dims_size = dst->getTensorDesc().getDims().size(); + auto layout = dst->layout(); const size_t N = dst->getTensorDesc().getDims()[0]; - const size_t C = dst->getTensorDesc().getBlockingDesc().getBlockDims()[1] / groupNum; - const size_t H = dst->getTensorDesc().getDims()[2]; - const size_t W = dst->getTensorDesc().getDims()[3]; - const size_t blkC = - dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 4 ? - dst->getTensorDesc().getBlockingDesc().getBlockDims()[4] : - 1; + size_t C = dst->getTensorDesc().getBlockingDesc().getBlockDims()[1] / groupNum; + if (C < 1) C = 1; + const size_t D = dims_size > 4 ? 
dst->getTensorDesc().getDims()[dims_size - 3] : 1lu; + const size_t H = dst->getTensorDesc().getDims()[dims_size - 2]; + const size_t W = dst->getTensorDesc().getDims()[dims_size - 1]; + size_t blkC = 1lu; + if (layout == BLOCKED && dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 5) { + blkC = dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 5 ? + dst->getTensorDesc().getBlockingDesc().getBlockDims()[5] : + 1lu; + } else if (layout == BLOCKED && dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 4) { + blkC = dst->getTensorDesc().getBlockingDesc().getBlockDims()[4]; + } auto strides = dst->getTensorDesc().getBlockingDesc().getStrides(); + int output_size = strides[0] * N - dst->getTensorDesc().getBlockingDesc().getOffsetPadding(); - parallel_for4d(N, C, H, W, [&](size_t n, size_t c, size_t h, size_t w) { + parallel_for5d(N, C, D, H, W, [&](size_t n, size_t c, size_t d, size_t h, size_t w) { for (size_t g = 0; g < groupNum; g++) { - const size_t off = n * strides[0] + (g * C + c) * strides[1] + h * strides[2] + w * strides[3]; + const size_t off = n * strides[0] + + (g * C + c) * strides[1] + + d * strides[dims_size - 3] + + h * strides[dims_size - 2] + + w * strides[dims_size - 1]; + if (off >= output_size) continue; auto o = &output[off]; + int gcb = g * C * blkC + c * blkC; for (int bc = 0; bc < blkC; ++bc) { - o[bc] += bias[c * blkC + bc]; + int index = gcb + bc; + if (index < biasSize) + o[bc] += bias[index]; } } }); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h index 244054c26..e32a66a73 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp index 8eadcf824..6b1097a62 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -39,9 +38,20 @@ void MKLDNNDepthwiseNode::getSupportedDescriptors() { SizeVector weightDims = { (long unsigned int)parentOutDims[1] }; MKLDNNDims blocked_weightDims(weightDims); + auto * wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(getCnnLayer().get()); + if (wLayer == nullptr) + THROW_IE_EXCEPTION << "Cannot get weightable layer for node " << getName() << "."; + + InferenceEngine::Blob::Ptr blb = wLayer->_weights; + if (blb) + realWeightSize = blb->size(); internalBlobs.push_back(createInternalBlob(weightDims, true)); - if (isWithBiases()) + if (isWithBiases()) { + InferenceEngine::Blob::Ptr blb = wLayer->_biases; + if (blb) + realBiasSize = blb->size(); internalBlobs.push_back(createInternalBlob(weightDims, false)); + } for (auto format : getAvailableFormatsForDims(parentOutDims)) { MKLDNNMemoryDesc in_candidate{parentOutDims, inputDataType, format}; @@ -66,13 +76,15 @@ void MKLDNNDepthwiseNode::createPrimitive() { if (isBroadcast()) { float broadcastValue = static_cast<float*>(internalBlobMemory[0]->GetData())[0]; - for (int i = 1; i < internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { + int blbSize = 
internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0]; + for (int i = 1; i < blbSize && realWeightSize != blbSize; i++) { static_cast<float*>(internalBlobMemory[0]->GetData())[i] = broadcastValue; } if (isWithBiases()) { + blbSize = internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0]; broadcastValue = static_cast<float*>(internalBlobMemory[1]->GetData())[0]; - for (int i = 1; i < internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { + for (int i = 1; i < blbSize && realBiasSize != blbSize; i++) { static_cast<float*>(internalBlobMemory[1]->GetData())[i] = broadcastValue; } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h index 78ef529f5..16bd3a505 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -50,6 +49,8 @@ private: static Register<MKLDNNDepthwiseNode> reg; mkldnn::algorithm algorithm; + size_t realWeightSize = 0; + size_t realBiasSize = 0; bool withBiases; bool broadcast; }; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp index 0a051dc52..111196817 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -99,15 +98,9 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() { mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::FP32); supportedPrimitiveDescriptors.push_back(same(inputDT, outputDT, format)); } else { - THROW_IE_EXCEPTION << "Invalid Eltwise layer precision"; + THROW_IE_EXCEPTION << "Invalid Eltwise layer precision: " << getCnnLayer()->name; } } - - if (getCnnLayer()->precision == Precision::I8) { - mkldnn::memory::data_type inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::U8); - mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::U8); - supportedPrimitiveDescriptors.push_back(same(inputDT, outputDT, mkldnn::memory::format::nhwc)); - } } void MKLDNNEltwiseNode::createPrimitive() { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h index e206799f4..0395cd432 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp index 20b60c62c..75b814e81 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -60,8 +59,11 @@ void MKLDNNFullyConnectedNode::getSupportedDescriptors() { } else if (inDims.ndims() == 4) { weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]), 
static_cast<size_t>(inDims[3])}; + } else if (inDims.ndims() == 5) { + weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]), + static_cast<size_t>(inDims[3]), static_cast<size_t>(inDims[4])}; } else { - THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 4 or 2, got: " + THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 5, 4 or 2, got: " << inDims.ndims() << " dims."; } @@ -113,10 +115,16 @@ memory::format MKLDNNFullyConnectedNode::weightsFormatForSrcFormat(memory::forma return memory::format::oi; case memory::format::nchw: return memory::format::oihw; + case memory::format::ncdhw: + return memory::format::oidhw; case memory::format::nChw8c: return memory::format::oIhw8i; + case memory::format::nCdhw8c: + return memory::format::oIdhw8i; case memory::format::nChw16c: return memory::format::oIhw16i; + case memory::format::nCdhw16c: + return memory::format::oIdhw16i; default: THROW_IE_EXCEPTION << "Unsupported source format for node " << getName(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h index 88259a265..73c06f7ce 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp new file mode 100644 index 000000000..2874d9dfe --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp @@ -0,0 +1,234 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_gemm_node.h" +#include <ie_layers.h> +#include <string> +#include <vector> +#include <memory> +#include <algorithm> +#include <cmath> +#include <mkldnn_types.h> +#include <mkldnn_extension_utils.h> + +using namespace mkldnn; +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +MKLDNNGemmNode::MKLDNNGemmNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {} + +void MKLDNNGemmNode::getSupportedDescriptors() { + auto* gemmLayer = dynamic_cast<GemmLayer*>(getCnnLayer().get()); + + if (gemmLayer == nullptr) + THROW_IE_EXCEPTION << "Cannot convert gemm layer."; + + if (getParentEdges().size() != 2 && getParentEdges().size() != 3) + THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName(); + if (getChildEdges().size() != 1) + THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName(); + + auto inDims0 = getParentEdgeAt(0)->getDims(); + auto inDims1 = getParentEdgeAt(1)->getDims(); + auto outDims = getChildEdgeAt(0)->getDims(); + + alpha = gemmLayer->alpha; + beta = gemmLayer->beta; + transposeA = gemmLayer->transpose_a; + transposeB = gemmLayer->transpose_b; + + if ((inDims0.ndims() < 2 || inDims0.ndims() > 4) || + (inDims1.ndims() < 2 || inDims1.ndims() > 4)) + THROW_IE_EXCEPTION << "Unsupported input dims count for layer " << getName(); + + if (outDims.ndims() < 2 || outDims.ndims() > 4) + THROW_IE_EXCEPTION << "Unsupported output dims count for layer " << getName(); + + if (inDims0.ndims() != inDims1.ndims() || inDims0.ndims() != outDims.ndims()) + THROW_IE_EXCEPTION << "Invalid dims count for layer " << getName(); + + int nDims = 
inDims0.ndims(); + xAxis = nDims - 1; + yAxis = nDims - 2; + + if (inDims0[xAxis] != inDims1[yAxis] || inDims0[yAxis] != outDims[yAxis] || inDims1[xAxis] != outDims[xAxis]) + THROW_IE_EXCEPTION << "Spatial input and output dimensions are incorrect for layer " << getName(); + + isThreeInputs = getParentEdges().size() == 3; + + if (isThreeInputs) { + auto inDims2 = getParentEdgeAt(2)->getDims(); + + if (inDims2.ndims() < 2 || inDims2.ndims() > 4) + THROW_IE_EXCEPTION << "Unsupported third input dims count for layer " << getName(); + + if (inDims2.ndims() != outDims.ndims()) + THROW_IE_EXCEPTION << "Invalid dims count for layer " << getName(); + + if (inDims2[yAxis] != outDims[yAxis] || inDims2[xAxis] != outDims[xAxis]) + THROW_IE_EXCEPTION << "Spatial input and output dimensions are incorrect for layer " << getName(); + } + + for (int dim_idx = nDims - 3; dim_idx >= 0; dim_idx--) { + if (isThreeInputs) { + auto inDims2 = getParentEdgeAt(2)->getDims(); + + if (inDims2[dim_idx] != outDims[dim_idx] && inDims2[dim_idx] != 1) + THROW_IE_EXCEPTION << "Input batch dimensions are incorrect for layer " << getName(); + + int cOffset = 1; + for (int i = dim_idx + 1; i < nDims; i++) + cOffset *= inDims2[i]; + cOffsets.push_back(inDims2[dim_idx] == outDims[dim_idx] ? cOffset : 0); + } + + if ((inDims0[dim_idx] != outDims[dim_idx] && inDims0[dim_idx] != 1) || + (inDims1[dim_idx] != outDims[dim_idx] && inDims1[dim_idx] != 1)) { + THROW_IE_EXCEPTION << "Input batch dimensions are incorrect for layer " << getName(); + } + + int aOffset = 1; + for (int i = dim_idx + 1; i < nDims; i++) + aOffset *= inDims0[i]; + aOffsets.push_back(inDims0[dim_idx] == outDims[dim_idx] ? aOffset : 0); + + int bOffset = 1; + for (int i = dim_idx + 1; i < nDims; i++) + bOffset *= inDims1[i]; + bOffsets.push_back(inDims1[dim_idx] == outDims[dim_idx] ? bOffset : 0); + }
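[Editor's note] The loop above precomputes, for each leading (batch) dimension, the element stride of A, B and C in that dimension, storing 0 whenever the input's extent is 1 so the same slice is re-read on every iteration: that is how the node broadcasts batch dimensions in execute(). A small self-contained sketch of the same computation (hypothetical helper), with a worked example in the comments:

    #include <vector>

    // Per-batch-dimension element stride; 0 marks a broadcast dimension.
    // Example: in = {1, 8, M, K}, out = {4, 8, M, K} -> {M*K, 0}:
    // index 0 (inner batch) advances by a full slice, index 1 (outer batch)
    // re-reads the single available slice.
    std::vector<int> batch_offsets(const std::vector<int>& in, const std::vector<int>& out) {
        const int nDims = static_cast<int>(in.size());
        std::vector<int> offsets;
        for (int d = nDims - 3; d >= 0; --d) {
            int stride = 1;
            for (int i = d + 1; i < nDims; ++i) stride *= in[i];
            offsets.push_back(in[d] == out[d] ? stride : 0);
        }
        return offsets;
    }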
bOffset : 0); + } + + for (unsigned long dim_idx = aOffsets.size(); dim_idx < 2; dim_idx++) + aOffsets.push_back(0); + for (unsigned long dim_idx = bOffsets.size(); dim_idx < 2; dim_idx++) + bOffsets.push_back(0); + for (unsigned long dim_idx = cOffsets.size(); dim_idx < 2; dim_idx++) + cOffsets.push_back(0); +} + +void MKLDNNGemmNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32); + auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32); + + auto same = [&] (memory::format fmt) -> PrimitiveDescInfo { + InferenceEngine::LayerConfig config; + config.dynBatchSupport = true; + for (size_t i = 0; i < getParentEdges().size(); i++) { + InferenceEngine::DataConfig dataConfig; + dataConfig.inPlace = -1; + dataConfig.constant = false; + dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, fmt); + config.inConfs.push_back(dataConfig); + } + + InferenceEngine::DataConfig dataConfig; + dataConfig.inPlace = -1; + dataConfig.constant = false; + dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt); + config.outConfs.push_back(dataConfig); + return {config, impl_desc_type::gemm_any}; + }; + + supportedPrimitiveDescriptors.push_back(same(memory::any)); +} + +void MKLDNNGemmNode::createPrimitive() { + auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); + auto& src0MemPtr = getParentEdgeAt(0)->getMemoryPtr(); + auto& src1MemPtr = getParentEdgeAt(1)->getMemoryPtr(); + if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) + THROW_IE_EXCEPTION << "Destination memory isn't allocated."; + if (!src0MemPtr || !src0MemPtr->GetPrimitivePtr() || !src1MemPtr || !src1MemPtr->GetPrimitivePtr()) + THROW_IE_EXCEPTION << "Input memory isn't allocated."; + if (getSelectedPrimitiveDescriptor() == nullptr) + THROW_IE_EXCEPTION << "Preferable primitive descriptor isn't set."; + + if (isThreeInputs) { + auto& src2MemPtr = getParentEdgeAt(2)->getMemoryPtr(); + if (!src2MemPtr || !src2MemPtr->GetPrimitivePtr()) + THROW_IE_EXCEPTION << "Input memory isn't allocated."; + } +} + +void MKLDNNGemmNode::execute(mkldnn::stream strm) { + auto inDims0 = getParentEdgeAt(0)->getDims(); + auto inDims1 = getParentEdgeAt(1)->getDims(); + auto outDims = getChildEdgeAt(0)->getDims(); + + auto& srcMemory0 = getParentEdgeAt(0)->getMemory(); + auto& srcMemory1 = getParentEdgeAt(1)->getMemory(); + const float *src0_ptr = reinterpret_cast<const float*>(srcMemory0.GetData()) + + srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding; + const float *src1_ptr = reinterpret_cast<const float*>(srcMemory1.GetData()) + + srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding; + float *dst_ptr = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) + + getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + int MB1 = outDims.ndims() == 4 ? batchToProcess() : 1; + int MB2 = outDims.ndims() == 3 ? batchToProcess() : outDims.ndims() > 3 ? outDims[outDims.ndims() - 3] : 1; + int M = inDims0[yAxis]; + int N = inDims1[xAxis]; + int K = inDims0[xAxis]; + + const char transa = transposeA ? 'T' : 'N'; + const char transb = transposeB ? 'T' : 'N'; + + int lda = transposeA ? M : K; + int ldb = transposeB ? 
K : N; + int ldc = N; + + const float *src2_ptr; + if (isThreeInputs) { + auto& srcMemory2 = getParentEdgeAt(2)->getMemory(); + src2_ptr = reinterpret_cast<const float *>(srcMemory2.GetData()) + + srcMemory2.GetDescriptor().data.layout_desc.blocking.offset_padding; + } else { + src2_ptr = dst_ptr; + } + + if (!isThreeInputs) { + beta = 0.f; + } + + for (int b1 = 0; b1 < MB1; b1++) { + const float *a_ptr = src0_ptr; + const float *b_ptr = src1_ptr; + const float *c_ptr = src2_ptr; + float *d_ptr = dst_ptr; + + for (int b2 = 0; b2 < MB2; b2++) { + if (isThreeInputs) { + memcpy(d_ptr, c_ptr, M * N * sizeof(float)); + c_ptr += cOffsets[0]; + } + + mkldnn_sgemm(&transb, &transa, &N, &M, &K, &alpha, b_ptr, &ldb, a_ptr, &lda, &beta, d_ptr, &ldc); + + a_ptr += aOffsets[0]; + b_ptr += bOffsets[0]; + d_ptr += M * N; + } + + src0_ptr += aOffsets[1]; + src1_ptr += bOffsets[1]; + dst_ptr += MB2 * M * N; + + if (isThreeInputs) { + src2_ptr += cOffsets[1]; + } + } +} + +bool MKLDNNGemmNode::created() const { + return getType() == Gemm; +} + +int MKLDNNGemmNode::getMaxBatch() { + if (!outDims.empty()) + return outDims[0][0]; + return 0; +} diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h new file mode 100644 index 000000000..da171a0da --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h @@ -0,0 +1,44 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include <ie_common.h> +#include <mkldnn_node.h> +#include <string> +#include <vector> + +namespace MKLDNNPlugin { + +class MKLDNNGemmNode : public MKLDNNNode { +public: + MKLDNNGemmNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng); + ~MKLDNNGemmNode() override = default; + + void getSupportedDescriptors() override; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override; + void execute(mkldnn::stream strm) override; + bool created() const override; + int getMaxBatch() override; + +private: + static Register<MKLDNNGemmNode> reg; + float alpha; + float beta; + bool transposeA; + bool transposeB; + + int xAxis; + int yAxis; + + bool isThreeInputs; + + std::vector<int> aOffsets; + std::vector<int> bOffsets; + std::vector<int> cOffsets; +}; + +} // namespace MKLDNNPlugin + diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp index 04cb400e1..b31b491e1 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -83,8 +82,7 @@ bool MKLDNNGenericNode::created(const MKLDNNExtensionManager::Ptr &extMgr) { if (getCnnLayer() && extMgr) { // We should save extension manager in otder to avoid situation when // it will destroyed before extensibility primitives - extensionManager = extMgr; - extFactory.reset(extensionManager->CreateExtensionFactory(getCnnLayer())); + extFactory.reset(extMgr->CreateExtensionFactory(getCnnLayer())); if (extFactory) setType(Generic); @@ -147,11 +145,6 @@ void MKLDNNGenericNode::execLayer() { } } -MKLDNNGenericNode::~MKLDNNGenericNode() { - extFactory.reset(); - extensionManager.reset(); -} - void MKLDNNGenericNode::initDescriptor(const InferenceEngine::LayerConfig &config) { InferenceEngine::LayerConfig rightConfig = config; 
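A minimal, self-contained sketch of the batching scheme used by the GEMM node's execute() loop above: two outer batch loops whose per-iteration pointer advances are the precomputed aOffsets/bOffsets, with an offset of zero meaning "reuse the same matrix", i.e. broadcast that input across the batch dimension. naive_gemm stands in for mkldnn_sgemm (no transposes, alpha = 1, beta = 0); all names here are illustrative, not plugin API.

#include <vector>

// Plain row-major GEMM: C[M x N] = A[M x K] * B[K x N].
static void naive_gemm(int M, int N, int K, const float* A, const float* B, float* C) {
    for (int m = 0; m < M; m++)
        for (int n = 0; n < N; n++) {
            float acc = 0.f;
            for (int k = 0; k < K; k++)
                acc += A[m * K + k] * B[k * N + n];
            C[m * N + n] = acc;
        }
}

// aOff/bOff = {inner-batch stride, outer-batch stride}; a stride of 0
// re-reads the same matrix every iteration, which implements broadcasting.
static void batched_gemm(const float* A, const float* B, float* C,
                         int MB1, int MB2, int M, int N, int K,
                         const std::vector<int>& aOff, const std::vector<int>& bOff) {
    for (int b1 = 0; b1 < MB1; b1++) {
        const float *a = A, *b = B;
        float *c = C;
        for (int b2 = 0; b2 < MB2; b2++) {
            naive_gemm(M, N, K, a, b, c);
            a += aOff[0];
            b += bOff[0];
            c += M * N;      // the output is never broadcast
        }
        A += aOff[1];
        B += bOff[1];
        C += MB2 * M * N;
    }
}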
InferenceEngine::StatusCode rc; @@ -206,11 +199,3 @@ void MKLDNNGenericNode::initDescriptor(const InferenceEngine::LayerConfig &confi constant = ConstantType::Const; } } - -void MKLDNNGenericNode::initOptimalPrimitiveDescriptor() { - auto descriptor = getSelectedPrimitiveDescriptor(); - if (descriptor != nullptr) { - auto config = descriptor->getConfig(); - initDescriptor(config); - } -} diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h index 5cc8b0014..7bdd4a0f3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -17,7 +16,7 @@ namespace MKLDNNPlugin { class MKLDNNGenericNode : public MKLDNNNode { public: MKLDNNGenericNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng); - ~MKLDNNGenericNode() override; + ~MKLDNNGenericNode() = default; void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -30,7 +29,6 @@ public: } void initDescriptor(const InferenceEngine::LayerConfig& config) override; - void initOptimalPrimitiveDescriptor() override; void execLayer(); void cleanup() override; @@ -42,7 +40,6 @@ protected: private: static Register<MKLDNNGenericNode> reg; - MKLDNNExtensionManager::Ptr extensionManager; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp index 0a17a1442..9b42bee6b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h index 134ce8f61..99b4c8657 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp index 32594e315..4b1192b85 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h index b2a5c1829..9d85dabd3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp index b60177cb2..a37a2530b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff 
--git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp index 53ab16c39..ebc67748f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp index aa395a130..c23ce6ee5 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -69,6 +68,21 @@ void MKLDNNPermuteNode::initSupportedPrimitiveDescriptors() { config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c); supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown}); } + } else if (getParentEdgeAt(0)->getDims().ndims() == 5) { + config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::ncdhw); + config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::ncdhw); + supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown}); + + auto srcDims = getParentEdgeAt(0)->getDims(); + if (srcDims[1] % 8 == 0) { + config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c); + supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown}); + } + + if (srcDims[1] % 16 == 0) { + config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c); + supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown}); + } } else { config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::any); config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, @@ -221,6 +235,70 @@ static void permute_to_3012(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& } } +static void permute_to_021(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { + auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData()); + auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData()); + src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding; + dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding; + + const int C = srcMemPtr->GetDims()[1]; + const int S = srcMemPtr->GetDims()[2]; + + parallel_for2d(MB, S, [&](int n, int s) { + int src_off = 0; + int dst_off = 0; + + for (int c = 0; c < C; c++) { + src_off = n * C * S + + c * S + + s; + dst_off = n * S * C + + s * C + + c; + + dst_data[dst_off] = src_data[src_off]; + } + }); +} + +static void permute_to_034152(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { + auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData()); + auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData()); + src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding; + dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding; + + const int DIM1 = srcMemPtr->GetDims()[1]; + const int DIM2 = srcMemPtr->GetDims()[2]; + const int DIM3 = srcMemPtr->GetDims()[3]; + const int DIM4 = 
srcMemPtr->GetDims()[4]; + const int DIM5 = srcMemPtr->GetDims()[5]; + + int src_off = 0; + int dst_off = 0; + + for (int n = 0; n < MB; n++) { + for (int dim3 = 0; dim3 < DIM3; dim3++) { + for (int dim4 = 0; dim4 < DIM4; dim4++) { + for (int dim1 = 0; dim1 < DIM1; dim1++) { + for (int dim5 = 0; dim5 < DIM5; dim5++) { + for (int dim2 = 0; dim2 < DIM2; dim2++) { + src_off = n * DIM1 * DIM2 * DIM3 * DIM4 * DIM5 + + dim1 * DIM2 * DIM3 * DIM4 * DIM5 + + dim2 * DIM3 * DIM4 * DIM5 + + dim3 * DIM4 * DIM5 + + dim4 * DIM5 + + dim5; + + dst_data[dst_off] = src_data[src_off]; + dst_off++; + } + } + } + } + } + } +} + std::map<InferenceEngine::SizeVector, MKLDNNPermuteNode::PermuteImpl> MKLDNNPermuteNode::OptimizedCases = { {{0, 2, 3, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_0231, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { return true; @@ -237,6 +315,12 @@ std::map<InferenceEngine::SizeVector, MKLDNNPermuteNode::PermuteImpl> MKLDNNPerm {{0, 2, 1, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_0213, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()); })}, // shufflenet + {{0, 2, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_021, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { + return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()); + })}, // self attention block + {{0, 3, 4, 1, 5, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_034152, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { + return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()); + })}, // learning-to-see-in-the-dark-sony }; void MKLDNNPermuteNode::execute(mkldnn::stream strm) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h index 5b69b4475..9c0ce0d49 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp index 0ec7c0a26..82e3eac50 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +10,7 @@ #include <vector> #include <mkldnn_types.h> #include <mkldnn_extension_utils.h> +#include <ie_layers_internal.hpp> using namespace mkldnn; using namespace MKLDNNPlugin; @@ -23,12 +23,8 @@ void MKLDNNPoolingNode::getSupportedDescriptors() { return; InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); precision = getCnnLayer()->outData[0]->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); auto * poolingLayer = dynamic_cast<PoolingLayer*>(getCnnLayer().get()); @@ -45,15 +41,16 @@ void MKLDNNPoolingNode::getSupportedDescriptors() { invertVectorCopyUtoI(poolingLayer->_stride, stride); invertVectorCopyUtoI(poolingLayer->_kernel, kernel); - 
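Of the two optimized permute cases registered above, {0, 2, 1} is the one used by self-attention blocks and is just a per-batch 2-D transpose. A reference version on a plain dense layout, independent of the plugin's memory wrappers (names illustrative):

// [MB, C, S] -> [MB, S, C]
static void permute_021_ref(const float* src, float* dst, int MB, int C, int S) {
    for (int n = 0; n < MB; n++)
        for (int s = 0; s < S; s++)
            for (int c = 0; c < C; c++)
                dst[(n * S + s) * C + c] = src[(n * C + c) * S + s];
}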
invertVectorCopyUtoI(poolingLayer->_padding, paddingL); - invertVectorCopyUtoI(poolingLayer->_pads_end, paddingR); + auto allPads = getPaddings(*poolingLayer); + invertVectorCopyUtoI(allPads.begin, paddingL); + invertVectorCopyUtoI(allPads.end, paddingR); auto parentDims = getParentEdgeAt(0)->getDims(); auto childDims = getChildEdgeAt(0)->getDims(); if ((parentDims.ndims() < 4) || (parentDims.ndims() > 5)) THROW_IE_EXCEPTION << "Pooling layer. Unsupported mode. Only 4D and 5D blobs are supported as input."; - for (int i = 0; i < 2; i++) { + for (int i = 0; i < paddingR.size(); i++) { int krn = kernel[i]; int src = getParentEdgeAt(0)->getDims()[2 + i]; int dst = getChildEdgeAt(0)->getDims()[2 + i]; @@ -61,11 +58,11 @@ int calc_dst = (src - krn + paddingL[i]) / stride[i] + 1; paddingR[i] = (dst - calc_dst) * stride[i]; } - if (this->getCnnLayer()->precision == Precision::I8) { - MKLDNNMemoryDesc in_candidate{parentDims, memory::data_type::u8, memory::format::nhwc}; - MKLDNNMemoryDesc out_candidate{childDims, memory::data_type::u8, memory::format::nhwc}; - createDescriptor({in_candidate}, {out_candidate}); + // i8 layers support only the nhwc layout + MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, memory::format::nhwc}; + MKLDNNMemoryDesc out_candidate{childDims, outputDataType, memory::format::nhwc}; + createDescriptor({ in_candidate }, { out_candidate }); } else { // It doesn't support any format for (auto format : getAvailableFormatsForDims(parentDims)) { @@ -97,7 +94,14 @@ void MKLDNNPoolingNode::createDescriptor(const std::vector<InferenceEngine::Tens algorithm alg; if (type == PoolingLayer::PoolType::AVG) { - if (!exclude_pad && (paddingL[0] != 0 || paddingL[1] != 0)) + bool not_zero_l = false; + for (auto lr : paddingL) { + if (lr) { + not_zero_l = true; + break; + } + } + if (!exclude_pad && not_zero_l) alg = pooling_avg_include_padding; else alg = pooling_avg_exclude_padding; @@ -114,7 +118,14 @@ void MKLDNNPoolingNode::createDescriptor(const std::vector<InferenceEngine::Tens stride, kernel, paddingL, paddingR, mkldnn::padding_kind::zero)); - if (alg == pooling_avg_include_padding && (paddingR[0] || paddingR[1])) { + bool not_zero_r = false; + for (auto pr : paddingR) { + if (pr) { + not_zero_r = true; + break; + } + } + if (alg == pooling_avg_include_padding && not_zero_r) { // In case of AVG including paddings the norm coeff should be calculated // taking the original pads into account. So we need to restore // original values (R_padding = L_padding). 
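A worked instance of the paddingR recomputation above, with illustrative numbers rather than values from a real network:

constexpr int src = 224, krn = 3, stride = 2, padL = 0, dst = 112;  // assumed sizes
constexpr int calc_dst = (src - krn + padL) / stride + 1;           // 111
constexpr int padR = (dst - calc_dst) * stride;                     // (112 - 111) * 2 = 2
static_assert(padR == 2, "two extra rows/cols of right padding are implied by the IR");

In other words, the right padding is recomputed to be exactly what is needed so the last kernel window fits and the output reaches the size the IR declares.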
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h index 0af8a8ae5..e5309f494 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp index 360f3459c..01ae0e6fd 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h index 370d694d9..a6fce5cbd 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp index 3b1678079..345b21536 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -71,6 +70,17 @@ void MKLDNNReorderNode::createPrimitive() { if (getSelectedPrimitiveDescriptor() == nullptr) THROW_IE_EXCEPTION << "Preferable primitive descriptor does not set."; + createReorderPrimitive(srcMemPtr->GetDescriptor(), srcMemPtr->GetPrimitive().get_data_handle(), + dstMemPtr->GetDescriptor(), dstMemPtr->GetPrimitive().get_data_handle()); +} + +void MKLDNNReorderNode::createReorderPrimitive(mkldnn::memory::desc srcDesc, void* srcPtr, mkldnn::memory::desc dstDesc, void* dstPtr) { + src_blocked = std::make_shared<MKLDNNMemory>(getEngine()); + src_blocked->Create(srcDesc, srcPtr); + + dst_blocked = std::make_shared<MKLDNNMemory>(getEngine()); + dst_blocked->Create(dstDesc, dstPtr); + mkldnn::primitive_attr attr; if (_scales) { @@ -90,52 +100,12 @@ void MKLDNNReorderNode::createPrimitive() { attr.set_int_output_round_mode(round_nearest); } - if (srcMemPtr->GetSize() == dstMemPtr->GetSize()) { - InferenceEngine::Precision dstPrec = getChildEdgeAt(0)->getDesc().getPrecision(); - InferenceEngine::Precision srcPrec = getParentEdgeAt(0)->getDesc().getPrecision(); - - if ((srcPrec == InferenceEngine::Precision::I8 && dstPrec == InferenceEngine::Precision::U8)) { - // This reorder actually does nothing so we declare it in-place. - dstMemPtr->GetPrimitive().set_data_handle(srcMemPtr->GetPrimitive().get_data_handle()); - } else { - try { - // No autoblocking. Reorder can be applied as is - - reorder::primitive_desc pd = reorder::primitive_desc(srcMemPtr->GetPrimitiveDescriptor(), dstMemPtr->GetPrimitiveDescriptor(), attr); - prim.reset(new mkldnn::reorder(srcMemPtr->GetPrimitive(), dstMemPtr->GetPrimitive())); - } catch (...) {} - } - } else { - // Autoblocking case. nchw<=>nChw8c are only supported, but memory descriptor - // should be with strides. 
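The reorder refactor above moves primitive construction into createReorderPrimitive() so that setDynamicBatchLim() can rebuild the reorder from resized descriptors, while execute() only rebinds data handles. A bare-bones sketch of that create-once / rebind-later pattern against the mkldnn 0.x C++ API this plugin uses; treat it as illustrative, not as the plugin's exact code:

#include <mkldnn.hpp>

// One-time setup; throws if no reorder kernel exists for this pair of
// memory objects (the plugin swallows that case and leaves prim empty).
mkldnn::reorder make_reorder(const mkldnn::memory& src, const mkldnn::memory& dst) {
    return mkldnn::reorder(src, dst);
}

// Per-inference: buffers may move between calls, so only handles are rebound.
void run_reorder(mkldnn::reorder& rdr, mkldnn::memory& src, mkldnn::memory& dst,
                 void* src_ptr, void* dst_ptr) {
    src.set_data_handle(src_ptr);
    dst.set_data_handle(dst_ptr);
    mkldnn::stream(mkldnn::stream::kind::eager).submit({rdr}).wait();
}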
Prepare it from enlarged blob - memory::dims dims = srcMemPtr->GetDims(); - memory::dims dims_dst = dstMemPtr->GetDims(); - - for (int i = 0; i < dims.size(); i++) // min dims is a logical dims - dims[i] = std::min(dims[i], dims_dst[i]); - - memory::desc src_d = srcMemPtr->GetDescriptor(); - void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle(); - - memory::desc dst_d = dstMemPtr->GetDescriptor(); - void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle(); - - for (int i = 0; i < dims.size(); i++) - src_d.data.dims[i] = dst_d.data.dims[i] = dims[i]; - - src_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - src_blocked->Create(src_d, src_data_hdl); - - dst_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - dst_blocked->Create(dst_d, dst_data_hdl); - - // output blob should be zeroed. NaN value can occur in untouched place. - dstMemPtr->FillZero(); - + try { + // No autoblocking. Reorder can be applied as is reorder::primitive_desc pd = reorder::primitive_desc(src_blocked->GetPrimitiveDescriptor(), dst_blocked->GetPrimitiveDescriptor(), attr); prim.reset(new mkldnn::reorder(pd, src_blocked->GetPrimitive(), dst_blocked->GetPrimitive())); - } + } catch (...) {} } const std::vector<impl_desc_type>& MKLDNNReorderNode::getPrimitivesPriority() { @@ -148,32 +118,9 @@ bool MKLDNNReorderNode::created() const { } void MKLDNNReorderNode::execute(mkldnn::stream strm) { - if (prim) { - if (src_blocked) - src_blocked->GetPrimitivePtr()->set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); - if (dst_blocked) - dst_blocked->GetPrimitivePtr()->set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); - MKLDNNNode::execute(strm); - } else { - InferenceEngine::Precision dstPrec = getChildEdgeAt(0)->getDesc().getPrecision(); - InferenceEngine::Precision srcPrec = getParentEdgeAt(0)->getDesc().getPrecision(); - if ((srcPrec == InferenceEngine::Precision::I8 && dstPrec == InferenceEngine::Precision::U8)) { - // Do nothing here - } else { - auto srcBlbPtr = getParentEdgeAt(0)->getBlob(); - auto dstBlbPtr = getChildEdgeAt(0)->getBlob(); - - assert(srcBlbPtr->size() == dstBlbPtr->size()); - int data_size = srcBlbPtr->size(); - - const auto* src_data = srcBlbPtr->cbuffer().as<const float *>(); - auto* dst_data = dstBlbPtr->buffer().as<float *>(); - - InferenceEngine::parallel_for(data_size, [&](int i) { - dst_data[dstBlbPtr->getTensorDesc().offset(i)] = src_data[srcBlbPtr->getTensorDesc().offset(i)]; - }); - } - } + src_blocked->GetPrimitivePtr()->set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); + dst_blocked->GetPrimitivePtr()->set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); + MKLDNNNode::execute(strm); } void MKLDNNReorderNode::setDynamicBatchLim(int lim) { @@ -186,21 +133,12 @@ void MKLDNNReorderNode::setDynamicBatchLim(int lim) { void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle(); void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle(); - if (src_blocked && dst_blocked) { - src_d = src_blocked->GetDescriptor(); - dst_d = dst_blocked->GetDescriptor(); - src_data_hdl = src_blocked->GetPrimitive().get_data_handle(); - dst_data_hdl = dst_blocked->GetPrimitive().get_data_handle(); - } - src_blocked = std::make_shared<MKLDNNMemory>(getEngine()); src_d.data.dims[0] = batchToProcess(); src_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess(); - src_blocked->Create(src_d, src_data_hdl); - dst_blocked = 
std::make_shared<MKLDNNMemory>(getEngine()); dst_d.data.dims[0] = batchToProcess(); dst_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess(); - dst_blocked->Create(dst_d, dst_data_hdl); - prim.reset(new mkldnn::reorder(src_blocked->GetPrimitive(), dst_blocked->GetPrimitive())); + + createReorderPrimitive(src_d, src_data_hdl, dst_d, dst_data_hdl); } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h index 3d74c2000..7a228ecec 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -51,6 +50,8 @@ private: MKLDNNMemoryPtr dst_blocked; MKLDNNMemoryPtr src_blocked; + + void createReorderPrimitive(mkldnn::memory::desc srcDesc, void* srcPtr, mkldnn::memory::desc dstDesc, void* dstPtr); }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp index cfd51bf36..d959aa5f9 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -49,15 +48,6 @@ void MKLDNNReshapeNode::initSupportedPrimitiveDescriptors() { config.outConfs[0].constant = false; config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat); supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); - if (inDims.ndims() == 4 && inDims[1] % 8 == 0 && outDims.ndims() == 4 &&outDims[1] % 8 == 0) { - outFormat = memory::format::any; - } - config.inConfs[0].inPlace = -1; - config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format::any); - config.outConfs[0].inPlace = -1; - config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat); - - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); } void MKLDNNReshapeNode::createPrimitive() { @@ -69,107 +59,6 @@ void MKLDNNReshapeNode::createPrimitive() { THROW_IE_EXCEPTION << "Input memory didn't allocate."; if (getSelectedPrimitiveDescriptor() == nullptr) THROW_IE_EXCEPTION << "Preferable primitive descriptor does not set."; - - if (srcMemPtr->GetData() != dstMemPtr->GetData()) { - InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; - auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - precision = getCnnLayer()->outData[0]->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; - auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - - auto dims = getParentEdgeAt(0)->getDims(); - - srcMem.reset(new MKLDNNMemory(getEngine())); - srcMem->Create(dims, inputDataType, MKLDNNMemory::GetPlainFormat(dims)); - - dstMem.reset(new MKLDNNMemory(getEngine())); - dstMem->Create(getChildEdgeAt(0)->getDims(), outputDataType, - MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims()), srcMem->GetData()); - - if (srcMemPtr->GetSize() == srcMem->GetSize()) { - srcPrim.reset(new 
mkldnn::reorder(srcMemPtr->GetPrimitive(), srcMem->GetPrimitive())); - } else { - // Autoblocking mode - memory::dims dims = srcMem->GetDims(); // contains logical dims - - memory::desc src_d = srcMemPtr->GetDescriptor(); - void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle(); - - for (int i = 0; i < dims.size(); i++) - src_d.data.dims[i] = dims[i]; - - memory::primitive_desc tmp_src_pd(src_d, getEngine()); - src_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - src_blocked->Create(src_d, src_data_hdl); - - srcPrim.reset(new mkldnn::reorder(src_blocked->GetPrimitive(), srcMem->GetPrimitive())); - } - - if (dstMemPtr->GetSize() == dstMem->GetSize()) { - dstPrim.reset(new mkldnn::reorder(dstMem->GetPrimitive(), dstMemPtr->GetPrimitive())); - } else { - // Autoblocking mode - memory::dims dims = srcMem->GetDims(); - - memory::desc dst_d = dstMemPtr->GetDescriptor(); - void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle(); - - for (int i = 0; i < dims.size(); i++) - dst_d.data.dims[i] = dims[i]; - - dst_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - dst_blocked->Create(dst_d, dst_data_hdl); - - dstPrim.reset(new mkldnn::reorder(dst_blocked->GetPrimitive(), dstMemPtr->GetPrimitive())); - } - } -} - -void MKLDNNReshapeNode::setDynamicBatchLim(int lim) { - dynBatchLim = lim; - if (srcPrim && dstPrim) { - auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); - auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr(); - memory::desc src_d = srcMemPtr->GetDescriptor(); - memory::desc dst_d = dstMemPtr->GetDescriptor(); - void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle(); - void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle(); - srcMem = std::make_shared<MKLDNNMemory>(getEngine()); - src_d.data.dims[0] = batchToProcess(); - srcMem->Create(src_d, src_data_hdl); - dstMemPtr = std::make_shared<MKLDNNMemory>(getEngine()); - src_d.data.dims[0] = batchToProcess(); - dstMemPtr->Create(src_d, src_data_hdl); - - if (src_blocked && dst_blocked) { - src_d = src_blocked->GetDescriptor(); - dst_d = dst_blocked->GetDescriptor(); - src_data_hdl = src_blocked->GetPrimitive().get_data_handle(); - dst_data_hdl = dst_blocked->GetPrimitive().get_data_handle(); - } - src_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - src_d.data.dims[0] = batchToProcess(); - src_blocked->Create(src_d, src_data_hdl); - - dst_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - dst_d.data.dims[0] = batchToProcess(); - dst_blocked->Create(dst_d, dst_data_hdl); - srcPrim = std::make_shared<mkldnn::reorder>(src_blocked->GetPrimitive(), srcMem->GetPrimitive()); - dstPrim = std::make_shared<mkldnn::reorder>(dst_blocked->GetPrimitive(), dstMemPtr->GetPrimitive()); - } -} - -void MKLDNNReshapeNode::execute(mkldnn::stream strm) { - if (srcPrim && dstPrim) { - if (src_blocked) - src_blocked->GetPrimitive().set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); - if (dst_blocked) - dst_blocked->GetPrimitive().set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); - strm.submit({*srcPrim, *dstPrim}); - } } bool MKLDNNReshapeNode::created() const { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h index eeb666008..bb30099c9 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // 
SPDX-License-Identifier: Apache-2.0 // @@ -21,19 +20,10 @@ public: void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; void createPrimitive() override; - void execute(mkldnn::stream strm) override; bool created() const override; - void setDynamicBatchLim(int lim) override; private: static Register<MKLDNNReshapeNode> reg; - std::shared_ptr<mkldnn::primitive> srcPrim; - std::shared_ptr<mkldnn::primitive> dstPrim; - MKLDNNMemoryPtr srcMem; - MKLDNNMemoryPtr dstMem; - - MKLDNNMemoryPtr dst_blocked; - MKLDNNMemoryPtr src_blocked; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp index a474ca926..ba3228543 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp @@ -1,12 +1,11 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // #include "mkldnn_rnn.h" #include "mkldnn_extension_utils.h" #include "desc_iterator.hpp" -#include <ie_layers.h> +#include <ie_layers_prv.h> #include <string> #include <utility> @@ -16,39 +15,143 @@ using namespace InferenceEngine; namespace MKLDNNPlugin { -MKLDNNRNN::MKLDNNRNN(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {} +template <typename T, typename P> +inline bool one_of(T val, P item) { return val == item; } +template <typename T, typename P, typename... Args> +inline bool one_of(T val, P item, Args... item_others) { + return val == item || one_of(val, item_others...); +} + +rnn_direction ie2mkl(RNNLayer::Direction &direction) { + return direction == RNNLayer::RNN_FWD ? unidirectional_left2right + : direction == RNNLayer::RNN_BWD ? unidirectional_right2left + : direction == RNNLayer::RNN_BDR ? bidirectional_concat + : unidirectional; +} + +MKLDNNRNN::MKLDNNRNN(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) { + is_cell = layer->type == "LSTMCell"; +} bool MKLDNNRNN::created() const { - return getType() == RNN; + return getType() == (is_cell ? LSTMCell : RNN); } void MKLDNNRNN::getSupportedDescriptors() { + if (is_cell) + fillCellDesc(); + else + fillSeqDesc(); +} + +void MKLDNNRNN::fillCellDesc() { + if (!descs.empty()) return; + auto cellLayer = std::dynamic_pointer_cast<InferenceEngine::LSTMCell>(getCnnLayer()); + + if (!cellLayer) + THROW_IE_EXCEPTION << "Wrong RNN layer representation. 
Cannot cast to LSTMCell."; + + auto &ins = cellLayer->insData; + auto &outs = cellLayer->outData; + + if (ins.size() != 3) + THROW_IE_EXCEPTION << "Incorrect number of input ports for layer " << getName(); + if (outs.size() != 2) + THROW_IE_EXCEPTION << "Incorrect number of output ports for layer " << getName(); + + auto in_data_dims = getParentEdgeAt(0)->getDims(); + auto in_h_state_dims = getParentEdgeAt(1)->getDims(); + auto in_c_state_dims = getParentEdgeAt(2)->getDims(); + + auto out_h_state_dims = getChildEdgeAt(0)->getDims(); + auto out_c_state_dims = getChildEdgeAt(1)->getDims(); + + if (in_data_dims.ndims() != 2 + || in_h_state_dims.ndims() != 2 + || in_c_state_dims.ndims() != 2 + || out_h_state_dims.ndims() != 2 + || out_c_state_dims.ndims() != 2) + THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); + + T = 1; + N = in_data_dims[0]; + DC = in_data_dims[1]; + SC = in_h_state_dims[1]; + + // Expected shapes + MKLDNNDims D_shape {N, DC}, S_shape {N, SC}; + + if (in_data_dims != D_shape + || in_h_state_dims != S_shape + || in_c_state_dims != S_shape + || out_h_state_dims != S_shape + || out_c_state_dims != S_shape) + THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); + + auto blobs = cellLayer->blobs; + Blob::Ptr weights, bias; + if (blobs.find("weights") != blobs.end()) weights = blobs["weights"]; + if (blobs.find("biases") != blobs.end()) bias = blobs["biases"]; + + if (!weights) + THROW_IE_EXCEPTION << "RNN Layer. Weights are not present."; + + if (weights->size() != G*SC*(SC+DC)) + THROW_IE_EXCEPTION << "RNN Layer. Weights size is not correct. Expected size:" << G*SC*(SC+DC); + + if (bias && bias->size() != G*SC) + THROW_IE_EXCEPTION << "RNN Layer. Biases size is not correct. Expected size:" << G*SC; + + // Shapes and attributes are correct. Can start internal initialization. + + in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc}; + out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc}; + + in_data_d = {{T, N, DC}, memory::f32, memory::tnc}; + out_data_d = {{T, N, SC}, memory::f32, memory::tnc}; + + w_data_d = {{L, D, DC, G, SC}, memory::f32, memory::ldigo}; + w_state_d = {{L, D, SC, G, SC}, memory::f32, memory::ldigo}; + + if (bias) + w_bias_d = {{L, D, G, SC}, memory::f32, memory::ldgo}; + + std::vector<TensorDesc> in_candidate; + in_candidate.emplace_back(MKLDNNMemoryDesc {D_shape, memory::f32, memory::nc}); + in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + + std::vector<TensorDesc> out_candidate; + out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + + createDescriptor(in_candidate, out_candidate); +} + +void MKLDNNRNN::fillSeqDesc() { if (!descs.empty()) return; auto rnnLayer = std::dynamic_pointer_cast<RNNLayer>(getCnnLayer()); if (!rnnLayer) THROW_IE_EXCEPTION << "Wrong RNN layer representation. 
Cannot cast to RNNLayer."; - if (rnnLayer->cellType == LSTM) - cellr_type = LSTM; - else + if (!one_of(rnnLayer->cellType, "LSTM")) THROW_IE_EXCEPTION << "RNN layer supports only LSTM like cell"; - swap_state = rnnLayer->params["swap_state"] == "YES"; + if (!one_of(rnnLayer->axis, 0, 1)) + THROW_IE_EXCEPTION << "RNN layer supports only sequence axis 0 or 1"; + nativeOrder = rnnLayer->axis == 0; - if (rnnLayer->_axis == 0) - nativeOrder = true; - else if (rnnLayer->_axis == 1) - nativeOrder = false; - else - THROW_IE_EXCEPTION << "RNN layer supports only sequence axis == 1"; + if (!one_of(rnnLayer->direction, RNNLayer::RNN_FWD, RNNLayer::RNN_BWD)) + THROW_IE_EXCEPTION << "RNN layer supports only unidirectional RNN layer"; + direction = ie2mkl(rnnLayer->direction); auto &ins = rnnLayer->insData; auto &outs = rnnLayer->outData; - if (ins.size() != 3 && ins.size() != 1) + if (!one_of(ins.size(), 3, 1)) THROW_IE_EXCEPTION << "Incorrect number of input ports for layer " << getName(); - if (outs.size() != 3 && outs.size() !=1) + if (!one_of(outs.size(), 3, 1)) THROW_IE_EXCEPTION << "Incorrect number of output ports for layer " << getName(); auto in_data_dims = getParentEdgeAt(0)->getDims(); @@ -62,31 +165,21 @@ void MKLDNNRNN::getSupportedDescriptors() { std::swap(out_data_dims[0], out_data_dims[1]); } - // IE specific order - seq = in_data_dims[0]; - batch = in_data_dims[1]; - data_len = in_data_dims[2]; - state_len = out_data_dims[2]; - - const int N = batch; - const int T = seq; - const int G = num_gates; - const int DC = data_len; - const int SC = state_len; - const int L = 1; // What is a L ?? - const int D = 1; - const int S = 2; - - if (out_data_dims != MKLDNNDims {T, N, SC}) - THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); + T = in_data_dims[0]; + N = in_data_dims[1]; + DC = in_data_dims[2]; + SC = out_data_dims[2]; - MKLDNNDims state_dims {batch, state_len}; + MKLDNNDims ID_shape {T, N, DC}, OD_shape {T, N, SC}, S_shape {N, SC}; + + if (out_data_dims != OD_shape) + THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); if (ins.size() == 3) { auto state_dims1 = getParentEdgeAt(1)->getDims(); auto stats_dims2 = getParentEdgeAt(2)->getDims(); - if (state_dims1 != state_dims || stats_dims2 != state_dims) + if (state_dims1 != S_shape || stats_dims2 != S_shape) THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName(); in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc}; @@ -96,7 +189,7 @@ void MKLDNNRNN::getSupportedDescriptors() { auto state_dims1 = getChildEdgeAt(1)->getDims(); auto stats_dims2 = getChildEdgeAt(2)->getDims(); - if (state_dims1 != state_dims || stats_dims2 != state_dims) + if (state_dims1 != S_shape || stats_dims2 != S_shape) THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName(); out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc}; @@ -133,8 +226,8 @@ void MKLDNNRNN::getSupportedDescriptors() { in_candidate.push_back(MKLDNNMemoryDesc{{N, T, DC}, memory::f32, memory::ntc}); if (ins.size() == 3) { - in_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc}); - in_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc}); + in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); } std::vector<TensorDesc> out_candidate; @@ -144,8 +237,8 @@ void MKLDNNRNN::getSupportedDescriptors() { 
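A worked instance of the weight-size checks in fillCellDesc() above, with illustrative channel counts:

constexpr int G = 4;                            // LSTM gates
constexpr int DC = 512, SC = 256;               // assumed input/state channels
constexpr int weights_sz = G * SC * (SC + DC);  // 4 * 256 * 768 = 786432 floats
constexpr int bias_sz = G * SC;                 // 1024 floats
static_assert(weights_sz == 786432 && bias_sz == 1024, "worked example");

The single IE weights blob carries both parts: the repack loop in createPrimitive() below walks it once, splitting each output row into its DC-wide data part (w_data_d) and SC-wide recurrent part (w_state_d).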
out_candidate.push_back(MKLDNNMemoryDesc{{N, T, SC}, memory::f32, memory::ntc}); if (outs.size() == 3) { - out_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc}); - out_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc}); + out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); } createDescriptor(in_candidate, out_candidate); @@ -156,7 +249,7 @@ void MKLDNNRNN::createDescriptor(const std::vector<TensorDesc> &inputDesc, MKLDNNDescriptor desc(std::shared_ptr<rnn_forward::desc>( new rnn_forward::desc(forward_scoring, {algorithm::vanilla_lstm, algorithm::eltwise_tanh }, - unidirectional, + direction, /* In Data */ in_data_d, /* In State */ in_state_d, /* Weights data */ w_data_d, @@ -194,13 +287,8 @@ void MKLDNNRNN::createPrimitive() { std::shared_ptr<rnn_forward::desc> d = descs[0]; rnn_forward::primitive_desc pd(*d, getEngine()); - auto src_data_mem = std::make_shared<MKLDNNMemory>(getEngine()); - src_data_mem->Create(in_data_d, getParentEdgeAt(0)->getMemoryPtr()->GetData()); - internalBlobMemory.push_back(src_data_mem); - - auto dst_data_mem = std::make_shared<MKLDNNMemory>(getEngine()); - dst_data_mem->Create(out_data_d, getChildEdgeAt(0)->getMemoryPtr()->GetData()); - internalBlobMemory.push_back(dst_data_mem); + auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr(); + auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr(); // create weight blobs (data and state part) auto w_data_mem = std::make_shared<MKLDNNMemory>(getEngine()); @@ -229,28 +317,27 @@ void MKLDNNRNN::createPrimitive() { * * Gate order * Caffe - IFOC, ONNX - IOFC - * IE - FICO, mkldnn - FIOC - * + * IE - FICO, mkldnn - IFCO */ - // FICO -> FIOC - const int gate_map[] = {0, 1, 3, 2}; + // FICO -> IFCO + const int gate_map[] = {1, 0, 2, 3}; auto ie_w_ptr = getCnnLayer()->blobs["weights"]->buffer().as<const float*>(); auto w_ptr = static_cast<float*>(w_data_mem->GetData()); auto r_ptr = static_cast<float*>(w_state_mem->GetData()); - const int step = state_len * num_gates; + const int step = SC * G; - for (int g = 0; g < num_gates; g++) { - for (int out_i = 0; out_i < state_len; out_i++) { - float *l_w_ptr = w_ptr + gate_map[g]*state_len + out_i; - float *l_r_ptr = r_ptr + gate_map[g]*state_len + out_i; - for (int in_i = 0; in_i < data_len; in_i++) { + for (int g = 0; g < G; g++) { + for (int out_i = 0; out_i < SC; out_i++) { + float *l_w_ptr = w_ptr + gate_map[g]*SC + out_i; + float *l_r_ptr = r_ptr + gate_map[g]*SC+ out_i; + for (int in_i = 0; in_i < DC; in_i++) { *l_w_ptr = *ie_w_ptr; ie_w_ptr++; l_w_ptr += step; } - for (int in_i = 0; in_i < state_len; in_i++) { + for (int in_i = 0; in_i < SC; in_i++) { *l_r_ptr = *ie_w_ptr; ie_w_ptr++; l_r_ptr += step; @@ -261,9 +348,9 @@ void MKLDNNRNN::createPrimitive() { if (w_bias_d) { auto ie_b_ptr = getCnnLayer()->blobs["biases"]->buffer().as<const float*>(); auto b_ptr = static_cast<float*>(w_bias_mem->GetData()); - for (int g = 0; g < num_gates; g++) { - float *l_b_ptr = b_ptr + gate_map[g]*state_len; - for (int out_i = 0; out_i < state_len; out_i++) { + for (int g = 0; g < G; g++) { + float *l_b_ptr = b_ptr + gate_map[g]*SC; + for (int out_i = 0; out_i < SC; out_i++) { *l_b_ptr = *ie_b_ptr; ie_b_ptr++; l_b_ptr++; @@ -293,37 +380,35 @@ void MKLDNNRNN::createPrimitive() { src_stat_1.get_primitive_desc().get_size()); internalBlobMemory.push_back(high_half_state_mem); - if (!swap_state) { - 
exec_before.emplace_back(src_stat_1, low_half_state_mem->GetPrimitive()); - exec_before.emplace_back(src_stat_2, high_half_state_mem->GetPrimitive()); - } else { - exec_before.emplace_back(src_stat_2, low_half_state_mem->GetPrimitive()); - exec_before.emplace_back(src_stat_1, high_half_state_mem->GetPrimitive()); - } + exec_before.emplace_back(src_stat_1, low_half_state_mem->GetPrimitive()); + exec_before.emplace_back(src_stat_2, high_half_state_mem->GetPrimitive()); } auto dst_state_mem = std::make_shared<MKLDNNMemory>(getEngine()); dst_state_mem->Create(out_state_d); internalBlobMemory.push_back(dst_state_mem); if (out_state_d) { + int idx_H = is_cell ? 0 : 1; + int idx_C = is_cell ? 1 : 2; /* create copy/split primitive */ - auto dst_stat_1 = getChildEdgeAt(1)->getMemory().GetPrimitive(); - auto dst_stat_2 = getChildEdgeAt(2)->getMemory().GetPrimitive(); + auto dst_stat_1 = getChildEdgeAt(idx_H)->getMemory().GetPrimitive(); + auto dst_stat_2 = getChildEdgeAt(idx_C)->getMemory().GetPrimitive(); auto low_half_state_mem = std::make_shared<MKLDNNMemory>(getEngine()); low_half_state_mem->Create( dst_stat_1.get_primitive_desc().desc(), - src_state_mem->GetPrimitive().get_data_handle()); + dst_state_mem->GetPrimitive().get_data_handle()); internalBlobMemory.push_back(low_half_state_mem); auto high_half_state_mem = std::make_shared<MKLDNNMemory>(getEngine()); high_half_state_mem->Create( dst_stat_2.get_primitive_desc().desc(), - static_cast<uint8_t*>(src_state_mem->GetPrimitive().get_data_handle()) + + static_cast<uint8_t*>(dst_state_mem->GetPrimitive().get_data_handle()) + dst_stat_1.get_primitive_desc().get_size()); internalBlobMemory.push_back(high_half_state_mem); - exec_after.emplace_back(low_half_state_mem->GetPrimitive(), dst_stat_1); + + if (!is_cell) exec_after.emplace_back(low_half_state_mem->GetPrimitive(), dst_stat_1); exec_after.emplace_back(high_half_state_mem->GetPrimitive(), dst_stat_2); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h index a47fdf41c..4399c306a 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -28,18 +27,30 @@ public: void execute(mkldnn::stream strm) override; private: + void fillCellDesc(); + void fillSeqDesc(); + +private: static Register<MKLDNNRNN> reg; - InferenceEngine::CellType cellr_type = InferenceEngine::CellType::LSTM; + /** Specify mode Cell or Seq. true - Cell, false - Seq */ + bool is_cell = false; + /** Native order if [batch, seq, data], other case is [seq, batch, data] */ bool nativeOrder = true; - bool swap_state = false; - int batch = 0; - int seq = 0; - int data_len = 0; - int state_len = 0; - const size_t num_gates = 4; + /** Direction of iteration through sequence dimension */ + mkldnn::rnn_direction direction = mkldnn::unidirectional; + + // Internal attributes + int N = 0; /**< Batch value */ + int T = 0; /**< Sequence value */ + int DC = 0; /**< Input data channel size */ + int SC = 0; /**< State channel size value */ + const int G = 4; /**< Gate size. 4 for LSTM */ + const int L = 1; /**< What is it??. Constant for mkldnn impl */ + const int D = 1; /**< Num of direction. 1 or 2 */ + const int S = 2; /**< Num of state. 2 for LSTM (hidden and sell state). 
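The gate_map above converts IE's FICO gate order into the IFCO order mkldnn expects: source gate g lands at destination slot gate_map[g]. A minimal version of that permutation on a bias-shaped [G, SC] buffer, detached from the plugin's strided pointer arithmetic:

static void repack_gates_fico_to_ifco(const float* src, float* dst, int SC) {
    const int gate_map[4] = {1, 0, 2, 3};  // F->1, I->0, C->2, O->3
    for (int g = 0; g < 4; g++)
        for (int c = 0; c < SC; c++)
            dst[gate_map[g] * SC + c] = src[g * SC + c];
}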
*/ MKLDNNMemoryDesc in_data_d; MKLDNNMemoryDesc out_data_d; @@ -51,6 +62,7 @@ private: MKLDNNMemoryDesc w_state_d; MKLDNNMemoryDesc w_bias_d; + // List of in/out reorders if required std::vector<mkldnn::reorder> exec_before; std::vector<mkldnn::reorder> exec_after; }; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp index 7d76243f9..4088a1f7a 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h index 401a1c7d3..ca2bafd4f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp index 0738f0054..752172733 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h index 792a634c9..8e199f377 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp index 618479c22..90cf4f401 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -24,16 +23,15 @@ void MKLDNNSplitNode::getSupportedDescriptors() { if (splitLayer == nullptr) THROW_IE_EXCEPTION << "Cannot convert split layer."; - axis = splitLayer->_axis; - - if (axis != 1) - THROW_IE_EXCEPTION << "Split support only axis 1."; - if (getParentEdges().size() != 1) THROW_IE_EXCEPTION << "Incorrect number of input nodes."; if (getChildEdges().empty()) THROW_IE_EXCEPTION << "Incorrect number of output nodes."; + axis = splitLayer->_axis; + if (axis >= getParentEdgeAt(0)->getDims().ndims()) + THROW_IE_EXCEPTION << "Invalid value of axis parameter in split layer"; + // WA. 
Check applicability and limitations for (size_t i = 1; i < getCnnLayer()->outData.size(); i++) { int num_port_connection = getCnnLayer()->outData[i]->inputTo.size(); @@ -72,7 +70,7 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { if (srcDims.ndims() < 2) THROW_IE_EXCEPTION << "Split " << getName() << " isn't supported 1d blobs"; - auto num_chanels = 0; + auto axis_size = 0; auto dstFirstDims = getChildEdgeAt(0)->getDims(); for (size_t i = 0; i < outDims.size(); i++) { auto o_Dims = outDims[i]; @@ -83,15 +81,15 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { config.outConfs[i].inPlace = -1; config.outConfs[i].constant = false; config.outConfs[i].desc = MKLDNNMemoryDesc(o_Dims, outputDataType, memory::format::any); - num_chanels += o_Dims[1]; + axis_size += o_Dims[axis]; for (size_t j = 0; j < dstFirstDims.ndims(); j++) { if (j == axis) continue; if (o_Dims[j] != dstFirstDims[j]) - THROW_IE_EXCEPTION << "Split " << getName() << "has incorrect output dimensions"; + THROW_IE_EXCEPTION << "Split " << getName() << " has incorrect output dimensions"; } } - dstFirstDims[1] = num_chanels; + dstFirstDims[axis] = axis_size; if (dstFirstDims.size() != srcDims.size()) THROW_IE_EXCEPTION << "The sizes of input blob and sum of output blobs are not equal."; supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref); @@ -99,11 +97,10 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { auto numOfDim = static_cast<size_t>(srcDims.ndims()); SizeVector order; - SizeVector offsets; + SizeVector offsets(numOfDim, 0lu); size_t offset = std::numeric_limits<size_t>::max(); for (size_t i = 0; i < numOfDim; i++) { order.push_back(i); - offsets.push_back(0); } SizeVector strides(numOfDim); @@ -125,23 +122,23 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { } supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); - if (numOfDim != 4) + if ((numOfDim != 4 && numOfDim != 5) || axis != 1) return; - order = {0, 1, 2, 3, 1}; - offsets = {0, 0, 0, 0, 0}; - numOfDim = 5; + order.push_back(1); + numOfDim = order.size(); + offsets = SizeVector(numOfDim, 0lu); // nChw8c and nChw16c - for (int sizeS : {8, 16}) { + for (size_t sizeS : {8lu, 16lu}) { SizeVector blkDims = srcDims.ToSizeVector(); if (blkDims[1] % sizeS) continue; - blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0); + blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu); blkDims.push_back(sizeS); strides.resize(numOfDim); - strides[numOfDim - 1] = 1; + strides[numOfDim - 1] = 1lu; for (size_t i = 2; i <= numOfDim; i++) { if (numOfDim - i < axis) { strides[numOfDim - i] = std::numeric_limits<size_t>::max(); @@ -160,9 +157,9 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { canInplace = false; break; } - blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0); + blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 
1lu : 0lu); blkDims.push_back(sizeS); - config.outConfs[i].desc = TensorDesc(Precision::FP32, outDims, {blkDims, order, offset, offsets, strides}); + config.outConfs[i].desc = TensorDesc(Precision::FP32, outDims, {blkDims, order, offset, offsets, strides}); } if (canInplace) supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); @@ -190,18 +187,32 @@ void MKLDNNSplitNode::execute(mkldnn::stream strm) { int MB = batchToProcess(); auto srcBlob = getParentEdgeAt(0)->getBlob(); const auto *srcData = srcBlob->cbuffer().as<const float *>(); + + size_t outerSize = 1; + for (int i = 0; i < axis; i++) { + if (i == 0) + outerSize *= MB; + else + outerSize *= srcBlob->dims()[srcBlob->dims().size() - i - 1]; + } + size_t srcSize = getParentEdgeAt(0)->getMemory().GetSize(); - size_t src_batch_off = srcBlob->getTensorDesc().offset(srcBlob->size() / srcBlob->getTensorDesc().getDims()[0]) + size_t src_batch_off = srcBlob->getTensorDesc().offset(srcBlob->size() / outerSize) - srcBlob->getTensorDesc().offset(0); for (size_t i = 0, sIdx = 0; i < getChildEdges().size(); i++) { auto dstBlob = getChildEdgeAt(i)->getBlob(); auto *dstData = dstBlob->buffer().as<float *>(); - size_t dst_slice_size = dstBlob->size() / dstBlob->getTensorDesc().getDims()[0]; - size_t dst_batch_off = dstBlob->getTensorDesc().offset(dst_slice_size) - dstBlob->getTensorDesc().offset(0); - for (size_t dIdx = 0; dIdx < dst_slice_size; dIdx++, sIdx++) { - for (unsigned b = 0; b < MB; b++) { + size_t innerSize = 1; + for (size_t j = axis; j < dstBlob->dims().size(); j++) { + innerSize *= dstBlob->dims()[dstBlob->dims().size() - j - 1]; + } + + size_t dst_batch_off = dstBlob->getTensorDesc().offset(innerSize) - dstBlob->getTensorDesc().offset(0); + + for (size_t dIdx = 0; dIdx < innerSize; dIdx++, sIdx++) { + for (unsigned b = 0; b < outerSize; b++) { if (sIdx + b*src_batch_off >= srcSize) THROW_IE_EXCEPTION << "Incorrect configuration of split layer " << getName() << "!"; dstData[b * dst_batch_off + dstBlob->getTensorDesc().offset(dIdx)] = @@ -436,3 +447,13 @@ void MKLDNNSplitNode::initOptimalPrimitiveDescriptor() { } initDescriptor(config); } + +void MKLDNNSplitNode::setDynamicBatchLim(int lim) { + if (axis == 0) + THROW_IE_EXCEPTION << "Dynamic batch is not supported by split layer with axis == 0 parameter"; + + dynBatchLim = lim; + if (prim) { + prim.setBatchLimit(batchToProcess(), getParentEdges().size(), getChildEdges().size()); + } +} diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h index 7d4157768..905f8069c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -26,6 +25,8 @@ public: bool isOptimized(); void initOptimalPrimitiveDescriptor() override; + void setDynamicBatchLim(int lim) override; + private: static Register<MKLDNNSplitNode> reg; size_t axis = 1; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp index 204ea868d..122671681 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -49,9 +48,11 @@ void MKLDNNTileNode::initSupportedPrimitiveDescriptors() 
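The generalized split above stops hard-coding axis 1 and instead views the blob as [outer, axis, inner], where outer is the product of the dimensions before the split axis and inner the product of those after it. A compact dense-layout sketch of that decomposition (names illustrative):

#include <cstring>
#include <vector>

// Copy one output slice [axis_off, axis_off + axis_len) of a split along `axis`.
static void split_slice(const float* src, float* dst, const std::vector<int>& dims,
                        int axis, int axis_off, int axis_len) {
    int outer = 1, inner = 1;
    for (int i = 0; i < axis; i++) outer *= dims[i];
    for (size_t i = axis + 1; i < dims.size(); i++) inner *= dims[i];
    for (int o = 0; o < outer; o++)
        std::memcpy(dst + o * axis_len * inner,
                    src + (o * dims[axis] + axis_off) * inner,
                    sizeof(float) * axis_len * inner);
}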
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
index 204ea868d..122671681 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -49,9 +48,11 @@ void MKLDNNTileNode::initSupportedPrimitiveDescriptors() {
         fmt = memory::format::nc;
     } else if (inDims.ndims() == 4) {
         fmt = memory::format::nchw;
+    } else if (inDims.ndims() == 5) {
+        fmt = memory::format::ncdhw;
     }
     if (fmt == memory::format::any) {
-        THROW_IE_EXCEPTION << "Tile " << getName() << " supports only 2d and 4d dimensions!";
+        THROW_IE_EXCEPTION << "Tile " << getName() << " supports only 2D, 4D and 5D dimensions!";
     }
 
     InferenceEngine::LayerConfig config;
@@ -101,14 +102,16 @@ void MKLDNNTileNode::execute(mkldnn::stream strm) {
         m_inner_dim *= batchToProcess();
     }
 
-    if (m_inner_dim == 1 && inDims.size() == 4 && m_outer_dim%8 == 0 && srcMemory.GetFormat() == memory::nChw8c) {
+    if (m_inner_dim == 1 && m_outer_dim % 8 == 0 && ((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw8c) ||
+            (inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw8c))) {
         /*
          * We may enable tile processing directly to appropriate output format (nChw8c)
          */
         m_inner_dim *= 8;
         m_outer_dim /= 8;
-    } else if (m_inner_dim == 1 && inDims.size() == 4 && m_outer_dim%16 == 0
-               && srcMemory.GetFormat() == memory::nChw16c) {
+    } else if (m_inner_dim == 1 && m_outer_dim % 16 == 0 &&
+            ((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw16c) ||
+             (inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw16c))) {
         /*
          * We may enable tile processing directly to appropriate output format (nChw16c)
          */
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
index 464c15017..d6a75941f 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
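The folding in execute() above relies on blocked layouts keeping a group of 8 (or 16) channels contiguous in memory, so a whole channel block can be absorbed into the tile's inner dimension. A hypothetical offset helper (not from the commit; shape and names invented) showing why channels c..c+7 of the same spatial position are adjacent in nChw8c:

```cpp
// Why a block of 8 channels is contiguous in nChw8c: the channel
// remainder (c % 8) is the innermost index in the layout.
#include <cstddef>
#include <iostream>

// Offset of logical element (n, c, h, w) in an nChw8c-laid-out buffer.
size_t offset_nChw8c(size_t n, size_t c, size_t h, size_t w,
                     size_t C, size_t H, size_t W) {
    const size_t block = 8;
    const size_t Cb = (C + block - 1) / block;  // number of channel blocks
    return (((n * Cb + c / block) * H + h) * W + w) * block + c % block;
}

int main() {
    // Channels 0..7 of the same (n, h, w) occupy 8 consecutive elements:
    std::cout << offset_nChw8c(0, 0, 0, 0, 16, 4, 4) << " "    // 0
              << offset_nChw8c(0, 7, 0, 0, 16, 4, 4) << "\n";  // 7
    return 0;
}
```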
diff --git a/inference-engine/src/mkldnn_plugin/perf_count.h b/inference-engine/src/mkldnn_plugin/perf_count.h
index 87f0c5ffc..3770a2435 100644
--- a/inference-engine/src/mkldnn_plugin/perf_count.h
+++ b/inference-engine/src/mkldnn_plugin/perf_count.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp b/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
new file mode 100644
index 000000000..24d2931af
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
@@ -0,0 +1,370 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "blob_dump.h"
+#include "blob_factory.hpp"
+#include "mkldnn_memory.h"
+
+// It's so bad to include by relative path :-(
+#include "../../thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp"
+
+#include <fstream>
+
+using namespace InferenceEngine;
+
+namespace MKLDNNPlugin {
+
+// IEB file format routines
+static unsigned char IEB_MAGIC[4] = {'I', 'E', 'B', '0'};
+static unsigned char NO_SCALES = 0xFF;
+
+struct IEB_HEADER {
+    unsigned char magic[4];
+    unsigned char ver[2];
+
+    unsigned char precision;  // 0-8
+    unsigned char ndims;
+    unsigned int  dims[7];    // max is 7-D blob
+
+    unsigned char scaling_axis;  // FF - no scaling
+    unsigned char reserved[3];
+
+    unsigned long data_offset;
+    unsigned long data_size;
+    unsigned long scaling_data_offset;
+    unsigned long scaling_data_size;
+};
+
+static IEB_HEADER prepare_header(const TensorDesc& desc) {
+    IEB_HEADER header;
+
+    header.magic[0] = IEB_MAGIC[0];
+    header.magic[1] = IEB_MAGIC[1];
+    header.magic[2] = IEB_MAGIC[2];
+    header.magic[3] = IEB_MAGIC[3];
+
+    // IEB file format version 0.1
+    header.ver[0] = 0;
+    header.ver[1] = 1;
+
+    header.precision = desc.getPrecision();
+
+    if (desc.getDims().size() > 7)
+        THROW_IE_EXCEPTION << "Dumper supports at most 7D blobs";
+
+    header.ndims = desc.getDims().size();
+    for (int i = 0; i < header.ndims; i++)
+        header.dims[i] = desc.getDims()[i];
+
+    header.scaling_axis = NO_SCALES;
+
+    return header;
+}
+
+static TensorDesc parse_header(IEB_HEADER &header) {
+    if (header.magic[0] != IEB_MAGIC[0] ||
+        header.magic[1] != IEB_MAGIC[1] ||
+        header.magic[2] != IEB_MAGIC[2] ||
+        header.magic[3] != IEB_MAGIC[3])
+        THROW_IE_EXCEPTION << "Dumper cannot parse file. Wrong format.";
+
+    if (header.ver[0] != 0 ||
+        header.ver[1] != 1)
+        THROW_IE_EXCEPTION << "Dumper cannot parse file. Unsupported IEB format version.";
+
+    Precision prc = Precision(static_cast<Precision::ePrecision>(header.precision));
+    SizeVector dims(header.ndims);
+    for (int i = 0; i < header.ndims; i++)
+        dims[i] = header.dims[i];
+
+    return TensorDesc {prc, dims, plain_layout(dims)};
+}
+
+
+bool is_plain(Blob::Ptr blob) {
+    bool res = true;
+
+    auto orig_strides = blob->getTensorDesc().getBlockingDesc().getStrides();
+    auto orig_order = blob->getTensorDesc().getBlockingDesc().getOrder();
+    auto dims = blob->getTensorDesc().getDims();
+
+    for (int stride = 1, i = dims.size()-1; i >= 0; --i) {
+        if (stride != orig_strides[i] || i != orig_order[i]) res = false;
+        stride *= dims[i];
+    }
+
+    return res;
+}
+
+static Blob::Ptr prepare_plain_data(Blob::Ptr blob) {
+    // check if it is already plain
+    if (is_plain(blob)) return blob;
+
+    Blob::Ptr pln_blob = make_plain_blob(blob->precision(), blob->getTensorDesc().getDims());
+    pln_blob->allocate();
+
+    // Copy to plain
+    MKLDNNMemoryDesc mdesc(blob->getTensorDesc());
+    mkldnn::memory::desc desc = mdesc;
+    mkldnn::impl::memory_desc_wrapper blob_wrp(desc.data);
+
+    size_t data_size = blob->size();
+
+    // TODO: make it with blob_copy utility
+    switch (blob->precision()) {
+        case Precision::FP32:
+        case Precision::I32: {
+            int32_t *pln_blob_ptr = pln_blob->buffer().as<int32_t*>();
+            int32_t *blob_ptr = blob->buffer().as<int32_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+            break;
+        }
+        case Precision::I16:
+        case Precision::U16: {
+            int16_t *pln_blob_ptr = pln_blob->buffer().as<int16_t*>();
+            int16_t *blob_ptr = blob->buffer().as<int16_t *>();
+            for (size_t i = 0; i < data_size; i++)
+                pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+            break;
+        }
+        case Precision::I8:
+        case Precision::U8: {
+            int8_t *pln_blob_ptr = pln_blob->buffer().as<int8_t*>();
+            int8_t *blob_ptr = blob->buffer().as<int8_t *>();
+            for (size_t i = 0; i < data_size; i++)
+                pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+            break;
+        }
+        default:
+            THROW_IE_EXCEPTION << "Dumper. Unsupported precision";
+    }
+
+    return pln_blob;
+}
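The is_plain() test above amounts to checking that the strides are the dense row-major strides and the dimension order is the identity permutation. A self-contained illustration with invented NCHW numbers:

```cpp
// A blob is "plain" when walking the dims from innermost to outermost
// reproduces its strides exactly and no dimension is permuted.
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const std::vector<size_t> dims    = {2, 3, 4, 5};    // NCHW shape
    const std::vector<size_t> strides = {60, 20, 5, 1};  // dense strides
    const std::vector<size_t> order   = {0, 1, 2, 3};    // identity order

    bool plain = true;
    for (size_t stride = 1, i = dims.size(); i-- > 0;) {
        if (stride != strides[i] || i != order[i]) plain = false;
        stride *= dims[i];
    }
    std::cout << (plain ? "plain" : "not plain") << "\n";  // prints "plain"
    return 0;
}
```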
+void BlobDumper::dump(std::ostream &stream) {
+    if (!_blob)
+        THROW_IE_EXCEPTION << "Dumper cannot dump empty Blob";
+
+    if (_blob->buffer().as<float*>() == nullptr)
+        THROW_IE_EXCEPTION << "Dumper cannot dump. Blob is not allocated.";
+
+    IEB_HEADER header = prepare_header(_blob->getTensorDesc());
+    Blob::Ptr pln_blob = prepare_plain_data(_blob);
+
+    header.data_offset = sizeof(header);
+    header.data_size = pln_blob->byteSize();
+    header.scaling_data_offset = 0;
+    header.scaling_data_size = 0;
+
+    if (_scales) {
+        header.scaling_axis = 1;
+        header.scaling_data_offset = header.data_offset + header.data_size;
+        header.scaling_data_size = _scales->byteSize();
+    }
+
+    stream.write(reinterpret_cast<char*>(&header), sizeof(header));
+    stream.write(pln_blob->buffer().as<char*>(), pln_blob->byteSize());
+
+    if (_scales) {
+        stream.write(_scales->buffer().as<char*>(), _scales->byteSize());
+    }
+}
+
+void BlobDumper::dumpAsTxt(std::ostream &stream) {
+    if (!_blob)
+        THROW_IE_EXCEPTION << "Dumper cannot dump empty Blob";
+
+    if (_blob->buffer().as<float*>() == nullptr)
+        THROW_IE_EXCEPTION << "Dumper cannot dump. Blob is not allocated.";
+
+    SizeVector dims = _blob->getTensorDesc().getDims();
+
+    // Header like "U8 4D shape: 2 3 224 224 (<size>)"
+    stream << _blob->precision().name() << " "
+           << dims.size() << "D "
+           << "shape: ";
+    for (size_t d : dims) stream << d << " ";
+    stream << "(" << _blob->size() << ")" << std::endl;
+
+    // Dump data
+    MKLDNNMemoryDesc mdesc(_blob->getTensorDesc());
+    mkldnn::memory::desc desc = mdesc;
+    mkldnn::impl::memory_desc_wrapper blob_wrp(desc.data);
+
+    size_t data_size = _blob->size();
+    switch (_blob->precision()) {
+        case Precision::FP32: {
+            auto *blob_ptr = _blob->buffer().as<float*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << blob_ptr[blob_wrp.off_l(i)] << std::endl;
+            break;
+        }
+        case Precision::I32: {
+            auto *blob_ptr = _blob->buffer().as<int32_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << blob_ptr[blob_wrp.off_l(i)] << std::endl;
+            break;
+        }
+        case Precision::I16: {
+            auto *blob_ptr = _blob->buffer().as<int16_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        case Precision::U16: {
+            auto *blob_ptr = _blob->buffer().as<uint16_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        case Precision::I8: {
+            auto *blob_ptr = _blob->buffer().as<int8_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        case Precision::U8: {
+            auto *blob_ptr = _blob->buffer().as<uint8_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        default:
+            THROW_IE_EXCEPTION << "Dumper. Unsupported precision";
+    }
+}
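From the formatting code above, the text dump for, say, a 1x2x2x2 FP32 blob of zeros and ones would start roughly like this (exact float formatting depends on the stream's defaults; values here are invented):

```
FP32 4D shape: 1 2 2 2 (8)
0
1
0
1
...
```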
+BlobDumper BlobDumper::read(std::istream &stream) {
+    IEB_HEADER header;
+    stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+
+    TensorDesc desc = parse_header(header);
+    Blob::Ptr blob = make_blob_with_precision(desc);
+    blob->allocate();
+
+    stream.seekg(header.data_offset, stream.beg);
+    stream.read(blob->buffer().as<char*>(), header.data_size);
+
+    BlobDumper res(blob);
+
+    // Parse scales fields.
+    if (header.scaling_axis != NO_SCALES) {
+        if (header.scaling_axis != 1)
+            THROW_IE_EXCEPTION << "Dumper supports scaling only for channel dims.";
+
+        size_t scl_size = header.scaling_data_size / sizeof(float);
+        auto scl = make_blob_with_precision({Precision::FP32, {scl_size}, C});
+        scl->allocate();
+
+        stream.seekg(header.scaling_data_offset, stream.beg);
+        stream.read(scl->buffer().as<char*>(), header.scaling_data_size);
+
+        res._scales = scl;
+    }
+    return res;
+}
+
+BlobDumper BlobDumper::read(const std::string &file_path) {
+    std::ifstream file;
+    file.open(file_path);
+    if (!file.is_open())
+        THROW_IE_EXCEPTION << "Dumper cannot open file " << file_path;
+
+    auto res = read(file);
+    file.close();
+    return res;
+}
+
+void BlobDumper::dump(const std::string &dump_path) {
+    std::ofstream dump_file;
+    dump_file.open(dump_path);
+    if (!dump_file.is_open())
+        THROW_IE_EXCEPTION << "Dumper cannot create dump file";
+
+    dump(dump_file);
+    dump_file.close();
+}
+
+void BlobDumper::dumpAsTxt(const std::string dump_path) {
+    std::ofstream dump_file;
+    dump_file.open(dump_path);
+    if (!dump_file.is_open())
+        THROW_IE_EXCEPTION << "Dumper cannot create dump file";
+
+    dumpAsTxt(dump_file);
+    dump_file.close();
+}
+
+Blob::Ptr BlobDumper::get() {
+    return _blob;
+}
+
+template <typename data_t>
+static void plain_copy(const Blob::Ptr &from, const Blob::Ptr &scls, Blob::Ptr &to) {
+    auto dims = from->getTensorDesc().getDims();
+
+    size_t data_size = from->size();
+    size_t outer_size = dims[0];
+    size_t c_size = dims.size() > 1 ? dims[1] : 1;
+    size_t inner_size = dims.size() == 4 ? dims[2]*dims[3] :
+                        dims.size() == 3 ? dims[2] : 1;
+
+    auto to_data = to->buffer().as<float*>();
+    auto from_data = from->buffer().as<data_t*>();
+
+    if (scls) {
+        auto scls_data = scls->buffer().as<float*>();
+
+        for (size_t o=0; o < outer_size; o++)
+            for (size_t c=0; c < c_size; c++)
+                for (size_t i=0; i < inner_size; i++)
+                    *to_data++ = static_cast<float>(*from_data++) * scls_data[c];
+    } else {
+        for (size_t i=0; i < data_size; i++)
+            *to_data++ = static_cast<float>(*from_data++);
+    }
+}
+
+Blob::Ptr BlobDumper::getRealValue() {
+    if (_blob->precision() == Precision::FP32 && !_scales)
+        return _blob;
+
+    auto res = make_plain_blob(Precision::FP32, _blob->getTensorDesc().getDims());
+    res->allocate();
+
+    switch (_blob->precision()) {
+        case Precision::U8:   plain_copy<uint8_t>(_blob, _scales, res); break;
+        case Precision::FP32: plain_copy<float>(_blob, _scales, res); break;
+        case Precision::I8:   plain_copy<int8_t>(_blob, _scales, res); break;
+        default: THROW_IE_EXCEPTION << "Unsupported precision for getRealValue method.";
+    }
+
+    return res;
+}
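The scaled branch of plain_copy() above is a per-channel dequantization: real[n][c][i] = quantized[n][c][i] * scales[c]. A minimal standalone sketch with made-up values:

```cpp
// Per-channel rescaling: every element of channel c is multiplied
// by scales[c]; shapes and values are invented for illustration.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const size_t N = 1, C = 2, HW = 3;
    std::vector<uint8_t> quantized = {10, 20, 30, 40, 50, 60};  // N*C*HW
    std::vector<float> scales = {0.5f, 0.1f};                   // one per channel
    std::vector<float> real(quantized.size());

    for (size_t n = 0, i = 0; n < N; n++)
        for (size_t c = 0; c < C; c++)
            for (size_t s = 0; s < HW; s++, i++)
                real[i] = static_cast<float>(quantized[i]) * scales[c];

    std::cout << real[0] << " " << real[3] << "\n";  // 5 and 4
    return 0;
}
```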
+BlobDumper& BlobDumper::withScales(InferenceEngine::Blob::Ptr scales) {
+    if (_blob->getTensorDesc().getDims().size() < 2 ||
+        scales->getTensorDesc().getDims().size() != 1 ||
+        scales->getTensorDesc().getDims()[0] != _blob->getTensorDesc().getDims()[1] ||
+        scales->getTensorDesc().getPrecision() != Precision::FP32)
+        THROW_IE_EXCEPTION << "Dumper cannot use passed scales. Blob has incompatible shape.";
+
+    _scales = scales;
+    return *this;
+}
+
+BlobDumper& BlobDumper::withoutScales() {
+    _scales.reset();
+    return *this;
+}
+
+const InferenceEngine::Blob::Ptr& BlobDumper::getScales() const {
+    return _scales;
+}
+
+}  // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/utils/blob_dump.h b/inference-engine/src/mkldnn_plugin/utils/blob_dump.h
new file mode 100644
index 000000000..4130d53a7
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/utils/blob_dump.h
@@ -0,0 +1,50 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_blob.h"
+
+#include <string>
+
+namespace MKLDNNPlugin {
+
+/**
+ * Utility class to dump blob content in plain format.
+ * All layout information is lost.
+ *
+ * For low-precision blobs it allows storing the data
+ * together with per-channel scaling factors.
+ * NB! Channel is the second dimension for all blob types.
+ */
+class BlobDumper {
+    InferenceEngine::Blob::Ptr _blob;
+    InferenceEngine::Blob::Ptr _scales;
+
+public:
+    BlobDumper() = default;
+    BlobDumper(const BlobDumper&) = default;
+    BlobDumper& operator = (BlobDumper&&) = default;
+
+    explicit BlobDumper(const InferenceEngine::Blob::Ptr blob) : _blob(blob) {}
+
+    static BlobDumper read(const std::string &file_path);
+    static BlobDumper read(std::istream &stream);
+
+    void dump(const std::string &file_path);
+    void dump(std::ostream &stream);
+
+    void dumpAsTxt(const std::string file_path);
+    void dumpAsTxt(std::ostream &stream);
+
+    BlobDumper& withScales(InferenceEngine::Blob::Ptr scales);
+    BlobDumper& withoutScales();
+
+    const InferenceEngine::Blob::Ptr& getScales() const;
+
+    InferenceEngine::Blob::Ptr get();
+    InferenceEngine::Blob::Ptr getRealValue();
+};
+
+}  // namespace MKLDNNPlugin
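A hedged usage sketch of the API declared above (not part of the commit; assumes an allocated InferenceEngine blob and a writable working directory):

```cpp
// Dump a blob in both IEB and text form, then read it back and
// recover FP32 values (applying stored per-channel scales, if any).
#include "blob_dump.h"

void dump_example(const InferenceEngine::Blob::Ptr &blob) {
    using MKLDNNPlugin::BlobDumper;

    BlobDumper dumper(blob);
    dumper.dump("blob.ieb");       // binary IEB file
    dumper.dumpAsTxt("blob.txt");  // human-readable text file

    // Round-trip through the IEB file.
    BlobDumper restored = BlobDumper::read("blob.ieb");
    InferenceEngine::Blob::Ptr real = restored.getRealValue();
}
```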