author    | Alexey Suhov <asuhov@users.noreply.github.com> | 2019-01-21 21:31:31 +0300
committer | openvino-pushbot <44090433+openvino-pushbot@users.noreply.github.com> | 2019-01-21 21:31:31 +0300
commit    | 9de27f16bc8b712a5b8c99d1d4b4a66c9144942d (patch)
tree      | 01a383efe94d92b9870d513c2c5ea5d15b07010a /inference-engine/src/mkldnn_plugin
parent    | fbc7a4a710c24def8ab199926a7da90a0394b87d (diff)
download  | dldt-9de27f16bc8b712a5b8c99d1d4b4a66c9144942d.tar.gz
          | dldt-9de27f16bc8b712a5b8c99d1d4b4a66c9144942d.tar.bz2
          | dldt-9de27f16bc8b712a5b8c99d1d4b4a66c9144942d.zip
Publishing R5 content (#72)
* Publishing R5 content
* Updated ade revision
* Updated README
* Added the possibility to build the CPU plugin with the Intel MKL package
Diffstat (limited to 'inference-engine/src/mkldnn_plugin')
97 files changed, 2794 insertions, 1568 deletions
diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
index 79551f636..5997f7d4b 100644
--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 set(TARGET_NAME "MKLDNNPlugin")
 
 if (UNIX AND NOT APPLE)
@@ -25,9 +26,7 @@ file(GLOB HEADERS
 
 addVersionDefines(mkldnn_plugin.cpp CI_BUILD_NUMBER MKL_VERSION)
 
-if(WIN32)
-    add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
-endif()
+add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
 
 include_directories(
     ${IE_MAIN_SOURCE_DIR}/include
@@ -38,39 +37,30 @@ include_directories(
     ${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/include
 )
 
+if (GEMM STREQUAL "MKL")
+    log_rpath_from_dir(MKL "${MKL}/lib")
+endif()
+
 add_library(${TARGET_NAME} SHARED ${SOURCES} ${HEADERS})
+set_ie_threading_interface_for(${TARGET_NAME})
 
 if (THREADING STREQUAL "TBB")
-    target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_TBB -DMKLDNN_THR=MKLDNN_THR_TBB)
-    target_include_directories(${TARGET_NAME} PUBLIC ${TBB_INCLUDE_DIRS})
-    target_link_libraries(${TARGET_NAME} debug ${TBB_LIBRARIES_RELEASE} optimized ${TBB_LIBRARIES_RELEASE})
+    set(MKLDNN_THR MKLDNN_THR_TBB)
 elseif (THREADING STREQUAL "OMP")
-    target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_OMP -DMKLDNN_THR=MKLDNN_THR_OMP)
-    enable_omp()
-    if(ENABLE_INTEL_OMP)
-        target_link_libraries(${TARGET_NAME} ${intel_omp_lib})
-    endif()
+    set(MKLDNN_THR MKLDNN_THR_OMP)
 else()
-    target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_SEQ -DMKLDNN_THR=MKLDNN_THR_SEQ)
+    set(MKLDNN_THR MKLDNN_THR_SEQ)
 endif()
 
-target_link_libraries(${TARGET_NAME} inference_engine ${INTEL_ITT_LIBS} mkldnn)
+target_compile_definitions(${TARGET_NAME} PUBLIC -DMKLDNN_THR=${MKLDNN_THR})
+target_link_libraries(${TARGET_NAME} PRIVATE inference_engine ${INTEL_ITT_LIBS} mkldnn)
+
 set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME})
 
 add_library(test_${TARGET_NAME} STATIC ${SOURCES} ${HEADERS})
+set_ie_threading_interface_for(test_${TARGET_NAME})
 
-if (THREADING STREQUAL "TBB")
-    target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_TBB -DMKLDNN_THR=MKLDNN_THR_TBB)
-    target_include_directories(test_${TARGET_NAME} PUBLIC ${TBB_INCLUDE_DIRS})
-    target_link_libraries(test_${TARGET_NAME} debug ${TBB_LIBRARIES_RELEASE} optimized ${TBB_LIBRARIES_RELEASE})
-elseif (THREADING STREQUAL "OMP")
-    target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_OMP -DMKLDNN_THR=MKLDNN_THR_OMP)
-    if(ENABLE_INTEL_OMP)
-        target_link_libraries(test_${TARGET_NAME} ${intel_omp_lib})
-    endif()
-else()
-    target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_SEQ -DMKLDNN_THR=MKLDNN_THR_SEQ)
-endif()
+target_compile_definitions(test_${TARGET_NAME} PUBLIC -DMKLDNN_THR=${MKLDNN_THR})
+target_link_libraries(test_${TARGET_NAME} PRIVATE inference_engine_s mkldnn)
 
-target_link_libraries(test_${TARGET_NAME} inference_engine_s mkldnn)
 set_target_properties(test_${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME test_${TARGET_NAME})

diff --git a/inference-engine/src/mkldnn_plugin/config.cpp b/inference-engine/src/mkldnn_plugin/config.cpp
index 57c8dc9c8..4ef10eec2 100644
--- a/inference-engine/src/mkldnn_plugin/config.cpp
+++ b/inference-engine/src/mkldnn_plugin/config.cpp
@@ -1,16 +1,23 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
+// avoiding clash of the "max" macro with std::max
+#define NOMINMAX
+
 #include "config.h"
 #include "ie_plugin_config.hpp"
 #include "ie_common.h"
 
 #include <string>
+#include <cstring>
 #include <map>
 #include <algorithm>
+#include <stdexcept>
+
 #include <cpp_interfaces/exception2status.hpp>
+#include <thread>
+#include "mkldnn/omp_manager.h"
 
 namespace MKLDNNPlugin {
 
@@ -44,6 +51,42 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
         else
             THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_EXCLUSIVE_ASYNC_REQUESTS
                                << ". Expected only YES/NO";
+    } else if (key == PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS) {
+        if (val == PluginConfigParams::CPU_THROUGHPUT_NUMA) {
+            throughputStreams = MKLDNNPlugin::cpu::getNumberOfCPUSockets();
+        } else if (val == PluginConfigParams::CPU_THROUGHPUT_AUTO) {
+            // bare minimum of streams (that evenly divides available number of core)
+            const int num_cores = std::thread::hardware_concurrency();
+            if (0 == num_cores % 4)
+                throughputStreams = std::max(4, num_cores / 4);
+            else if (0 == num_cores % 5)
+                throughputStreams = std::max(5, num_cores / 5);
+            else if (0 == num_cores % 3)
+                throughputStreams = std::max(3, num_cores / 3);
+            else  // if user disables some cores say in BIOS, so we got weird #cores which is not easy to divide
+                throughputStreams = 1;
+        } else {
+            int val_i;
+            try {
+                val_i = std::stoi(val);
+            } catch (const std::exception&) {
+                THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS
+                                   << ". Expected only positive numbers (#streams) or "
+                                   << "PluginConfigParams::CPU_THROUGHPUT_NUMA/CPU_THROUGHPUT_AUTO";
+            }
+            if (val_i > 0)
+                throughputStreams = val_i;
+        }
+    } else if (key == PluginConfigParams::KEY_CPU_THREADS_NUM) {
+        int val_i;
+        try {
+            val_i = std::stoi(val);
+        } catch (const std::exception&) {
+            THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_CPU_THREADS_NUM
+                               << ". Expected only positive numbers (#threads)";
+        }
+        if (val_i > 0)
+            threadsNum = val_i;
     } else if (key.compare(PluginConfigParams::KEY_DYN_BATCH_ENABLED) == 0) {
         if (val.compare(PluginConfigParams::YES) == 0)
             enableDynamicBatch = true;
@@ -52,10 +95,15 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
         else
             THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_DYN_BATCH_ENABLED
                                << ". Expected only YES/NO";
+    } else if (key.compare(PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT) == 0) {
+        // empty string means that dumping is switched off
+        dumpToDot = val;
     } else {
         THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported property " << key << " by CPU plugin";
     }
     }
+    if (exclusiveAsyncRequests)  // Exclusive request feature disables the streams
+        throughputStreams = 1;
 }
 
 }  // namespace MKLDNNPlugin
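For reference, the CPU_THROUGHPUT_AUTO branch added above implements a small divisibility heuristic for picking a stream count. The sketch below restates it outside the plugin; pickAutoStreams and the driver are illustrative names, not part of the patch:

    #include <algorithm>
    #include <cstdio>
    #include <thread>

    // Mirrors the CPU_THROUGHPUT_AUTO branch: if the core count divides by 4
    // (then 5, then 3), run num_cores/N streams but never fewer than N;
    // awkward core counts (e.g. cores disabled in BIOS) fall back to 1 stream.
    static int pickAutoStreams(int num_cores) {
        if (num_cores % 4 == 0) return std::max(4, num_cores / 4);
        if (num_cores % 5 == 0) return std::max(5, num_cores / 5);
        if (num_cores % 3 == 0) return std::max(3, num_cores / 3);
        return 1;
    }

    int main() {
        for (int cores : {6, 7, 8, 10, 12, 56})  // -> 3, 1, 4, 5, 4, 14
            std::printf("%2d cores -> %d streams\n", cores, pickAutoStreams(cores));
        std::printf("this host -> %d streams\n",
                    pickAutoStreams(static_cast<int>(std::thread::hardware_concurrency())));
        return 0;
    }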
diff --git a/inference-engine/src/mkldnn_plugin/config.h b/inference-engine/src/mkldnn_plugin/config.h
index 0bb390c51..558ac87ae 100644
--- a/inference-engine/src/mkldnn_plugin/config.h
+++ b/inference-engine/src/mkldnn_plugin/config.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -15,7 +14,10 @@ struct Config {
     bool collectPerfCounters = false;
     bool exclusiveAsyncRequests = false;
     bool enableDynamicBatch = false;
+    std::string dumpToDot = "";
     int batchLimit = 0;
+    int throughputStreams = 1;
+    int threadsNum = 0;
 
     void readProperties(const std::map<std::string, std::string> &config);
 };

diff --git a/inference-engine/src/mkldnn_plugin/mean_image.cpp b/inference-engine/src/mkldnn_plugin/mean_image.cpp
index ff87e1466..f1ac17e9a 100644
--- a/inference-engine/src/mkldnn_plugin/mean_image.cpp
+++ b/inference-engine/src/mkldnn_plugin/mean_image.cpp
@@ -1,10 +1,10 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "mean_image.h"
 #include "ie_parallel.hpp"
+#include "ie_memcpy.h"
 
 using namespace MKLDNNPlugin;
 using namespace InferenceEngine;
@@ -54,7 +54,8 @@ void MeanImage::Load(const MKLDNNDims& inputDims, InputInfo::Ptr inputInfo) {
                 THROW_IE_EXCEPTION << "mean image size does not match expected network input, expecting " << meanWidth << " x " << meanHeight;
             }
             // todo: cast to TBlob and make sure it is floats
-            memcpy(meanBuffer->data() + channel*meanBlob->size(), meanBlob->buffer(), meanBlob->byteSize());
+            ie_memcpy(meanBuffer->data() + channel*meanBlob->size(), meanBuffer->byteSize() - channel*meanBlob->byteSize(),
+                      meanBlob->buffer(), meanBlob->byteSize());
         }
     }
     break;

diff --git a/inference-engine/src/mkldnn_plugin/mean_image.h b/inference-engine/src/mkldnn_plugin/mean_image.h
index c27d667e2..24dc8163a 100644
--- a/inference-engine/src/mkldnn_plugin/mean_image.h
+++ b/inference-engine/src/mkldnn_plugin/mean_image.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
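The mean_image.cpp hunk above replaces a raw memcpy with ie_memcpy, which also receives the remaining destination capacity. Below is a minimal sketch of a bounds-checked copy with the (dst, dst_size, src, count) argument order seen at the call site; the real ie_memcpy declared in ie_memcpy.h may differ in naming and return convention:

    #include <cstddef>
    #include <cstring>

    // Copies count bytes only when they fit into the destination; returns 0 on
    // success and -1 instead of silently overrunning the buffer.
    static int checked_memcpy(void* dst, size_t dst_size, const void* src, size_t count) {
        if (dst == nullptr || src == nullptr || count > dst_size)
            return -1;
        std::memcpy(dst, src, count);
        return 0;
    }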
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h
index 3bfae03a0..09ec76c42 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h
index 0643d99a7..b3ad3c0c5 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h
index 9cc57f3df..616f517aa 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp b/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp
index d0b911799..57b6edc35 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -13,10 +12,6 @@
 
 namespace mkldnn {
 
-template <> struct handle_traits<mkldnn_primitive_desc_iterator_t> {
-    static constexpr auto destructor = &mkldnn_primitive_desc_iterator_destroy;
-};
-
 struct primitive_desc_iterator : public handle<mkldnn_primitive_desc_iterator_t> {
     template <typename T>
     primitive_desc_iterator(const T &adesc, const mkldnn::primitive_attr &aattr, const engine &aengine) {

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp
index 834f8bd6e..ff3616a44 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -33,6 +32,7 @@ impl_desc_type MKLDNNPlugin::parse_impl_name(std::string impl_desc_name) {
         res = static_cast<impl_desc_type>(res | impl_desc_type::_key);
 
     SEARCH_WORD_2(nchw, ref);
+    SEARCH_WORD_2(ncdhw, ref);
     SEARCH_WORD_2(wino, winograd);
 #undef SEARCH_WORD_2

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h
index 75a618927..45cca0402 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp
new file mode 100644
index 000000000..19bc513f6
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp
@@ -0,0 +1,47 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cstdlib>
+#include <cstring>
+#include "ie_parallel.hpp"
+#include "omp_manager.h"
+
+using namespace MKLDNNPlugin;
+namespace MKLDNNPlugin {
+namespace cpu {
+
+static const char *openMpEnvVars[] = {
+    "OMP_CANCELLATION", "OMP_DISPLAY_ENV", "OMP_DEFAULT_DEVICE", "OMP_DYNAMIC",
+    "OMP_MAX_ACTIVE_LEVELS", "OMP_MAX_TASK_PRIORITY", "OMP_NESTED",
+    "OMP_NUM_THREADS", "OMP_PROC_BIND", "OMP_PLACES", "OMP_STACKSIZE",
+    "OMP_SCHEDULE", "OMP_THREAD_LIMIT", "OMP_WAIT_POLICY", "GOMP_CPU_AFFINITY",
+    "GOMP_DEBUG", "GOMP_STACKSIZE", "GOMP_SPINCOUNT", "GOMP_RTEMS_THREAD_POOLS",
+    "KMP_AFFINITY", "KMP_NUM_THREADS", "MIC_KMP_AFFINITY",
+    "MIC_OMP_NUM_THREADS", "MIC_OMP_PROC_BIND", "PHI_KMP_AFFINITY",
+    "PHI_OMP_NUM_THREADS", "PHI_KMP_PLACE_THREADS", "MKL_NUM_THREADS",
+    "MKL_DYNAMIC", "MKL_DOMAIN_NUM_THREADS"
+};
+
+static const unsigned numberOfOpenMpEnvVars =
+    sizeof(openMpEnvVars) / sizeof(openMpEnvVars[0]);
+
+bool checkOpenMpEnvVars(bool includeOMPNumThreads) {
+    for (unsigned i = 0; i < numberOfOpenMpEnvVars; i++) {
+        if (getenv(openMpEnvVars[i])) {
+            if (0 != strcmp(openMpEnvVars[i], "OMP_NUM_THREADS") || includeOMPNumThreads)
+                return true;
+        }
+    }
+    return false;
+}
+
+#if !(defined(__APPLE__) || defined(_WIN32))
+// getNumberOfCPUSockets/getNumberOfCPUCores are implemented in the lin_omp_manager.cpp
+#else
+int getNumberOfCPUSockets() {return 1;}
+int getNumberOfCPUCores() {return parallel_get_max_threads();}
+#endif
+
+}  // namespace cpu
+}  // namespace MKLDNNPlugin

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h
index 26cba003e..65cc216e4 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -9,10 +8,15 @@
 */
 #pragma once
 
-#ifdef _WIN32
-    #include "mkldnn/os/win/win_omp_manager.h"
-#elif defined(__APPLE__)
-    #include "mkldnn/os/osx/osx_omp_manager.h"
-#else
-    #include "mkldnn/os/lin/lin_omp_manager.h"
-#endif
+namespace MKLDNNPlugin {
+namespace cpu {
+
+bool checkOpenMpEnvVars(bool includeOMPNumThreads = true);
+// numbers of CPU sockets in the machine (on Linux), 1 on all other OSes
+int getNumberOfCPUSockets();
+// numbers of CPU physical cores on Linux (which is considered to be more performance friendly for servers)
+// (on other OSes it simply relies on the original parallel API of choice, which usually use the logical cores )
+int getNumberOfCPUCores();
+
+}  // namespace cpu
+}  // namespace MKLDNNPlugin
\ No newline at end of file
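Taken together, omp_manager.{h,cpp} above now expose a small portable CPU-topology API. A hypothetical caller would use it as follows (the driver is illustrative only and assumes it is built inside the plugin tree so the header resolves):

    #include <cstdio>
    #include "mkldnn/omp_manager.h"

    int main() {
        using namespace MKLDNNPlugin::cpu;
        // Passing false ignores OMP_NUM_THREADS alone, i.e. asks whether the
        // user has customized anything beyond the plain thread count.
        if (checkOpenMpEnvVars(false))
            std::printf("threading already tuned via environment variables\n");
        std::printf("CPU sockets: %d, physical cores: %d\n",
                    getNumberOfCPUSockets(), getNumberOfCPUCores());
        return 0;
    }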
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp
index 75f2e4c6b..14c3e1d1d 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp
@@ -1,10 +1,8 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "lin_omp_manager.h"
-#include "ie_parallel.hpp"
 #include <fstream>
 #include <set>
 #include <string>
@@ -19,20 +17,13 @@ namespace cpu {
 Processor::Processor() {
     processor = 0;
     physicalId = 0;
-    siblings = 0;
-    coreId = 0;
     cpuCores = 0;
-    speedMHz = 0;
 }
 
 CpuInfo::CpuInfo() {
     loadContentFromFile("/proc/cpuinfo");
 }
 
-CpuInfo::CpuInfo(const char *content) {
-    loadContent(content);
-}
-
 void CpuInfo::loadContentFromFile(const char *fileName) {
     std::ifstream file(fileName);
     std::string content(
@@ -98,10 +89,6 @@ Collection::Collection(CpuInfoInterface *cpuInfo) : cpuInfo(*cpuInfo) {
     collectBasicCpuInformation();
 }
 
-unsigned Collection::getProcessorSpeedMHz() {
-    return processors.size() ? processors[0].speedMHz : 0;
-}
-
 unsigned Collection::getTotalNumberOfSockets() {
     return totalNumberOfSockets;
 }
@@ -114,10 +101,6 @@ unsigned Collection::getNumberOfProcessors() {
     return processors.size();
 }
 
-const Processor &Collection::getProcessor(unsigned processorId) {
-    return processors[processorId];
-}
-
 void Collection::parseCpuInfo() {
     const char *cpuInfoLine = cpuInfo.getFirstLine();
     for (; cpuInfoLine; cpuInfoLine = cpuInfo.getNextLine()) {
@@ -148,21 +131,9 @@ void Collection::parseValue(const char *fieldName, const char *valueString) {
         currentProcessor->physicalId = parseInteger(valueString);
     }
 
-    if (beginsWith(fieldName, "siblings")) {
-        currentProcessor->siblings = parseInteger(valueString);
-    }
-
-    if (beginsWith(fieldName, "core id")) {
-        currentProcessor->coreId = parseInteger(valueString);
-    }
-
     if (beginsWith(fieldName, "cpu cores")) {
         currentProcessor->cpuCores = parseInteger(valueString);
     }
-
-    if (beginsWith(fieldName, "model name")) {
-        currentProcessor->speedMHz = extractSpeedFromModelName(valueString);
-    }
 }
 
 void Collection::appendNewProcessor() {
@@ -184,32 +155,6 @@ unsigned Collection::parseInteger(const char *text) const {
     return atol(text);
 }
 
-/* Function extracts CPU speed from model name. If unit is not set it is
-   assumed that values below 100 are specified in GHz, otherwise MHz */
-unsigned Collection::extractSpeedFromModelName(const char *text) const {
-    text = strstr(text, "@");
-    if (!text) {
-        return 0;
-    }
-
-    char *unit;
-    double speed = strtod(&text[1], &unit);
-
-    while (isspace(*unit)) {
-        unit++;
-    }
-
-    bool isMHz = !strncmp(unit, "MHz", 3);
-    bool isGHz = !strncmp(unit, "GHz", 3);
-    bool isGHzPossible = (speed < 100);
-
-    if (isGHz || (isGHzPossible && !isMHz)) {
-        return 1000 * speed + 0.5;
-    } else {
-        return speed + 0.5;
-    }
-}
-
 void Collection::collectBasicCpuInformation() {
     std::set<unsigned> uniquePhysicalId;
     std::vector<Processor>::iterator processor = processors.begin();
@@ -229,120 +174,27 @@ void Collection::updateCpuInformation(const Processor &processor,
     totalNumberOfCpuCores += processor.cpuCores;
 }
 
-
-/* The OpenMpManager class is responsible for determining a set of all of
-   available CPU cores and delegating each core to perform other tasks. The
-   first of available cores is delegated for background threads, while other
-   remaining cores are dedicated for OpenMP threads. Each OpenMP thread owns
-   one core for exclusive use. The number of OpenMP threads is then limited
-   to the number of available cores minus one. The amount of CPU cores may
-   be limited by system eg. when numactl was used. */
 #include <sched.h>
 
-static const char *openMpEnvVars[] = {
-    "OMP_CANCELLATION", "OMP_DISPLAY_ENV", "OMP_DEFAULT_DEVICE", "OMP_DYNAMIC",
-    "OMP_MAX_ACTIVE_LEVELS", "OMP_MAX_TASK_PRIORITY", "OMP_NESTED",
-    "OMP_NUM_THREADS", "OMP_PROC_BIND", "OMP_PLACES", "OMP_STACKSIZE",
-    "OMP_SCHEDULE", "OMP_THREAD_LIMIT", "OMP_WAIT_POLICY", "GOMP_CPU_AFFINITY",
-    "GOMP_DEBUG", "GOMP_STACKSIZE", "GOMP_SPINCOUNT", "GOMP_RTEMS_THREAD_POOLS",
-    "KMP_AFFINITY", "KMP_NUM_THREADS", "MIC_KMP_AFFINITY",
-    "MIC_OMP_NUM_THREADS", "MIC_OMP_PROC_BIND", "PHI_KMP_AFFINITY",
-    "PHI_OMP_NUM_THREADS", "PHI_KMP_PLACE_THREADS", "MKL_NUM_THREADS",
-    "MKL_DYNAMIC", "MKL_DOMAIN_NUM_THREADS"
-};
-
-static const unsigned numberOfOpenMpEnvVars =
-    sizeof(openMpEnvVars) / sizeof(openMpEnvVars[0]);
-
-OpenMpManager::OpenMpManager(Collection *collection) :
-    collection(*collection), isGpuEnabled(false) {
-    getOpenMpEnvVars();
-    getCurrentCpuSet();
-    getCurrentCoreSet();
-}
-
-OpenMpManager &OpenMpManager::getInstance() {
+int getNumberOfCPUSockets() {
     static CpuInfo cpuInfo;
     static Collection collection(&cpuInfo);
-    static OpenMpManager openMpManager(&collection);
-    return openMpManager;
-}
-
-void OpenMpManager::setGpuEnabled() {
-    OpenMpManager &openMpManager = getInstance();
-    openMpManager.isGpuEnabled = true;
-}
-
-void OpenMpManager::setGpuDisabled() {
-    OpenMpManager &openMpManager = getInstance();
-    openMpManager.isGpuEnabled = false;
-}
-
-// Ideally bind given thread to secondary logical core, if
-// only one thread exists then bind to primary one
-void OpenMpManager::bindCurrentThreadToNonPrimaryCoreIfPossible() {
-    OpenMpManager &openMpManager = getInstance();
-    if (openMpManager.isThreadsBindAllowed()) {
-        int totalNumberOfAvailableCores = CPU_COUNT(&openMpManager.currentCoreSet);
-        int logicalCoreToBindTo = totalNumberOfAvailableCores > 1 ? 1 : 0;
-        openMpManager.bindCurrentThreadToLogicalCoreCpus(logicalCoreToBindTo);
-    }
+    return collection.getTotalNumberOfSockets();
 }
 
-void OpenMpManager::bindOpenMpThreads(int env_cores) {
-    OpenMpManager &openMpManager = getInstance();
-
-    if (!openMpManager.isThreadsBindAllowed())
-        return;
-
-    openMpManager.setOpenMpThreadNumberLimit(env_cores);
-    InferenceEngine::parallel_nt(0, [&] (unsigned logicalCoreId, int nthr) {
-        openMpManager.bindCurrentThreadToLogicalCoreCpu(logicalCoreId);
-    });
-}
-
-int OpenMpManager::getOpenMpThreadNumber() {
-    OpenMpManager &openMpManager = getInstance();
-
-    return openMpManager.getCoreNumber();
-}
-
-
-void OpenMpManager::getOpenMpEnvVars() {
-    isAnyOpenMpEnvVarSpecified = false;
-    for (unsigned i = 0; i < numberOfOpenMpEnvVars; i++) {
-        if (getenv(openMpEnvVars[i])) {
-            isAnyOpenMpEnvVarSpecified = true;
-        }
-    }
-}
-
-void OpenMpManager::getCurrentCpuSet() {
-    if (sched_getaffinity(0, sizeof(currentCpuSet), &currentCpuSet)) {
-        getDefaultCpuSet(&currentCpuSet);
-    }
-}
-
-void OpenMpManager::getDefaultCpuSet(cpu_set_t *defaultCpuSet) {
-    CPU_ZERO(defaultCpuSet);
-    unsigned numberOfProcessors = collection.getNumberOfProcessors();
-    for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
-        CPU_SET(processorId, defaultCpuSet);
-    }
-}
-
-/* Function getCurrentCoreSet() fills currentCoreSet variable with a set of
-   available CPUs, where only one CPU per core is chosen. When multiple CPUs
-   of single core are used, function is selecting only first one of all
-   available. */
-void OpenMpManager::getCurrentCoreSet() {
+int getNumberOfCPUCores() {
+    static CpuInfo cpuInfo;
+    static Collection collection(&cpuInfo);
     unsigned numberOfProcessors = collection.getNumberOfProcessors();
     unsigned totalNumberOfCpuCores = collection.getTotalNumberOfCpuCores();
 
-    cpu_set_t usedCoreSet;
+    cpu_set_t usedCoreSet, currentCoreSet, currentCpuSet;
+    CPU_ZERO(&currentCpuSet);
     CPU_ZERO(&usedCoreSet);
     CPU_ZERO(&currentCoreSet);
 
+    sched_getaffinity(0, sizeof(currentCpuSet), &currentCpuSet);
+
     for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
         if (CPU_ISSET(processorId, &currentCpuSet)) {
             unsigned coreId = processorId % totalNumberOfCpuCores;
@@ -352,70 +204,9 @@ void OpenMpManager::getCurrentCoreSet() {
             }
         }
     }
-}
-
-void OpenMpManager::selectAllCoreCpus(cpu_set_t *set, unsigned physicalCoreId) {
-    unsigned numberOfProcessors = collection.getNumberOfProcessors();
-    unsigned totalNumberOfCpuCores = collection.getTotalNumberOfCpuCores();
-
-    int processorId = physicalCoreId % totalNumberOfCpuCores;
-    while (processorId < numberOfProcessors) {
-        if (CPU_ISSET(processorId, &currentCpuSet)) {
-            CPU_SET(processorId, set);
-        }
-
-        processorId += totalNumberOfCpuCores;
-    }
-}
-
-unsigned OpenMpManager::getPhysicalCoreId(unsigned logicalCoreId) {
-    unsigned numberOfProcessors = collection.getNumberOfProcessors();
-
-    for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
-        if (CPU_ISSET(processorId, &currentCoreSet)) {
-            if (!logicalCoreId--) {
-                return processorId;
-            }
-        }
-    }
-
-    std::cerr << "This should never happen!";
-    return 0;
-}
-
-bool OpenMpManager::isThreadsBindAllowed() {
-    return !isAnyOpenMpEnvVarSpecified && !isGpuEnabled;
-}
-
-// Limit of threads to number of logical cores available
-void OpenMpManager::setOpenMpThreadNumberLimit(int env_cores) {
-    parallel_set_num_threads(env_cores == 0 ? CPU_COUNT(&currentCoreSet) : 0);
-}
-
-int OpenMpManager::getCoreNumber() {
     return CPU_COUNT(&currentCoreSet);
 }
 
-void OpenMpManager::bindCurrentThreadToLogicalCoreCpu(unsigned logicalCoreId) {
-    unsigned physicalCoreId = getPhysicalCoreId(logicalCoreId);
-#if IE_THREAD == IE_THREAD_OMP
-    cpu_set_t set;
-    CPU_ZERO(&set);
-    CPU_SET(physicalCoreId, &set);
-    sched_setaffinity(0, sizeof(set), &set);
-#endif
-}
-
-void OpenMpManager::bindCurrentThreadToLogicalCoreCpus(unsigned logicalCoreId) {
-    unsigned physicalCoreId = getPhysicalCoreId(logicalCoreId);
-#if IE_THREAD == IE_THREAD_OMP
-    cpu_set_t set;
-    CPU_ZERO(&set);
-    selectAllCoreCpus(&set, physicalCoreId);
-    sched_setaffinity(0, sizeof(set), &set);
-#endif
-}
-
 #endif  // #ifndef APPLE
 
 }  // namespace cpu
 }  // namespace MKLDNNPlugin

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h
index d39329a6b..dfd69bbb4 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -20,10 +19,7 @@ namespace cpu {
 struct Processor {
     unsigned processor;
     unsigned physicalId;
-    unsigned siblings;
-    unsigned coreId;
     unsigned cpuCores;
-    unsigned speedMHz;
 
     Processor();
 };
@@ -41,8 +37,6 @@ class CpuInfo : public CpuInfoInterface {
 public:
     CpuInfo();
 
-    explicit CpuInfo(const char *content);
-
     virtual ~CpuInfo();
 
     virtual const char *getFirstLine();
@@ -64,32 +58,17 @@ private:
 class CollectionInterface {
 public:
     virtual ~CollectionInterface() {}
-
-    virtual unsigned getProcessorSpeedMHz() = 0;
-
     virtual unsigned getTotalNumberOfSockets() = 0;
-
-    virtual unsigned getTotalNumberOfCpuCores() = 0;
-
-    virtual unsigned getNumberOfProcessors() = 0;
-
-    virtual const Processor &getProcessor(unsigned processorId) = 0;
 };
 
 class Collection : public CollectionInterface {
 public:
     explicit Collection(CpuInfoInterface *cpuInfo);
 
-    virtual unsigned getProcessorSpeedMHz();
-
     virtual unsigned getTotalNumberOfSockets();
-
     virtual unsigned getTotalNumberOfCpuCores();
-
     virtual unsigned getNumberOfProcessors();
 
-    virtual const Processor &getProcessor(unsigned processorId);
-
 private:
     CpuInfoInterface &cpuInfo;
     unsigned totalNumberOfSockets;
@@ -113,70 +92,11 @@ private:
 
     unsigned parseInteger(const char *text) const;
 
-    unsigned extractSpeedFromModelName(const char *text) const;
-
     void collectBasicCpuInformation();
 
     void updateCpuInformation(const Processor &processor,
                               unsigned numberOfUniquePhysicalId);
 };
-
-
-class OpenMpManager {
-public:
-    static void setGpuEnabled();
-
-    static void setGpuDisabled();
-
-    static void bindCurrentThreadToNonPrimaryCoreIfPossible();
-
-    static void bindOpenMpThreads(int env_cores = 0);
-
-    static int getOpenMpThreadNumber();
-
-    static void printVerboseInformation();
-
-    static bool isMajorThread(int currentThread);
-
-private:
-    Collection &collection;
-
-    bool isGpuEnabled;
-    bool isAnyOpenMpEnvVarSpecified;
-    cpu_set_t currentCpuSet;
-    cpu_set_t currentCoreSet;
-
-    explicit OpenMpManager(Collection *collection);
-
-    OpenMpManager(const OpenMpManager &openMpManager);
-
-    OpenMpManager &operator=(const OpenMpManager &openMpManager);
-
-    static OpenMpManager &getInstance();
-
-    void getOpenMpEnvVars();
-
-    void getCurrentCpuSet();
-
-    int getCoreNumber();
-
-    void getDefaultCpuSet(cpu_set_t *defaultCpuSet);
-
-    void getCurrentCoreSet();
-
-    void selectAllCoreCpus(cpu_set_t *set, unsigned physicalCoreId);
-
-    unsigned getPhysicalCoreId(unsigned logicalCoreId);
-
-    bool isThreadsBindAllowed();
-
-    void setOpenMpThreadNumberLimit(int env_cores);
-
-    void bindCurrentThreadToLogicalCoreCpu(unsigned logicalCoreId);
-
-    void bindCurrentThreadToLogicalCoreCpus(unsigned logicalCoreId);
-};
-
 #endif  // #ifndef __APPLE__
 
 }  // namespace cpu
 }  // namespace MKLDNNPlugin

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h
deleted file mode 100644
index 0484bb571..000000000
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-/**
-* @brief WINAPI based code
-* @file win_omp_manager.h
-*/
-
-#pragma once
-
-#include <thread>
-#include <vector>
-
-namespace MKLDNNPlugin {
-namespace cpu {
-
-class OpenMpManager {
-public:
-    static int getOpenMpThreadNumber() {
-        return getCoreNumber();
-    }
-
-    static int getCoreNumber() {
-        return 4;
-    }
-};
-
-}  // namespace cpu
-}  // namespace MKLDNNPlugin
-

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h
deleted file mode 100644
index d59891679..000000000
--- a/inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-/**
-* @brief WINAPI based code
-* @file win_omp_manager.h
-*/
-
-#pragma once
-
-#include <thread>
-#include <vector>
-#include <windows.h>
-
-namespace MKLDNNPlugin {
-namespace cpu {
-
-class OpenMpManager {
-public:
-    static int getOpenMpThreadNumber() {
-        return getCoreNumber();
-    }
-
-    static int getCoreNumber() {
-        int num_cores = std::thread::hardware_concurrency();
-        unsigned long size = 0;
-
-        if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &size)) {
-            if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
-                std::vector<char> buf(size);
-                SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* info
-                        = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(&buf.front());
-                SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* ptr = info;
-                if (GetLogicalProcessorInformationEx(RelationProcessorCore, info, &size)) {
-                    if (GetLastError() == ERROR_SUCCESS) {
-                        int num = 0;
-                        unsigned long offset = 0;
-                        while (offset < size) {
-                            num++;
-                            offset += ptr->Size;
-                            ptr = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(
-                                    reinterpret_cast<byte*>(ptr) + ptr->Size);
-                        }
-                        num_cores = num;
-                    }
-                }
-            }
-        }
-        return num_cores;
-    }
-};
-
-}  // namespace cpu
-}  // namespace MKLDNNPlugin
-
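The Collection code kept above derives socket and core counts from /proc/cpuinfo by counting unique "physical id" values and summing each package's "cpu cores" entry once. A self-contained approximation of that approach (simplified sketch: it assumes each processor block in /proc/cpuinfo ends with a blank line):

    #include <fstream>
    #include <iostream>
    #include <set>
    #include <string>

    int main() {
        std::ifstream cpuinfo("/proc/cpuinfo");
        std::set<int> sockets;      // unique "physical id" values seen so far
        int physicalId = -1, cores = 0, totalCores = 0;
        std::string line;
        while (std::getline(cpuinfo, line)) {
            if (line.compare(0, 11, "physical id") == 0)
                physicalId = std::stoi(line.substr(line.find(':') + 1));
            else if (line.compare(0, 9, "cpu cores") == 0)
                cores = std::stoi(line.substr(line.find(':') + 1));
            else if (line.empty() && physicalId >= 0) {
                // end of one processor block: count each package exactly once
                if (sockets.insert(physicalId).second)
                    totalCores += cores;
                physicalId = -1;
            }
        }
        std::cout << "sockets: " << sockets.size()
                  << ", physical cores: " << totalCores << std::endl;
        return 0;
    }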
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp
index e117182dd..ea463a2d5 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h
index be18a41e2..447787f88 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
index 9ea3fe38c..bcb47419e 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
index 51a29c21e..dff072089 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_dims.h b/inference-engine/src/mkldnn_plugin/mkldnn_dims.h
index f707f268c..06616a8be 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_dims.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_dims.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
index 102955fbb..92c8c5ad3 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_edge.h b/inference-engine/src/mkldnn_plugin/mkldnn_edge.h
index 91c586b4a..f5364f614 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp
index c9ca08ab2..b362433eb 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h
index bd1e0d8d5..f3abd8b4a 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
index 681061a4d..3600ee56c 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
index fb7953f17..8b2994e5f 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
index 983fc2b35..9c079efd4 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -11,6 +10,7 @@
 #include <limits>
 #include <fstream>
 #include <unordered_map>
+#include <memory>
 
 #include "details/caseless.hpp"
 #include "mkldnn_graph.h"
@@ -24,7 +24,6 @@
 #include "mkldnn_extension_utils.h"
 #include "mkldnn_extension_mngr.h"
 #include "mkldnn/omp_manager.h"
-#include "ie_parallel.hpp"
 #include <graph_tools.hpp>
 #include <cpp_interfaces/ie_executor_manager.hpp>
 #include "ie_algorithm.hpp"
@@ -33,21 +32,34 @@
 #include "mkldnn_async_infer_request.h"
 #include <blob_factory.hpp>
 #include <ie_util_internal.hpp>
+#include <net_pass.h>
+
+#include <mkldnn_graph_dumper.h>
 
 #include <data_stats.h>
-#include "../inference_engine/cnn_network_int8_normalizer.hpp"
+#include "cnn_network_int8_normalizer.hpp"
+#include "ie_memcpy.h"
 
 #define XBYAK_NO_OP_NAMES
 #define XBYAK_UNDEF_JNL
 #include "../../thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h"
 
 #include "cnn_network_stats_impl.hpp"
-// #define DEBUG_DUMP_PATH "/temp/path/dump/"
-// #define DEBUG_DUMP_NEW_FOLDER_PER_INFER
-#ifdef DEBUG_DUMP_PATH
-#include "../../thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp"
-#include <iomanip>
-// #define DEBUG_BMP_OUTPUT 1
+
+#include "utils/blob_dump.h"
+
+/*****************************************************
+ * Dump capability
+ * Specify path to dump folder in BLOB_DUMP_PATH
+ *****************************************************/
+// #define BLOB_DUMP_PATH "dump"
+
+#ifdef BLOB_DUMP_PATH
+#   define DUMP_DIR        BLOB_DUMP_PATH
+#   define ENABLE_DUMP(_x) { _x ;}
+#else
+#   define DUMP_DIR ""
+#   define ENABLE_DUMP(_x)
 #endif
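The dump capability introduced above is a pure compile-time switch: with BLOB_DUMP_PATH defined, ENABLE_DUMP(_x) expands to its argument in braces; otherwise it expands to nothing and the hooks vanish from the build. The same pattern in isolation (names mirror the macros above; the printf body is illustrative):

    #include <cstdio>

    // #define BLOB_DUMP_PATH "dump"   // uncomment to compile the dump calls in

    #ifdef BLOB_DUMP_PATH
    #   define DUMP_DIR        BLOB_DUMP_PATH
    #   define ENABLE_DUMP(_x) { _x ;}
    #else
    #   define DUMP_DIR ""
    #   define ENABLE_DUMP(_x)
    #endif

    int main() {
        // With BLOB_DUMP_PATH undefined this statement disappears entirely,
        // so release builds pay nothing for the dumping hooks.
        ENABLE_DUMP(std::printf("dumping blobs to %s\n", DUMP_DIR));
        return 0;
    }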
 
 using namespace mkldnn;
@@ -56,37 +68,11 @@ using namespace MKLDNNPlugin;
 using namespace MKLDNNPlugin::cpu;
 using namespace InferenceEngine;
 using namespace InferenceEngine::details;
 
-void BindThreads(mkldnn::engine eng) {
-    static bool alreadyBind = false;
-    if (!alreadyBind) {
-#if IE_THREAD == IE_THREAD_OMP
-        int env_cores = 0;
-        if (getenv("OMP_NUM_THREADS") != nullptr) {
-            try {
-                env_cores = std::stoi(std::string(getenv("OMP_NUM_THREADS")));
-            } catch (...) {
-                env_cores = 0;
-            }
-        }
-#if !(defined(__APPLE__) || defined(_WIN32))
-        OpenMpManager::setGpuDisabled();
-        OpenMpManager::bindOpenMpThreads(env_cores);
-#else
-        int num_cores = env_cores == 0 ? OpenMpManager::getOpenMpThreadNumber() : env_cores;
-        parallel_set_num_threads(num_cores);
-#endif
-#endif
-        alreadyBind = true;
-    }
-}
-
-void MKLDNNGraph::CreateGraph(ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
+void MKLDNNGraph::CreateGraph(const ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
     if (IsReady()) {
         ForgetGraphData();
     }
 
-    if (config.useThreadBinding) BindThreads(eng);
-
     // go over the inputs and create input primitives
     InputsDataMap inputs;
     network.getInputsInfo(inputs);
@@ -273,6 +259,9 @@ void MKLDNNGraph::CreateGraph(ICNNNetwork &network, const MKLDNNExtensionManager
 
     CreatePrimitives();
 
+    // Will do it before cleanup. Because it will lose original layers information
+    if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_init.dot");
+
     for (auto &graphNode : graphNodes) {
         graphNode->cleanup();
     }
@@ -378,15 +367,31 @@ void MKLDNNGraph::ParseNode(const CNNLayerPtr& cnnLayer, MKLDNNNodePtr& parent,
     if (exists)
         return;
 
+    if (cnnLayer->blobs.find("ext-scale") != cnnLayer->blobs.end())
+        node->ext_scales = cnnLayer->blobs["ext-scale"];
+
     graphNodes.push_back(node);
 
     size_t count_out = 0;
+    std::vector<ParsedLayer> remaining;
     for (const auto &layer : cnnLayer->outData) {
+        bool first = true;
         for (const auto &data : layer->getInputTo()) {
-            queuelayers.push_back({node, data.second, count_out});
+            if (first) {
+                queuelayers.push_back({node, data.second, count_out});
+                first = false;
+            } else {
+                // TODO: Just to hide bug with port ordering.
+                //       At first step we visit only first connection
+                //       at port. As second we will visit all remaining.
+                //
+                //       Not first connection to the port are stored here
+                remaining.push_back({node, data.second, count_out});
+            }
         }
         count_out++;
     }
+    queuelayers.insert(queuelayers.end(), remaining.begin(), remaining.end());
 }
 
 void MKLDNNGraph::InitNodes() {
@@ -416,58 +421,6 @@ void MKLDNNGraph::InitEdges() {
         if (MKLDNNMemoryDesc(parentDesc).getFormat() != MKLDNNMemoryDesc(childDesc).getFormat()) {
             inArgs += (inArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(parentDesc).getFormat());
             outArgs += (outArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(childDesc).getFormat());
-        } else if (inArgs.empty() && outArgs.empty()) {
-            // This detailed name disabled by request from ICV team
-#if 0
-            auto parentBlk = parentDesc.getBlockingDesc();
-            auto childBlk = childDesc.getBlockingDesc();
-            std::string order_in, order_out, stride_in, stride_out, dims_in, dims_out, off_in, off_out;
-            for (size_t i = 0; i < parentBlk.getBlockDims().size(); i++) {
-                if (i) {
-                    stride_in += ",";
-                    order_in += ",";
-                    dims_in += ",";
-                    off_in += ",";
-                }
-                stride_in += std::to_string(parentBlk.getStrides()[i]);
-                order_in += std::to_string(parentBlk.getOrder()[i]);
-                dims_in += std::to_string(parentBlk.getBlockDims()[i]);
-                off_in += std::to_string(parentBlk.getOffsetPaddingToData()[i]);
-            }
-            for (size_t i = 0; i < childBlk.getBlockDims().size(); i++) {
-                if (i) {
-                    stride_out += ",";
-                    order_out += ",";
-                    dims_out += ",";
-                    off_out += ",";
-                }
-                stride_out += std::to_string(childBlk.getStrides()[i]);
-                order_out += std::to_string(childBlk.getOrder()[i]);
-                dims_out += std::to_string(childBlk.getBlockDims()[i]);
-                off_out += std::to_string(childBlk.getOffsetPaddingToData()[i]);
-            }
-
-            if (parentBlk.getOffsetPadding() != childBlk.getOffsetPadding()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("off:") + std::to_string(parentBlk.getOffsetPadding());
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("off:") + std::to_string(childBlk.getOffsetPadding());
-            }
-            if (parentBlk.getStrides() != childBlk.getStrides()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("str:") + stride_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("str:") + stride_out;
-            }
-            if (parentBlk.getOrder() != childBlk.getOrder()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("ord:") + order_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("ord:") + order_out;
-            }
-            if (parentBlk.getBlockDims() != childBlk.getBlockDims()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("dim:") + dims_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("dim:") + dims_out;
-            }
-            if (parentBlk.getOffsetPaddingToData() != childBlk.getOffsetPaddingToData()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("offs:") + off_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("offs:") + off_out;
-            }
-#endif
         }
         return inArgs + "_" + outArgs;
     };
@@ -529,7 +482,7 @@ static inline bool isConstOutput(MKLDNNEdgePtr edge) {
 void MKLDNNGraph::AllocateWithReuse() {
     std::vector<std::vector<MKLDNNEdgePtr>> edge_clasters;
 
-    // detect edge clasters which are view on one.
+    // detect edge clusters which are view on one.
     for (auto &edge : graphEdges) {
         MKLDNNEdgePtr par = (edge->getStatus() == MKLDNNEdge::Status::NotAllocated)
                             ? edge->getSharedEdge()
@@ -606,7 +559,7 @@ void MKLDNNGraph::AllocateWithReuse() {
             int e_size = block_desk.getOffsetPadding() + 1;  // size in elements (from begin of data to last element)
             for (int j = 0; j < block_desk.getBlockDims().size(); j++)
-                e_size += (block_desk.getBlockDims()[j] - 1 ) * block_desk.getStrides()[j];
+                e_size += (block_desk.getBlockDims()[j] - 1) * block_desk.getStrides()[j];
 
             box.start = std::min(e_start, box.start);
             box.finish = std::max(e_finish, box.finish);
@@ -754,139 +707,9 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
             MB_to_process = std::min<int>(config.batchLimit, MB_to_process);
         size_t size_to_copy = intr_blob.GetSize() * MB_to_process / MB;
 
-        memcpy(ext_blob_ptr, intr_blob_ptr, size_to_copy);
-    }
-}
-
-#ifdef DEBUG_BMP_OUTPUT
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include "../../thirdparty/stb_lib/stb_image_write.h"
-
-#if defined(_WIN32)
-#define mkdir(dir, mode) _mkdir(dir)
-#endif
-
-void dump_as_bitmaps(const std::string name, const float* data,
-                     const SizeVector& cdims,
-                     mkldnn::impl::memory_format_t format = mkldnn::impl::memory_format::nchw) {
-    std::string dir_name = name + "_bmp_dir/";
-    mkdir(dir_name.c_str(), 0755);
-
-    std::ofstream layer_bmp_log;
-    layer_bmp_log.open(dir_name + "bmp_dump_log.txt");
-    layer_bmp_log << "Format " << format << std::endl;
-
-    if (cdims.size() == 1) {
-        layer_bmp_log << "Only one dimension: " << cdims[0] << std::endl;
-        layer_bmp_log.close();
-        return;
-    }
-
-    SizeVector dims(cdims.rbegin(), cdims.rend());
-
-    size_t x = dims[0], y = dims[1], total_images = 1;
-    size_t img_sz = x*y;
-
-    for (size_t k = 0; k < dims.size(); ++k)
-        if (dims[k])
-            total_images *= dims[k];
-
-    total_images /= img_sz;
-
-    // sanity checks
-    if (img_sz < 100) {
-        layer_bmp_log << "Image size is too small" << std::endl;
-        layer_bmp_log.close();
-        return;
-    } else if (x < 10 || y < 10 || x > 2048 || y > 2048) {
-        layer_bmp_log << "Dimensions are unapropriate to dump - " << y << "x" << x << std::endl;
-        layer_bmp_log.close();
-        return;
-    } else {
-        float ratio = static_cast<float>(x) / static_cast<float>(y);
-        if (ratio < 1.0) ratio = 1.0 / ratio;
-
-        if (ratio > 8.f) {
-            layer_bmp_log << "Suspicious aspect ratio - " << ratio << std::endl;
-            layer_bmp_log.close();
-            return;
-        }
-    }
-
-    layer_bmp_log << total_images << " images to write ..." << std::endl;
-
-    const float* dataPtr = data;
-    for (size_t img = 0; img < total_images; img++) {
-        std::string img_name = "img" + std::to_string(img) + ".bmp";
-
-        // copy image plane to separate buffer,
-        // normalize and convert to 3-channel 8-bit bmp
-        std::vector<float> imgbuf(img_sz);
-        int stride = 1;
-        switch (format) {
-        case mkldnn::impl::memory_format::nChw8c:
-            stride = 8;
-            break;
-        case mkldnn::impl::memory_format::nChw16c:
-            stride = 16;
-            break;
-        case mkldnn::impl::memory_format::nchw:
-        default:
-            break;
-        }
-
-        float maxval = -FLT_MAX, minval = FLT_MAX;
-        for (size_t i = 0; i < y; i++)
-            for (size_t j = 0; j < x; j++) {
-                float val = dataPtr[(i*x + j) * stride];
-                if (val > maxval) maxval = val;
-                if (val < minval) minval = val;
-                imgbuf[i*x + j] = val;
-            }
-
-        if (minval >= 0.f && maxval <= 0.f) {
-            layer_bmp_log << img_name << " all zero." << std::endl;
-        } else {
-            const float mult = 256.f / (maxval - minval);
-            std::vector<unsigned char> bmpbuf(img_sz * 3);
-            unsigned char* bmp_ptr = bmpbuf.data();
-
-            for (int i = 0; i < imgbuf.size(); i++, bmp_ptr += 3) {
-                if (imgbuf[i] >= 0.f && imgbuf[i] <= 0.f) {
-                    bmp_ptr[0] = 65;
-                    bmp_ptr[1] = bmp_ptr[2] = 0;
-                } else {
-                    bmp_ptr[0] = bmp_ptr[1] = bmp_ptr[2] = (unsigned char)((imgbuf[i] - minval) * mult);
-                }
-            }
-
-            // write bmp file
-            std::string full_name = dir_name + img_name;
-            stbi_write_bmp(full_name.c_str(), x, y, 3, (const void *)bmpbuf.data());
-        }
-
-        switch (format) {
-        case mkldnn::impl::memory_format::nChw8c:
-            if ( ( img & 7 ) < 7 ) dataPtr++;
-            else dataPtr += img_sz * 8;
-            break;
-        case mkldnn::impl::memory_format::nChw16c:
-            if ( ( img & 15 ) < 15 ) dataPtr++;
-            else dataPtr += img_sz * 16;
-            break;
-        case mkldnn::impl::memory_format::nchw:
-        default:
-            dataPtr += img_sz;
-            break;
-        }
+        ie_memcpy(ext_blob_ptr, ext_blob->byteSize(), intr_blob_ptr, size_to_copy);
     }
-
-    layer_bmp_log.close();
 }
-#endif
 
 void MKLDNNGraph::Infer(int batch) {
     if (!IsReady()) {
     }
 
     mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
-#ifdef DEBUG_DUMP_NEW_FOLDER_PER_INFER
-    static int folderIdx = 0;
-    folderIdx++;
-#endif
     for (int i = 0; i < graphNodes.size(); i++) {
         PERF(graphNodes[i]);
 
         if (batch > 0)
             graphNodes[i]->setDynamicBatchLim(batch);
 
+        ENABLE_DUMP(do_before(DUMP_DIR, graphNodes[i]));
+
         if (!graphNodes[i]->isConstant()) {
             IE_PROFILING_AUTO_SCOPE_TASK(graphNodes[i]->profilingTask)
             graphNodes[i]->execute(stream);
         }
-#ifdef DEBUG_DUMP_PATH
-        {
-            auto folderName = std::string(DEBUG_DUMP_PATH) +
-#ifdef DEBUG_DUMP_NEW_FOLDER_PER_INFER
-                std::to_string(folderIdx - 1) +
-#endif
-                "/";
-            std::cout << "Try to create logs for " << graphNodes[i]->getName() << std::endl;
-            std::string nodeName = graphNodes[i]->name;
-            std::replace(nodeName.begin(), nodeName.end(), '/', '_');
-            std::ofstream layer_data_dump;
-            for (size_t j = 0; j < graphNodes[i]->getChildEdges().size(); j++) {
-                auto childEdge = graphNodes[i]->getChildEdgeAt(j);
-                std::string childName = graphNodes[i]->getChildEdgeAt(j)->getChild()->getName();
-                std::replace(childName.begin(), childName.end(), '/', '_');
-
-                // std::string fname = DEBUG_DUMP_PATH + nodeName + "_dst_" + childName + "_" + std::to_string(j) + ".txt";
-                std::string tname = folderName + nodeName + "_dst_" + childName + "_" + std::to_string(j);
-                std::string fname = tname + ".txt";
-                if (graphNodes[i]->getChildEdges().size() == 1) {
-                    fname = folderName + nodeName + "_dst.txt";
-                }
-                layer_data_dump.open(fname);
-                if (layer_data_dump.is_open()) {
-                    float *data = static_cast<float *>(childEdge->getMemory().GetData());
-                    mkldnn::impl::memory_desc_wrapper dst_d(childEdge->getMemory().GetDescriptor().data);
-#ifdef DEBUG_BMP_OUTPUT
-                    dump_as_bitmaps(tname, data, childEdge->getDims().ToSizeVector(), dst_d.format());
-#endif
-
-                    layer_data_dump << "shape: ";
-                    for (size_t d = 0; d < childEdge->getDims().ndims(); d++)
-                        layer_data_dump << childEdge->getDims()[d] << " ";
-                    layer_data_dump << "(" << dst_d.nelems() << ")" << std::endl;
-                    if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::FP32) {
-                        float *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << std::fixed << std::setprecision(3) << data[dst_d.off_l(bs)] << std::endl;
-                        }
-                    } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::I8) {
-                        int8_t *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::U8) {
-                        uint8_t *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::I32) {
-                        int32_t *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
-                        }
-                    }
-
-                    layer_data_dump.close();
-                } else {
-                    std::cout << "Cannot create file " << fname << std::endl;
-                }
-            }
-
-            for (size_t p = 0 ; p < graphNodes[i]->getParentEdges().size(); p++) {
-                auto parentEdge = graphNodes[i]->getParentEdgeAt(p);
-                auto parent = parentEdge->getParent();
-                std::string parentName = parent->getName();
-                std::replace(parentName.begin(), parentName.end(), '/', '_');
-                // std::string fname = folderName + nodeName + "_src_" + parentName + "_" + std::to_string(p) + ".txt";
-                std::string tname = folderName + nodeName + "_src_" + parentName + "_" + std::to_string(p);
-                std::string fname = tname + ".txt";
-                layer_data_dump.open(fname);
-                if (layer_data_dump.is_open()) {
-                    size_t dataSize = graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetSize();
-                    mkldnn::impl::memory_desc_wrapper src_d(graphNodes[i]->getParentEdges()[p]
-                            .lock()->getMemory().GetDescriptor().data);
-#ifdef DEBUG_BMP_OUTPUT
-                    dump_as_bitmaps(tname, data, parentEdge->getDims().ToSizeVector(), src_d.format());
-#endif
-                    layer_data_dump << "shape: ";
-                    for (size_t d = 0; d < parentEdge->getDims().ndims(); d++)
-                        layer_data_dump << parentEdge->getDims()[d] << " ";
-                    layer_data_dump << "(" << src_d.nelems() << ")" << std::endl;
-                    auto precision = graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision();
-                    if (precision == Precision::FP32) {
-                        float *data = static_cast<float *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << std::fixed << std::setprecision(3) << data[src_d.off_l(bs)] << std::endl;
-                        }
-                    } else if (precision == Precision::I8) {
-                        int8_t *data = static_cast<int8_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision() == Precision::U8) {
-                        uint8_t *data = static_cast<uint8_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision() == Precision::I32) {
-                        int32_t *data = static_cast<int32_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
-                        }
-                    } else {
-                        layer_data_dump << "Unsupported precision: " << precision.name() << std::endl;
-                    }
-
-                    layer_data_dump.close();
-                } else {
-                    std::cout << "Cannot create file " << fname << std::endl;
-                }
-            }
-
-            GenericLayer* genericLayer = dynamic_cast<GenericLayer*>(graphNodes[i]->getCnnLayer().get());
-            if (genericLayer != nullptr) {
-                for (auto blob : genericLayer->blobs) {
-                    layer_data_dump.open(folderName + nodeName + "_blob-" + blob.first + ".txt");
-                    if (layer_data_dump.is_open()) {
-                        layer_data_dump << "shape: ";
-                        for (size_t d = 0; d < blob.second->dims().size(); d++)
-                            layer_data_dump << blob.second->dims()[d] << " ";
-                        layer_data_dump << "(" << blob.second->size() << ")" << std::endl;
-                        if (blob.second->getTensorDesc().getPrecision() == Precision::FP32) {
-                            float *data = blob.second->buffer();
-                            for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                                layer_data_dump << std::fixed << std::setprecision(3) << data[bs] << std::endl;
-                            }
-                        } else if (blob.second->getTensorDesc().getPrecision() == Precision::I8) {
-                            int8_t *data = blob.second->buffer();
-                            for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                                layer_data_dump << static_cast<int>(data[bs]) << std::endl;
-                            }
-                        } else if (blob.second->getTensorDesc().getPrecision() == Precision::U8) {
-                            uint8_t *data = blob.second->buffer();
-                            for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                                layer_data_dump << static_cast<int>(data[bs]) << std::endl;
-                            }
-                        } else if (blob.second->getTensorDesc().getPrecision() == Precision::I32) {
-                            int32_t *data = blob.second->buffer();
-                            for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                                layer_data_dump << static_cast<int>(data[bs]) << std::endl;
-                            }
-                        } else {
-                            layer_data_dump << "Unsupported precision: " << blob.second->getTensorDesc().getPrecision().name() << std::endl;
-                        }
-                        layer_data_dump.close();
-                    } else {
-                        std::cout << "Cannot create file " << folderName << nodeName
-                                  << "_" << blob.first << ".txt" << std::endl;
-                    }
-                }
-            }
-        }
-#endif
+
+        ENABLE_DUMP(do_after(DUMP_DIR, graphNodes[i]));
     }
 }
@@ -1153,6 +821,8 @@ void MKLDNNGraph::GetPerfData(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const {
     for (int i = 1; i < graphNodes.size(); i++) {
         getPerfMapFor(perfMap, graphNodes[i]);
     }
+
+    if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_perf.dot");
 }
 
 void MKLDNNGraph::setConfig(const Config &cfg) {
@@ -1257,7 +927,56 @@ void MKLDNNGraph::RemoveDroppedEdges() {
     }
 }
 
-bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network) const {
+void MKLDNNGraph::dumpToDotFile(std::string file) const {
+    std::ofstream dot;
+    dot.open(file);
+    if (!dot.is_open()) THROW_IE_EXCEPTION << "CPU Plugin cannot create dot file " << file << ".";
+
+    dump_graph_as_dot(*this, dot);
+    dot.close();
+}
+
+void MKLDNNGraph::do_before(const std::string &dir, const MKLDNNNodePtr &node) {
+    auto exec_order = std::to_string(node->execIndex);
std::string nodeName = node->name; + std::replace(nodeName.begin(), nodeName.end(), '/', '_'); + + auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size(); + for (size_t i = 0; i < num_ports; i++) { + auto prEdge = node->getParentEdgeAt(i); + auto pr = prEdge->getParent(); + + auto dump_file = dir + "/#" + exec_order + "_" + nodeName + "_in" + std::to_string(i) + ".ieb"; + TensorDesc desc = prEdge->getDesc(); + Blob::Ptr blob = make_blob_with_precision(desc, prEdge->getMemoryPtr()->GetData()); + + BlobDumper dumper(blob); + if (pr->ext_scales) dumper.withScales(pr->ext_scales); + dumper.dump(dump_file); + } +} + +void MKLDNNGraph::do_after(const std::string &dir, const MKLDNNNodePtr &node) { + auto exec_order = std::to_string(node->execIndex); + auto nodeName = node->name; + std::replace(nodeName.begin(), nodeName.end(), '/', '_'); + + auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size(); + for (size_t i = 0; i < num_ports; i++) { + auto childEdge = node->getChildEdgeAt(i); + + auto dump_file = dir + "/#" + exec_order + "_" + nodeName + "_out" + std::to_string(i) + ".ieb"; + TensorDesc desc = childEdge->getDesc(); + Blob::Ptr blob = make_blob_with_precision(desc, childEdge->getMemoryPtr()->GetData()); + + BlobDumper dumper(blob); + if (node->ext_scales) dumper.withScales(node->ext_scales); + + dumper.dump(dump_file); + } +} + +bool MKLDNNExecNetwork::CanProcessDynBatch(const InferenceEngine::ICNNNetwork &network) const { InputsDataMap inputs; network.getInputsInfo(inputs); @@ -1274,6 +993,11 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network bool check_result = true; details::UnorderedDFS(allLayers, secondLayers.begin()->second, [&](CNNLayerPtr layer) { auto type = TypeFromName(layer->type); + // This is WA for Tile layer + auto tileLayer = dynamic_cast<TileLayer *>(layer.get()); + if (tileLayer && tileLayer->axis) + return; + if (type != Input && type != Output && type != Convolution && @@ -1283,6 +1007,7 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network type != Lrn && type != Pooling && type != FullyConnected && + type != Gemm && type != SoftMax && type != Split && type != Concatenation && @@ -1301,55 +1026,87 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network InferenceEngine::InferRequestInternal::Ptr MKLDNNExecNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs, InferenceEngine::OutputsDataMap networkOutputs) { - return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs); + if (graphs.size() > 1) // streams uses special requests that are not connected to graphs + return std::make_shared<MKLDNNGraphlessInferRequest>(networkInputs, networkOutputs); + else + return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs); } -MKLDNNExecNetwork::MKLDNNExecNetwork(InferenceEngine::ICNNNetwork &network, +MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network, const Config &cfg, const MKLDNNExtensionManager::Ptr& extMgr) : extensionManager(extMgr) { - graph.reset(new MKLDNNGraph()); - graph->setConfig(cfg); + ICNNNetworkStats* pstats = nullptr; + StatusCode s = network.getStats(&pstats, nullptr); + // we are cloning network if we have statistics and we can transform network + // in other case we pass original network. 
+ details::CNNNetworkImplPtr clonedNetwork; + if (s == StatusCode::OK && pstats && !pstats->isEmpty()) { + CNNNetworkInt8Normalizer cnnorm; + clonedNetwork = cloneNet(network); + cnnorm.NormalizeNetwork(*clonedNetwork, *pstats); + } + bool ti_proc_ok = !NetPass::CombineLSTMSeq(network) ? NetPass::UnrollTI(network) : true; + if (!ti_proc_ok) + THROW_IE_EXCEPTION << "Plugin doesn't support Tensor Iterator in pure form. " + "No TI optimization pattern has been applied successfully"; + if (cfg.batchLimit > 1) { // check topology for applicability - if (!CanProcessDynBatch(network)) { + if (!CanProcessDynBatch(clonedNetwork ? *clonedNetwork : network)) { THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!"; } } + // check whether any (affinity-related) envs are set and whether the user requested thread binding + const bool bPinningRequested = !check_env_variables() && cfg.useThreadBinding; + // general #threads logic + const int env_threads = parallel_get_env_threads(); + // streams need all (logical) cores, while the single-stream case needs just the physical cores (better for servers) + const int hw_cores = cfg.throughputStreams > 1 ? parallel_get_max_threads() : getNumberOfCPUCores(); + const int threads = cfg.threadsNum ? cfg.threadsNum : (env_threads ? env_threads : hw_cores); + const int threads_per_stream = std::max(1, threads/cfg.throughputStreams); + + // graph(s) initialization in taskExecutor threads (streams), in parallel (in case of streams) + std::vector<Task::Ptr> tasks; + + for (int n = 0; n < cfg.throughputStreams; n++) { + MKLDNNGraph::Ptr _graph = std::make_shared<MKLDNNGraph>(); + graphs.push_back(_graph); + auto task = std::make_shared<InferenceEngine::Task>([=, &cfg, &network]() { + _graph->CreateArena(threads_per_stream); + + if (bPinningRequested) { + _graph->CreateObserver(n, threads_per_stream); + } - if (graph->getProperty().exclusiveAsyncRequests) { - ExecutorManager *executorManager = ExecutorManager::getInstance(); - _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eCPU)); + _graph->setConfig(cfg); + _graph->CreateGraph(clonedNetwork ? *clonedNetwork : network, extensionManager); + if (cfg.throughputStreams > 1) // for streams, each worker thread has its own graph + MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph = _graph; + }); + tasks.push_back(task); } - // initialization in taskExecutor thread - auto task = std::make_shared<InferenceEngine::Task>([&]() { - // we are cloning network if we have statistics and we can transform network - // in other case we pass original network. Especially because LSTM networks - are not cloned properly
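A note on the stream sizing above: the thread budget is the explicit config value if set, otherwise the environment-provided count, otherwise the hardware core count, and it is split evenly across streams with a floor of one thread per stream. A self-contained sketch of that arithmetic; the function and parameter names are illustrative, not plugin API:

#include <algorithm>
#include <cstdio>

// Mirrors the thread-budget arithmetic above: an explicit config value wins,
// then the environment-provided count, then the hardware core count; the
// budget is divided evenly between streams and never drops below one.
static int threads_per_stream(int cfg_threads, int env_threads,
                              int hw_cores, int streams) {
    const int threads = cfg_threads ? cfg_threads : (env_threads ? env_threads : hw_cores);
    return std::max(1, threads / streams);
}

int main() {
    std::printf("%d\n", threads_per_stream(0, 0, 8, 4));  // 8 cores / 4 streams -> 2
    std::printf("%d\n", threads_per_stream(0, 0, 2, 4));  // floor of one per stream -> 1
}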
- ICNNNetworkStats* pstats = nullptr; - StatusCode s = network.getStats(&pstats, nullptr); - Xbyak::util::Cpu cpu; - // Enable int8 only for avx512 - if (s == StatusCode::OK && pstats && !pstats->isEmpty() && cpu.has(Xbyak::util::Cpu::tAVX512F)) { - details::CNNNetworkImplPtr clonnedNetwork = cloneNet(network); - CNNNetworkInt8Normalizer cnnorm; - cnnorm.NormalizeNetwork(*clonnedNetwork, *pstats); - graph->CreateGraph(*clonnedNetwork, extensionManager); - } else { - graph->CreateGraph(network, extensionManager); + if (cfg.throughputStreams > 1) { + // special executor with as many threads as requested #streams, each with its own initialization task + _taskExecutor = std::make_shared<MultiWorkerTaskExecutor>(tasks); + } else { + if (cfg.exclusiveAsyncRequests) { + // special case when all InferRequests are muxed into a single queue + ExecutorManager *executorManager = ExecutorManager::getInstance(); + _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eCPU)); } - }); - - _taskExecutor->startTask(task); - Task::Status sts = task->wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); - - if (sts == Task::TS_ERROR) task->checkException(); + _taskExecutor->startTask(tasks[0]); + Task::Status sts = tasks[0]->wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); + } + for (auto t : tasks) + t->checkException(); } void MKLDNNExecNetwork::setProperty(const std::map<std::string, std::string> &properties) { - if (graph) // TODO: graph field cannot be empty - graph->setProperty(properties); + for (auto g : graphs) + g->setProperty(properties); } void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) { @@ -1362,13 +1119,10 @@ void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr & asyncRequestImpl->SetPointerToPublicInterface(asyncRequest); - auto mkldnnSyncRequest = dynamic_cast<MKLDNNInferRequest *>(syncRequestImpl.get()); - if (!mkldnnSyncRequest) - THROW_IE_EXCEPTION << " Cannot get mkldnn sync request."; - mkldnnSyncRequest->SetGraph(graph); -} - -MKLDNNExecNetwork::~MKLDNNExecNetwork() { - graph.reset(); - extensionManager.reset(); + if (graphs.size() == 1) { // single-stream (legacy/hetero) case - single graph for all requests + auto mkldnnSyncRequest = dynamic_cast<MKLDNNInferRequest *>(syncRequestImpl.get()); + if (!mkldnnSyncRequest) + THROW_IE_EXCEPTION << " Cannot get mkldnn sync request."; + mkldnnSyncRequest->SetGraph(graphs[0]); + } } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h index d1fdb0fe9..de026b5ad 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +10,7 @@ #include <memory> #include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp> +#include "ie_parallel.hpp" #include "mkldnn_memory.h" #include "config.h" #include "perf_count.h" @@ -19,6 +19,7 @@ #include "mkldnn_node.h" #include "mkldnn_edge.h" #include "mkldnn_extension_utils.h" +#include "mkldnn_streams.h" namespace MKLDNNPlugin { @@ -48,7 +49,7 @@ public: void getInputBlobs(InferenceEngine::BlobMap &in_map); void getOutputBlobs(InferenceEngine::BlobMap &out_map); - void CreateGraph(InferenceEngine::ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr); + void CreateGraph(const
InferenceEngine::ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr); bool hasMeanImageFor(const std::string& name) { return _meanImages.find(name) != _meanImages.end(); @@ -81,6 +82,35 @@ public: void RemoveDroppedEdges(); void DropNode(const MKLDNNNodePtr& node); + void CreateArena(int threads_per_stream) { + #if IE_THREAD == IE_THREAD_OMP + omp_set_num_threads(threads_per_stream); + #elif IE_THREAD == IE_THREAD_TBB + ptrArena = std::unique_ptr<tbb::task_arena>(new tbb::task_arena(threads_per_stream)); + #endif + } + + void CreateObserver(int _stream_id, int _threads_per_stream, int _pinning_step = 1) { + #if IE_THREAD == IE_THREAD_TBB + ptrObserver + = std::unique_ptr<tbb::task_scheduler_observer>( + new pinning_observer(*ptrArena.get(), _stream_id, _threads_per_stream, _pinning_step)); + #else + cpu_set_t *process_mask = nullptr; + int ncpus = 0; + get_process_mask(ncpus, process_mask); + #if IE_THREAD == IE_THREAD_OMP + #pragma omp parallel for + for (int thread_index = 0; thread_index < _threads_per_stream; thread_index++) { + pin_thread_to_vacant_core(_stream_id * _threads_per_stream + thread_index, 1, ncpus, process_mask); + } + #elif IE_THREAD == IE_THREAD_SEQ + pin_thread_to_vacant_core(_stream_id * _threads_per_stream, 1, ncpus, process_mask); + #endif + CPU_FREE(process_mask); + #endif + } + protected: MKLDNNNodePtr FindNodeWithName(const std::string& name) const; void VisitNode(MKLDNNNodePtr node, std::vector<MKLDNNNodePtr>& sortedNodes); @@ -108,6 +138,10 @@ protected: std::map<std::string, MeanImage> _meanImages; + #if IE_THREAD == IE_THREAD_TBB + std::unique_ptr<tbb::task_arena> ptrArena; + std::unique_ptr<tbb::task_scheduler_observer> ptrObserver; + #endif mkldnn::engine eng; void InitNodes(); @@ -116,13 +150,15 @@ protected: void AllocateWithReuse(); void CreatePrimitives(); - void BreakEdgeInsertScaleShift(MKLDNNPlugin::MKLDNNEdgePtr edgeToBreak, - InferenceEngine::CNNLayerPtr ssCnnLayer); - void AddScaleShiftBeforeAndAfterInt8(InferenceEngine::CNNNetwork& net); + void do_before(const std::string &dir, const MKLDNNNodePtr &node); + void do_after(const std::string &dir, const MKLDNNNodePtr &node); friend class MKLDNNInferRequest; + friend class MKLDNNGraphlessInferRequest; + friend std::shared_ptr<InferenceEngine::ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph); private: + void dumpToDotFile(std::string file) const; struct ParsedLayer { MKLDNNNodePtr parent; InferenceEngine::CNNLayerPtr cnnLayer; @@ -142,18 +178,21 @@ public: void CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) override; - MKLDNNExecNetwork(InferenceEngine::ICNNNetwork &network, const Config &cfg, + MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network, const Config &cfg, const MKLDNNExtensionManager::Ptr& extMgr); - ~MKLDNNExecNetwork() override; + ~MKLDNNExecNetwork() { + graphs.clear(); + extensionManager.reset(); + } void setProperty(const std::map<std::string, std::string> &properties); protected: - MKLDNNGraph::Ptr graph; + std::vector<MKLDNNGraph::Ptr> graphs; MKLDNNExtensionManager::Ptr extensionManager; - bool CanProcessDynBatch(InferenceEngine::ICNNNetwork &network) const; + bool CanProcessDynBatch(const InferenceEngine::ICNNNetwork &network) const; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp new file mode 100644 index 000000000..ae24579f6 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp @@ 
-0,0 +1,207 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_graph_dumper.h" +#include "cnn_network_impl.hpp" +#include "ie_util_internal.hpp" + +#include <vector> +#include <string> +#include <memory> +#include <map> + +using namespace InferenceEngine; + +namespace MKLDNNPlugin { + +static void copy_node_metadata(const MKLDNNNodePtr &, CNNLayer::Ptr &); +static void drawer_callback(const InferenceEngine::CNNLayerPtr, ordered_properties &, ordered_properties &); + +CNNLayer::Ptr convert_node(const MKLDNNNodePtr &node) { + CNNLayer::Ptr layer(new CNNLayer({"name", "type", Precision::FP32})); + copy_node_metadata(node, layer); + + auto &cfg = node->getSelectedPrimitiveDescriptor()->getConfig(); + layer->insData.resize(cfg.inConfs.size()); + layer->outData.resize(cfg.outConfs.size()); + + return layer; +} + +std::shared_ptr<ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph) { + auto net = std::make_shared<details::CNNNetworkImpl>(); + + net->setPrecision(Precision::FP32); + net->setName("internal_cpu_graph"); + std::map<MKLDNNNodePtr, CNNLayerPtr> node2layer; + + // Copy all nodes to network + for (auto &node : graph.graphNodes) { + auto layer = convert_node(node); + node2layer[node] = layer; + net->addLayer(layer); + } + + // Copy all edges to network + for (auto &node : graph.graphNodes) { + auto pr = node2layer[node]; + auto ch_edges = node->getChildEdges(); + + for (int i = 0; i < ch_edges.size(); i++) { + auto edge = node->getChildEdgeAt(i); + int out_port = edge->getInputNum(); + int in_port = edge->getOutputNum(); + auto ch_node = edge->getChild(); + auto ch = node2layer[ch_node]; + + DataPtr data; + if (i < pr->outData.size()) { + std::string data_name = node->getName() + "_out" + std::to_string(i); + pr->outData[i] = std::make_shared<Data>(data_name, edge->getDesc()); + data = pr->outData[i]; + data->creatorLayer = pr; + } else { + data = pr->outData[0]; + } + + data->inputTo[ch->name] = ch; + ch->insData[in_port] = data; + } + } + + // Specify inputs data + for (auto kvp : graph.inputNodes) { + auto in_node = kvp.second; + auto in_layer = node2layer[in_node]; + + auto in_info = std::make_shared<InputInfo>(); + in_info->setInputData(in_layer->outData[0]); + net->setInputInfo(in_info); + } + + return net; +} + +void dump_graph_as_dot(const MKLDNNGraph &graph, std::ostream &out) { + auto dump_net = dump_graph_as_ie_net(graph); + InferenceEngine::saveGraphToDot(*dump_net, out, drawer_callback); +} + +//********************************** +// Special converters of meta data +//********************************** + +static std::map<Type, std::string> type_n2l { + {Unknown, "Unknown"}, + {Generic, "Unknown"}, + {Reorder, "Reorder"}, + {Copy, "Reorder"}, + {Input, "Input"}, + {Output, "Output"}, + {Convolution, "Conv"}, + {Deconvolution, "Deconv"}, + {Convolution_Sum, "Conv_Eltw"}, + {Convolution_Activation, "Conv_Activ"}, + {Convolution_Sum_Activation, "Conv_Eltw_Activ"}, + {Activation, "Activation"}, + {Depthwise, "Depthwise"}, + {Lrn, "Lrn"}, + {Pooling, "Pool"}, + {FullyConnected, "FC"}, + {SoftMax, "SoftMax"}, + {Split, "Split"}, + {Concatenation, "Concat"}, + {Power, "Power"}, + {Eltwise, "Eltwise"}, + {Crop, "Crop"}, + {Reshape, "Reshape"}, + {Tile, "Tile"}, + {SimplerNMS, "Proposal"}, + {ROIPooling, "ROIPooling"}, + {BatchNormalization, "BatchNorm"}, + {Flatten, "Flatten"}, + {Permute, "Permute"}, + {MemoryOutput, "MemoryIn"}, + {MemoryInput, "MemoryOut"} +}; + +static const std::string ORIGIN_NAMES = "origin"; 
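dump_graph_as_ie_net above rebuilds the executable graph as a CNNNetwork so that dump_graph_as_dot can delegate to saveGraphToDot; the GraphViz text it ultimately produces amounts to one record per node plus one arrow per edge. A reduced standalone sketch of such an emitter; the Node struct and emit_dot function are illustrative only:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct Node { std::string name; std::string fillcolor; };

// Emits a minimal GraphViz digraph: a filled record per node (cf. the
// GREEN/BLUE coloring in drawer_callback) and a directed edge per link.
static void emit_dot(const std::vector<Node> &nodes,
                     const std::vector<std::pair<int, int>> &edges,
                     std::ostream &out) {
    out << "digraph internal_cpu_graph {\n";
    for (size_t i = 0; i < nodes.size(); i++)
        out << "  n" << i << " [label=\"" << nodes[i].name
            << "\", style=filled, fillcolor=\"" << nodes[i].fillcolor << "\"];\n";
    for (const auto &e : edges)
        out << "  n" << e.first << " -> n" << e.second << ";\n";
    out << "}\n";
}

int main() {
    std::vector<Node> nodes = {{"Input", "#D9EAD3"}, {"Conv", "#D8D9F1"}};
    emit_dot(nodes, {{0, 1}}, std::cout);
}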
+static const std::string IMPL_TYPE = "impl"; +static const std::string PRECISION = "prec"; +static const std::string PERF_COUNTER = "perf"; + +static const std::string BLUE = "#D8D9F1"; +static const std::string GREEN = "#D9EAD3"; + +void copy_node_metadata(const MKLDNNNodePtr &node, CNNLayer::Ptr &layer) { + layer->type = type_n2l[node->getType()]; + layer->name = node->getName(); // Is ID + + if (node->getCnnLayer()) { + // Original layer names + std::vector<MKLDNNNodePtr> internal = node->getFusedWith(); + auto &merged = node->getMergeWith(); + internal.insert(internal.end(), merged.begin(), merged.end()); + + std::string orig_names = node->getCnnLayer()->name; + for (auto &sub_node : internal) + orig_names += " " + sub_node->getCnnLayer()->name; + + layer->params[ORIGIN_NAMES] = orig_names; + } + + // Implementation type name + layer->params[IMPL_TYPE] = node->getPrimitiveDescriptorType(); + + // Precision + // TODO: That is not fully correct mapping type to precision. + std::string precision = "FP32"; + auto desc = node->getSelectedPrimitiveDescriptor(); + if (desc == nullptr) { + THROW_IE_EXCEPTION << "Internal error - descriptor is empty"; + } + impl_desc_type impl_type = desc->getImplementationType(); + + if (impl_type == gemm_blas && + node->getParentEdgeAt(0)->getDesc().getPrecision() == Precision::U8) precision = "INT8"; + + if (impl_type & jit && impl_type & avx512 && + node->getParentEdgeAt(0)->getDesc().getPrecision() == Precision::U8) precision = "INT8"; + + layer->params[PRECISION] = precision; + + // Performance + if (node->PerfCounter().avg() != 0) { + layer->params[PERF_COUNTER] = std::to_string(node->PerfCounter().avg())+ " mcs"; + } +} + +void drawer_callback(const InferenceEngine::CNNLayerPtr layer, + ordered_properties &printed_properties, + ordered_properties &node_properties) { + const auto ¶ms = layer->params; + + // Implementation + auto impl = params.find(IMPL_TYPE); + if (impl != params.end()) { + printed_properties.push_back({"impl", impl->second}); + } + + // Original names + auto orig = params.find(ORIGIN_NAMES); + if (orig != params.end()) { + printed_properties.push_back({"originals", orig->second}); + } + + // Precision + auto prec = params.find(PRECISION); + if (prec != params.end()) { + printed_properties.push_back({"precision", prec->second}); + } + + // Set color + node_properties.push_back({"fillcolor", prec->second == "FP32" ? 
GREEN : BLUE}); +} + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h new file mode 100644 index 000000000..6ec5ffc45 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h @@ -0,0 +1,18 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ie_icnn_network.hpp" +#include "mkldnn_graph.h" + +#include <memory> + +namespace MKLDNNPlugin { + + void dump_graph_as_dot(const MKLDNNGraph &graph, std::ostream &out); + + std::shared_ptr<InferenceEngine::ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph); + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp index 3be1fbf23..6c88ebd6f 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -144,20 +143,27 @@ void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) { } void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) { + auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) { + for (auto a : algs) { + if (alg == a) { + return true; + } + } + return false; + }; + auto& graphNodes = graph.GetNodes(); - auto isFusingSupported = [&](MKLDNNNodePtr node) { - if (!node->getCnnLayer()) + auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) { + if (!activation->getCnnLayer()) return false; - auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get()); + auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get()); return activationNode && - (activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_relu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_elu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_logistic || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_bounded_relu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_clamp); + (activationNode->getAlgorithm() == eltwise_relu || + (conv->getCnnLayer()->precision == Precision::FP32 && + isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp}))); }; for (int i = 0; i < graphNodes.size(); i++) { @@ -172,13 +178,13 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) { if (conv->getChildEdges().size() == 1) { auto ch1 = conv->getChildEdgeAt(0)->getChild(); - if (isFusingSupported(ch1)) { + if (isFusingSupported(conv, ch1)) { fuse(ch1); if (ch1->getChildEdges().size() == 1) { auto ch2 = ch1->getChildEdgeAt(0)->getChild(); - if (isFusingSupported(ch2)) { + if (isFusingSupported(conv, ch2)) { fuse(ch2); graph.DropNode(ch2); } @@ -193,7 +199,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) { if (is_max_pool && pool->getChildEdges().size() == 1) { auto ch2 = pool->getChildEdgeAt(0)->getChild(); - if (isFusingSupported(ch2)) { + if (isFusingSupported(conv, ch2)) { fuse(ch2); graph.DropNode(ch2); } @@ -274,8 +280,12 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) { auto isSutableChildConvolution = [](MKLDNNNodePtr node) { auto* layer = 
dynamic_cast<ConvolutionLayer*>(node->getCnnLayer().get()); - auto allPads = getConvPaddings(*layer); + auto allPads = getPaddings(*layer); bool isSupportedParams = layer->_out_depth == layer->_group && + + layer->_out_depth != 1 && + // Depthwise convolution output should be multiple of 8 + layer->_kernel[X_AXIS] == 3 && layer->_kernel[Y_AXIS] == 3 && allPads.begin[X_AXIS] == 1 && allPads.begin[Y_AXIS] == 1 && layer->_dilation[X_AXIS] == 1 && layer->_dilation[Y_AXIS] == 1 && @@ -379,18 +389,25 @@ static bool is_data_dependency(const std::shared_ptr<MKLDNNNode> &parent, void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph) { std::vector<MKLDNNNodePtr> &graphNodes = graph.GetNodes(); - auto isFusingSupported = [&](MKLDNNNodePtr node) { - if (!node->getCnnLayer()) + auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) { + for (auto a : algs) { + if (alg == a) { + return true; + } + } + return false; + }; + + auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) { + if (!activation->getCnnLayer()) return false; - auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get()); + auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get()); return activationNode && - (activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_relu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_elu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_logistic || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_bounded_relu || - activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_clamp); + (activationNode->getAlgorithm() == eltwise_relu || + (conv->getCnnLayer()->precision == Precision::FP32 && + isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp}))); }; for (auto &graphNode : graphNodes) { @@ -411,6 +428,10 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG auto mergedConv = (parent1->getType() == Convolution) ? parent1 : parent2; auto peerNode = (parent1->getType() == Convolution) ? parent2 : parent1; + if (peerNode->getType() == Convolution && mergedConv->getChildEdges().size() != 1) { + mergedConv = parent2; + peerNode = parent1; + } auto sum = graphNode; auto lastNode = sum; @@ -431,7 +452,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG if (!fuse_allowed) continue; if (graphNode->getChildEdges().size() == 1 && - isFusingSupported(graphNode->getChildEdgeAt(0)->getChild())) { + isFusingSupported(graphNode, graphNode->getChildEdgeAt(0)->getChild())) { auto relu_shared = graphNode->getChildEdgeAt(0)->getChild(); lastNode = relu_shared; mergedConv->setType(Convolution_Sum_Activation); @@ -472,29 +493,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG } } -/** - * Convert LSTM layer format with combined state blob - */ -void MKLDNNGraphOptimizer::SLTMTransform(MKLDNNGraph& graph) { - auto &all_nodes = graph.GetNodes(); - - for (auto &lstm : all_nodes) { - if (lstm->getType() != RNN) - continue; - - auto layer = lstm->getCnnLayer(); - auto in_datas = layer->insData; - auto out_datas = layer->outData; - - if (in_datas.size() == 3) { - assert(lstm->getParentEdges().size() == 3); - // Concatenate 2 states into one blob - // TODO: TBD - } else if ((in_datas.size() != 1)) { - THROW_IE_EXCEPTION << "Unsupported mode for LSTM cell. 
Expected two state blobs"; - } - } -} void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) { for (MKLDNNNodePtr& node : graph.GetNodes()) { @@ -520,8 +518,11 @@ void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) { void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) { std::set<MKLDNNNodePtr> processed; + std::vector<MKLDNNNodePtr> newNodes; for (MKLDNNNodePtr& node : graph.GetNodes()) { - if (processed.find(node) == processed.end() && node->getType() == Reorder && node->getChildEdgeAt(0)->getChild()->getType() == Reorder) { + if (processed.find(node) == processed.end() && node->getType() == Reorder + && node->getChildEdges().size() == 1 + && node->getChildEdgeAt(0)->getChild()->getType() == Reorder ) { auto nextNode = node->getChildEdgeAt(0)->getChild(); MKLDNNReorderNode* n = dynamic_cast<MKLDNNReorderNode*>(node.get()); MKLDNNReorderNode* nn = dynamic_cast<MKLDNNReorderNode*>(nextNode.get()); @@ -590,10 +591,13 @@ void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) { afterNode->getDesc(); graph.GetEdges().push_back(afterNode); - graph.GetNodes().push_back(newReorder); + newNodes.push_back(newReorder); graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), edge), graph.GetEdges().end()); } } + for (MKLDNNNodePtr& node : newNodes) { + graph.GetNodes().push_back(node); + } } void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) { @@ -603,7 +607,7 @@ void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) { auto cur = l->insData[0].lock(); if (cur == nullptr) { - THROW_IE_EXCEPTION << "[MKLDNN] shared_ptr l->insData[0].lock() returned nullptr"; + THROW_IE_EXCEPTION << "[MKLDNN] error - invalid input data"; } if (cur->precision != l->outData[0]->precision) { if (node->name.find("_iScaleShift_") != std::string::npos) { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h index d6fa323da..6818cc9ae 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp index 338ed7274..95e803925 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp @@ -1,10 +1,10 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // #include "mkldnn_infer_request.h" #include "mkldnn_extension_utils.h" +#include "mkldnn_streams.h" #include <vector> #include <string> #include <map> @@ -36,83 +36,97 @@ void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() { if (!graph || !graph->IsReady()) { THROW_IE_EXCEPTION << "Network not loaded."; } - - // execute input pre-processing. 
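A note on the DropDoubleReorders change above: the replacement reorder nodes are now collected in a separate newNodes vector and appended to graph.GetNodes() only after the traversal finishes, instead of growing the container that is being iterated. The hazard this avoids, in miniature; the values here are placeholders:

#include <iostream>
#include <vector>

int main() {
    std::vector<int> nodes = {1, 2, 3};
    std::vector<int> newNodes;

    // Pushing into 'nodes' inside this loop could reallocate its storage and
    // invalidate the reference 'n'; collecting into a side vector defers the
    // mutation until iteration is complete.
    for (int &n : nodes) {
        if (n % 2 == 1)           // stand-in for "this node needs a replacement"
            newNodes.push_back(n * 10);
    }
    for (int n : newNodes)        // safe: the first loop has finished
        nodes.push_back(n);

    for (int n : nodes) std::cout << n << " ";  // prints: 1 2 3 10 30
    std::cout << std::endl;
}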
- execDataPreprocessing(_inputs); - - changeDefaultPtr(); - // need to retain converted blobs until infer finish - std::vector<InferenceEngine::Blob::Ptr> convertedInputs; - for (auto input : _inputs) { - if (!_networkInputs[input.first]) { - THROW_IE_EXCEPTION << - "input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " - << input.first; - } - /*if (_networkInputs[input.first]->getInputPrecision() != input.second->precision()) { - THROW_IE_EXCEPTION << "Different input precision for input " << input.first - << " registered in IInferencePlugin::LoadNetwork network and IInferencePlugin::Infer. " - << _networkInputs[input.first]->getInputPrecision() << " vs " - << input.second->precision(); - }*/ + auto infer = [this] { + // execute input pre-processing. + execDataPreprocessing(_inputs); + + changeDefaultPtr(); + // need to retain converted blobs until infer finish + std::vector<InferenceEngine::Blob::Ptr> convertedInputs; + for (auto input : _inputs) { + if (!_networkInputs[input.first]) { + THROW_IE_EXCEPTION << + "input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " + << input.first; + } + /*if (_networkInputs[input.first]->getInputPrecision() != input.second->precision()) { + THROW_IE_EXCEPTION << "Different input precision for input " << input.first + << " registered in IInferencePlugin::LoadNetwork network and IInferencePlugin::Infer. " + << _networkInputs[input.first]->getInputPrecision() << " vs " + << input.second->precision(); + }*/ - InferenceEngine::Blob::Ptr iconv; - InferenceEngine::TBlob<float> *in_f = nullptr; - switch (input.second->precision()) { - case InferenceEngine::Precision::FP32: - pushInput<float>(input.first, input.second); - break; - case InferenceEngine::Precision::U16: - // U16 is unsupported by mkldnn, so here we convert the blob and send FP32 - iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( - InferenceEngine::Precision::FP32, - input.second->getTensorDesc().getLayout(), input.second->dims()); - convertedInputs.push_back(iconv); - iconv->allocate(); - in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); - InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get()); - pushInput<float>(input.first, iconv); - break; - case InferenceEngine::Precision::I16: - if (graph->hasMeanImageFor(input.first)) { - // If a mean image exists, we convert the blob and send FP32 - iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( - InferenceEngine::Precision::FP32, - input.second->getTensorDesc().getLayout(), input.second->dims()); - convertedInputs.push_back(iconv); - iconv->allocate(); - in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); - InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get()); - pushInput<float>(input.first, iconv); - } else { - // Instead we can send I16 directly - pushInput<int16_t>(input.first, input.second); - } - break; - case InferenceEngine::Precision::U8: - if (graph->hasMeanImageFor(input.first)) { - // If a mean image exists, we convert the blob and send FP32 + InferenceEngine::Blob::Ptr iconv; + InferenceEngine::TBlob<float> *in_f = nullptr; + switch (input.second->precision()) { + case InferenceEngine::Precision::FP32: + pushInput<float>(input.first, input.second); + break; + case InferenceEngine::Precision::I32: + pushInput<int32_t>(input.first, input.second); + break; + case InferenceEngine::Precision::I8: + pushInput<int8_t>(input.first, 
input.second); + break; + case InferenceEngine::Precision::U16: + // U16 is unsupported by mkldnn, so here we convert the blob and send FP32 iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( InferenceEngine::Precision::FP32, input.second->getTensorDesc().getLayout(), input.second->dims()); convertedInputs.push_back(iconv); iconv->allocate(); in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); - InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get()); + InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get()); pushInput<float>(input.first, iconv); - } else { - // Instead we can send I8 directly - pushInput<uint8_t>(input.first, input.second); - } - break; - default: - THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision(); + break; + case InferenceEngine::Precision::I16: + if (graph->hasMeanImageFor(input.first)) { + // If a mean image exists, we convert the blob and send FP32 + iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( + InferenceEngine::Precision::FP32, + input.second->getTensorDesc().getLayout(), input.second->dims()); + convertedInputs.push_back(iconv); + iconv->allocate(); + in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); + InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get()); + pushInput<float>(input.first, iconv); + } else { + // Instead we can send I16 directly + pushInput<int16_t>(input.first, input.second); + } + break; + case InferenceEngine::Precision::U8: + if (graph->hasMeanImageFor(input.first)) { + // If a mean image exists, we convert the blob and send FP32 + iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( + InferenceEngine::Precision::FP32, + input.second->getTensorDesc().getLayout(), input.second->dims()); + convertedInputs.push_back(iconv); + iconv->allocate(); + in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); + InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get()); + pushInput<float>(input.first, iconv); + } else { + // Instead we can send I8 directly + pushInput<uint8_t>(input.first, input.second); + } + break; + default: + THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision(); + } } - } - graph->Infer(m_curBatch); - graph->PullOutputData(_outputs); + graph->Infer(m_curBatch); + graph->PullOutputData(_outputs); + }; +#if IE_THREAD == IE_THREAD_TBB + auto_scope_observing observer(graph->ptrObserver); + // a TBB arena is made "this" for Infer call via executing lambda for the arena + graph->ptrArena->execute([&] { infer(); }); +#else + infer(); +#endif } void MKLDNNPlugin::MKLDNNInferRequest::GetPerformanceCounts( diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h index 313c4e045..6d88bc8d2 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp index ebbd86422..1821b88f8 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -169,8 
+168,22 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) { case f::OhIw16o4i: case f::OIhw4i16o4i: ndims = 4; break; - case f::goihw: + // DHW + case f::ncdhw: + case f::ndhwc: + case f::nCdhw8c: + case f::nCdhw16c: + case f::oidhw: + case f::OIdhw8i8o: + case f::OIdhw16i16o: + case f::OIdhw8o8i: + case f::OIdhw16o16i: + case f::OIdhw8i16o2i: + case f::Odhwi8o: + case f::Odhwi16o: + // Group HW case f::hwigo: + case f::goihw: case f::gOIhw8i8o: case f::gOIhw16i16o: case f::gOIhw8i16o2i: @@ -183,6 +196,15 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) { case f::Goihw8g: case f::Goihw16g: ndims = 5; break; + case f::goidhw: + case f::gOIdhw8i8o: + case f::gOIdhw16i16o: + case f::gOIdhw8i16o2i: + case f::gOdhwi8o: + case f::gOdhwi16o: + case f::gOIdhw8o8i: + case f::gOIdhw16o16i: + ndims = 6; break; case f::format_undef: ndims = 0; break; case f::any: @@ -197,8 +219,8 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) { } bool MKLDNNMemory::IsPlainFormat(memory::format format) { - std::vector<memory::format> plains = {memory::nc, memory::nchw, memory::nhwc, memory::chwn, - memory::oi, memory::io, memory::oihw, memory::ihwo, + std::vector<memory::format> plains = {memory::nc, memory::nchw, memory::ncdhw, memory::nhwc, memory::ndhwc, memory::chwn, + memory::oi, memory::io, memory::oihw, memory::oidhw, memory::ihwo, memory::goihw, memory::blocked}; @@ -217,13 +239,28 @@ memory::format MKLDNNMemory::GetPlainFormat(memory::dims dims) { return memory::x; case 2: return memory::nc; + case 3: + return memory::tnc; case 4: return memory::nchw; + case 5: + return memory::ncdhw; default: return memory::blocked; } } +InferenceEngine::Layout MKLDNNMemory::GetPlainLayout(memory::dims dims) { + switch (dims.size()) { + case 1: return Layout::C; + case 2: return Layout::NC; + case 3: return Layout::CHW; + case 4: return Layout::NCHW; + default: + return Layout::BLOCKED; + } +} + void MKLDNNMemory::CreateBlockingDesc(memory::desc &desc) { auto dims = desc.data.dims; int ndims = desc.data.ndims; @@ -262,6 +299,10 @@ memory::format MKLDNNMemory::Convert(const InferenceEngine::Layout layout) { return memory::nchw; case NHWC: return memory::nhwc; + case NCDHW: + return memory::ncdhw; + case NDHWC: + return memory::ndhwc; case CHW: return memory::tnc; case NC: @@ -294,6 +335,11 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) { case memory::nChw8c: return "nChw8c"; case memory::nChw16c: return "nChw16c"; + case memory::ncdhw: return "ncdhw"; + case memory::ndhwc: return "ndhwc"; + case memory::nCdhw8c: return "nCdhw8c"; + case memory::nCdhw16c: return "nCdhw16c"; + case memory::oihw: return "oihw"; case memory::ihwo: return "ihwo"; case memory::OIhw8i8o: return "OIhw8i8o"; @@ -306,8 +352,18 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) { case memory::Ohwi16o: return "Ohwi16o"; case memory::OhIw16o4i: return "OhIw16o4i"; + case memory::oidhw: return "oidhw"; + case memory::OIdhw8i8o: return "OIdhw8i8o"; + case memory::OIdhw16i16o: return "OIdhw16i16o"; + case memory::OIdhw8o8i: return "OIdhw8o8i"; + case memory::OIdhw16o16i: return "OIdhw16o16i"; + case memory::OIdhw8i16o2i: return "OIdhw8i16o2i"; + case memory::Odhwi8o: return "Odhwi8o"; + case memory::Odhwi16o: return "Odhwi16o"; + case memory::goihw: return "goihw"; case memory::hwigo: return "hwigo"; + case memory::hwio: return "hwio"; case memory::gOIhw8i8o: return "gOIhw8i8o"; case memory::gOIhw16i16o: return "gOIhw16i16o"; case 
memory::gOIhw8i16o2i: return "gOIhw8i16o2i"; @@ -317,6 +373,16 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) { case memory::gOIhw8o8i: return "gOIhw8o8i"; case memory::gOIhw16o16i: return "gOIhw16o16i"; case memory::gOhIw16o4i: return "gOhIw16o4i"; + + case memory::goidhw: return "goidhw"; + case memory::gOIdhw8i8o: return "gOIdhw8i8o"; + case memory::gOIdhw16i16o: return "gOIdhw16i16o"; + case memory::gOIdhw8i16o2i: return "gOIdhw8i16o2i"; + case memory::gOdhwi8o: return "gOdhwi8o"; + case memory::gOdhwi16o: return "gOdhwi16o"; + case memory::gOIdhw8o8i: return "gOIdhw8o8i"; + case memory::gOIdhw16o16i: return "gOIdhw16o16i"; + default: { THROW_IE_EXCEPTION << "Unknown data format."; } @@ -400,66 +466,96 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const { auto blkInfo = desc.data.layout_desc.blocking; auto offset = static_cast<size_t>(blkInfo.offset_padding); SizeVector offsetsForDims; + SizeVector dims = getDims().ToSizeVector(); switch (getFormat()) { case memory::format_undef: THROW_IE_EXCEPTION << "Cannot cast to tensor desc. Format is undefined!"; case memory::any: layout = Layout::ANY; - return TensorDesc(precision, getDims().ToSizeVector(), layout); + return TensorDesc(precision, dims, layout); case memory::x: layout = Layout::C; order = {0}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; break; case memory::oi: case memory::nc: layout = Layout::NC; order = {0, 1}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; break; case memory::tnc: layout = Layout::CHW; order = {0, 1, 2}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; break; case memory::ntc: layout = Layout::CHW; order = {1, 0, 2}; - blkDims = {static_cast<size_t>(getDims()[1]), - static_cast<size_t>(getDims()[0]), - static_cast<size_t>(getDims()[2])}; + blkDims = {static_cast<size_t>(dims[1]), + static_cast<size_t>(dims[0]), + static_cast<size_t>(dims[2])}; break; case memory::oihw: case memory::nchw: layout = Layout::NCHW; order = {0, 1, 2, 3}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; + break; + case memory::ncdhw: + layout = Layout::NCDHW; + order = {0, 1, 2, 3, 4}; + blkDims = dims; break; case memory::nhwc: layout = Layout::NHWC; order = {0, 2, 3, 1}; - blkDims = {static_cast<size_t>(getDims()[0]), - static_cast<size_t>(getDims()[2]), - static_cast<size_t>(getDims()[3]), - static_cast<size_t>(getDims()[1])}; + blkDims = {static_cast<size_t>(dims[0]), + static_cast<size_t>(dims[2]), + static_cast<size_t>(dims[3]), + static_cast<size_t>(dims[1])}; break; + case memory::ndhwc: + layout = Layout::NDHWC; + order = {0, 2, 3, 4, 1}; + blkDims = {static_cast<size_t>(dims[0]), + static_cast<size_t>(dims[2]), + static_cast<size_t>(dims[3]), + static_cast<size_t>(dims[4]), + static_cast<size_t>(dims[1])}; + break; + case memory::oIhw8i: case memory::nChw8c: order = {0, 1, 2, 3, 1}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; + blkDims[1] = blkDims[1] / 8 + (blkDims[1] % 8 ? 1 : 0); + blkDims.push_back(8); + layout = Layout::BLOCKED; + break; + case memory::nCdhw8c: + order = {0, 1, 2, 3, 4, 1}; + blkDims = dims; blkDims[1] = blkDims[1] / 8 + (blkDims[1] % 8 ? 1 : 0); blkDims.push_back(8); layout = Layout::BLOCKED; break; case memory::nChw16c: order = {0, 1, 2, 3, 1}; - blkDims = getDims().ToSizeVector(); + blkDims = dims; + blkDims[1] = blkDims[1] / 16 + (blkDims[1] % 16 ? 
1 : 0); + blkDims.push_back(16); + layout = Layout::BLOCKED; + break; + case memory::nCdhw16c: + order = {0, 1, 2, 3, 4, 1}; + blkDims = dims; blkDims[1] = blkDims[1] / 16 + (blkDims[1] % 16 ? 1 : 0); blkDims.push_back(16); layout = Layout::BLOCKED; break; case memory::blocked: order.clear(); - blkDims = getDims().ToSizeVector(); + blkDims = dims; for (size_t i = 0; i < blkDims.size(); i++) { order.push_back(i); if ((i && blkInfo.strides[0][i - 1] < blkInfo.strides[0][i]) || blkInfo.block_dims[i] != 1) { @@ -478,14 +574,14 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const { SizeVector strides(blkDims.size()); - if (layout == Layout::NHWC || layout == Layout::CHW) { + if (layout == Layout::NHWC || layout == Layout::NDHWC || layout == Layout::CHW) { for (size_t i = 0; i < order.size(); i++) { strides[i] = static_cast<size_t>(blkInfo.strides[0][order[i]]); } } else { strides[blkDims.size() - 1] = 1; for (size_t i = 2; i <= order.size(); i++) { - if (blkDims.size() - i < getDims().ndims()) { + if (blkDims.size() - i < dims.size()) { strides[blkDims.size() - i] = static_cast<size_t>(blkInfo.strides[0][order[blkDims.size() - i]]); } else { strides[blkDims.size() - i] = strides[blkDims.size() - i + 1] * blkDims[blkDims.size() - i + 1]; @@ -494,13 +590,13 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const { } for (size_t i = 0; i < blkDims.size() && i < TENSOR_MAX_DIMS; i++) { - if (i < getDims().ndims()) + if (i < dims.size()) offsetsForDims.push_back(blkInfo.offset_padding_to_data[i]); else offsetsForDims.push_back(0); } - TensorDesc tensorDesc(precision, getDims().ToSizeVector(), {blkDims, order, offset, offsetsForDims, strides}); + TensorDesc tensorDesc(precision, dims, {blkDims, order, offset, offsetsForDims, strides}); tensorDesc.setLayout(layout); return tensorDesc; @@ -543,9 +639,15 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc): case NCHW: mkldnnFormat = memory::format::nchw; break; + case NCDHW: + mkldnnFormat = memory::format::ncdhw; + break; case NHWC: mkldnnFormat = memory::format::nhwc; break; + case NDHWC: + mkldnnFormat = memory::format::ndhwc; + break; case OIHW: mkldnnFormat = memory::format::oihw; break; @@ -553,6 +655,11 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc): mkldnnFormat = memory::format::x; break; case CHW: + if (order == SizeVector{0, 1, 2}) + mkldnnFormat = memory::format::tnc; + else if (order == SizeVector{1, 0, 2}) + mkldnnFormat = memory::format::ntc; + else mkldnnFormat = memory::format::blocked; break; case HW: @@ -560,32 +667,41 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc): mkldnnFormat = memory::format::nc; break; case BLOCKED: + mkldnnFormat = memory::format::blocked; if (realDims.ndims() == 1) { mkldnnFormat = memory::format::x; - break; } else if (realDims.ndims() == 2) { mkldnnFormat = memory::format::nc; - break; } else if (realDims.ndims() == 4) { if (order.size() == 5 && order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 1) { if (blkdDims[4] == 8) { mkldnnFormat = memory::format::nChw8c; - break; } else if (blkdDims[4] == 16) { mkldnnFormat = memory::format::nChw16c; - break; } } else if (order.size() == 4) { if (order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3) { mkldnnFormat = memory::format::nchw; - break; } else if (order[0] == 0 && order[1] == 2 && order[2] == 3 && order[3] == 1) { mkldnnFormat = memory::format::nhwc; - break; + } + } + } else if (realDims.ndims() == 5) { + if (order.size() == 6 && + order[0] == 0 && 
order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 4 && order[5] == 1) { + if (blkdDims[5] == 8) { + mkldnnFormat = memory::format::nCdhw8c; + } else if (blkdDims[5] == 16) { + mkldnnFormat = memory::format::nCdhw16c; + } + } else if (order.size() == 5) { + if (order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 4) { + mkldnnFormat = memory::format::ncdhw; + } else if (order[0] == 0 && order[1] == 2 && order[2] == 3 && order[3] == 4 && order[4] == 1) { + mkldnnFormat = memory::format::ndhwc; } } } - mkldnnFormat = memory::format::blocked; break; case CN: mkldnnFormat = memory::format::blocked; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_memory.h b/inference-engine/src/mkldnn_plugin/mkldnn_memory.h index a5329ee5f..37578e5ff 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_memory.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_memory.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -108,6 +107,7 @@ public: static bool IsPlainFormat(mkldnn::memory::format format); static mkldnn::memory::format GetPlainFormat(mkldnn::memory::dims dims); + static InferenceEngine::Layout GetPlainLayout(mkldnn::memory::dims dims); static bool isConsistant(mkldnn::memory::dims dims, mkldnn::memory::format format); static mkldnn::memory::format Convert(const InferenceEngine::Layout layout); diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index 7bda59d0d..73975b71e 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +9,8 @@ #include <vector> #include <string> #include <limits> +#include <cstdint> +#include <unordered_map> #include <nodes/mkldnn_batchnorm_node.h> #include <nodes/mkldnn_concat_node.h> @@ -17,6 +18,7 @@ #include <nodes/mkldnn_crop_node.h> #include <nodes/mkldnn_deconv_node.h> #include <nodes/mkldnn_eltwise_node.h> +#include <nodes/mkldnn_gemm_node.h> #include <nodes/mkldnn_fullyconnected_node.h> #include <nodes/mkldnn_generic_node.h> #include <nodes/mkldnn_input_node.h> @@ -35,8 +37,9 @@ #include <nodes/mkldnn_memory_node.hpp> #include <nodes/mkldnn_rnn.h> #include <mkldnn_types.h> - #include "mkldnn_extension_utils.h" +#include "mkldnn_plugin.h" +#include "ie_memcpy.h" using namespace mkldnn; using namespace MKLDNNPlugin; @@ -52,6 +55,7 @@ MKLDNNNode::Register<MKLDNNConvolutionNode> MKLDNNConvolutionNode::reg; MKLDNNNode::Register<MKLDNNCropNode> MKLDNNCropNode::reg; MKLDNNNode::Register<MKLDNNDeconvolutionNode> MKLDNNDeconvolutionNode::reg; MKLDNNNode::Register<MKLDNNEltwiseNode> MKLDNNEltwiseNode::reg; +MKLDNNNode::Register<MKLDNNGemmNode> MKLDNNGemmNode::reg; MKLDNNNode::Register<MKLDNNFullyConnectedNode> MKLDNNFullyConnectedNode::reg; MKLDNNNode::Register<MKLDNNInputNode> MKLDNNInputNode::reg; MKLDNNNode::Register<MKLDNNLrnNode> MKLDNNLrnNode::reg; @@ -358,6 +362,8 @@ std::vector<memory::format> MKLDNNNode::getAvailableFormatsForDims(const MKLDNND return {memory::format::nc}; else if (dims.ndims() == 4) return {memory::format::nchw, memory::format::nChw8c, memory::format::nChw16c}; + else if (dims.ndims() == 5) + return {memory::format::ncdhw, memory::format::nCdhw8c, memory::format::nCdhw16c}; return {memory::format::any}; } @@ -506,7 +512,7 @@ InferenceEngine::Blob::Ptr MKLDNNNode::createInternalBlob(InferenceEngine::SizeV size_t 
offset = blb->byteSize(); checkSize(intBuffSize, offset); - memcpy(data, blb->buffer(), blb->byteSize()); + ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), blb->byteSize()); data += blb->byteSize(); for (const auto &merged : getMergeWith()) { wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(merged->getCnnLayer().get()); @@ -519,7 +525,7 @@ InferenceEngine::Blob::Ptr MKLDNNNode::createInternalBlob(InferenceEngine::SizeV THROW_IE_EXCEPTION << "Cannot get internal blob layer for node " << getName() << "."; offset += blb->byteSize(); checkSize(intBuffSize, offset); - memcpy(data, blb->buffer(), blb->byteSize()); + ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), blb->byteSize()); data += blb->byteSize(); } @@ -545,13 +551,32 @@ void MKLDNNNode::prepareMemory(const PrimitiveDescInfo *selected_pd, mkldnn::pri internalBlobMemory.clear(); for (size_t i = 0; i < internalBlobs.size(); i++) { - auto& internalBlob = internalBlobs[i]; - internalBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(engine))); - - internalBlobMemory[i]->Create(intDescs[i]); - MKLDNNMemory memory(engine); - memory.Create(MKLDNNMemoryDesc(internalBlob->getTensorDesc()), internalBlob->buffer()); - internalBlobMemory[i]->SetData(memory); + const auto &internalBlob = internalBlobs[i]; + + const uint64_t data_hash = Engine::GetWeightsSharing().GetHashFunc().hash(internalBlob->buffer(), internalBlob->byteSize()); + const std::string string_hash = name + "_" + std::to_string(i) + + "_" + std::to_string(internalBlob->byteSize()) + + "_" + std::to_string(data_hash); + MKLDNNMemoryPtr ptr = + Engine::GetWeightsSharing().findOrCreate(string_hash, [&] () { + MKLDNNMemoryPtr _ptr = MKLDNNMemoryPtr(new MKLDNNMemory(engine)); + _ptr->Create(intDescs[i]); + MKLDNNMemory memory(engine); + + auto newDesc = MKLDNNMemoryDesc(internalBlob->getTensorDesc()); + auto newFormat = newDesc.getFormat(); + if (newFormat == mkldnn::memory::ncdhw) { + newFormat = mkldnn::memory::goihw; + } + if (newFormat == mkldnn::memory::nchw) { + newFormat = mkldnn::memory::oihw; + } + memory.Create(MKLDNNMemoryDesc(newDesc.getDims(), newDesc.getDataType(), newFormat), internalBlob->buffer()); + auto aformat = memory.GetFormat(); + _ptr->SetData(memory); + return _ptr; + }); + internalBlobMemory.push_back(ptr); } } @@ -648,6 +673,8 @@ std::string MKLDNNNode::typeToStr(Type type) { return "Pooling"; case FullyConnected: return "FullyConnected"; + case Gemm: + return "Gemm"; case SoftMax: return "SoftMax"; case Split: @@ -682,6 +709,9 @@ std::string MKLDNNNode::typeToStr(Type type) { return "MemoryInput"; case RNN: return "RNN"; + case LSTMCell: + return "LSTMCell"; + default: return "Unknown"; } @@ -838,17 +868,18 @@ InferenceEngine::TensorDesc MKLDNNNode::getConfiguredOutputDesc(const InferenceE void MKLDNNNode::initOptimalPrimitiveDescriptor() { auto config = getSelectedPrimitiveDescriptor()->getConfig(); - if (isInitConfig(config)) - return; - - for (size_t i = 0; i < config.inConfs.size(); i++) { - config.inConfs[i].desc = getConfiguredInputDesc(config, i); - } + if (!isInitConfig(config)) { + for (size_t i = 0; i < config.inConfs.size(); i++) { + config.inConfs[i].desc = getConfiguredInputDesc(config, i); + } - for (size_t i = 0; i < config.outConfs.size(); i++) { - config.outConfs[i].desc = getConfiguredOutputDesc(config, i); + for (size_t i = 0; i < config.outConfs.size(); i++) { + config.outConfs[i].desc = getConfiguredOutputDesc(config, i); + } + initDescriptor(config); + } else if (getType() != RNN && getType() != LSTMCell) { + 
initDescriptor(config); } - initDescriptor(config); } bool MKLDNNNode::isInitConfig(const InferenceEngine::LayerConfig& config) const { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index acfe8e167..fe71c665f 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -49,6 +48,7 @@ enum Type { Concatenation, Power, Eltwise, + Gemm, Crop, Reshape, Tile, @@ -60,6 +60,7 @@ enum Type { Copy, MemoryOutput, MemoryInput, + LSTMCell, RNN }; @@ -86,6 +87,7 @@ static Type TypeFromName(const std::string type) { { "Pooling", Pooling }, { "FullyConnected", FullyConnected }, { "InnerProduct", FullyConnected }, + { "Gemm", Gemm }, { "Softmax", SoftMax }, { "SoftMax", SoftMax }, { "Split", Split }, @@ -103,6 +105,7 @@ static Type TypeFromName(const std::string type) { { "Flatten", Flatten }, { "Permute", Permute }, { "Copy", Copy }, + { "LSTMCell", LSTMCell }, { "RNN", RNN }, { "MemoryInput", MemoryInput}, // for construction from name ctor, arbitrary name is used { "Memory", MemoryOutput }, // for construction from layer ctor @@ -191,6 +194,10 @@ public: return mergedWith; } + const std::vector <MKLDNNNodePtr> &getFusedWith() { + return fusedWith; + } + const std::string getName() const { return name; } @@ -317,7 +324,7 @@ protected: this->type = type; } - int getMaxBatch(); + virtual int getMaxBatch(); virtual InferenceEngine::TensorDesc getConfiguredInputDesc(const InferenceEngine::LayerConfig& config, size_t idx) const; virtual InferenceEngine::TensorDesc getConfiguredOutputDesc(const InferenceEngine::LayerConfig& config, size_t idx) const; @@ -350,6 +357,8 @@ protected: MKLDNNPrimitive prim; std::vector<MKLDNNDescriptor> descs; + InferenceEngine::Blob::Ptr ext_scales; + friend class MKLDNNEdge; friend class MKLDNNGraph; friend class MKLDNNGraphOptimizer; @@ -371,8 +380,9 @@ protected: public: Register() { Registry::RegisterNode( - Registry::CreatorByLayerFunction([](const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) -> MKLDNNNode * { - return new To(layer, eng); } ) ); + Registry::CreatorByLayerFunction( + [](const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) + -> MKLDNNNode* { return new To(layer, eng); } ) ); } }; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp index 3b51c974f..35a965afa 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +10,9 @@ using namespace MKLDNNPlugin; using namespace InferenceEngine; +MKLDNNWeightsSharing Engine::weightsSharing; +const SimpleDataHash MKLDNNWeightsSharing::simpleCRC; + InferenceEngine::ExecutableNetworkInternal::Ptr Engine::LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, const std::map<std::string, std::string> &config) { auto specifiedDevice = network.getTargetDevice(); @@ -25,8 +27,12 @@ Engine::LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, const std::map network.getInputsInfo(_networkInputs); for (auto ii : _networkInputs) { auto input_precision = ii.second->getInputPrecision(); - if (input_precision != InferenceEngine::Precision::U16 && input_precision != InferenceEngine::Precision::I16 - && input_precision != 
InferenceEngine::Precision::FP32 && input_precision != InferenceEngine::Precision::U8) { + if (input_precision != InferenceEngine::Precision::FP32 && + input_precision != InferenceEngine::Precision::I32 && + input_precision != InferenceEngine::Precision::U16 && + input_precision != InferenceEngine::Precision::I16 && + input_precision != InferenceEngine::Precision::I8 && + input_precision != InferenceEngine::Precision::U8) { THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str << "Input image format " << input_precision << " is not supported yet..."; } @@ -86,7 +92,7 @@ void Engine::QueryNetwork(const ICNNNetwork& network, const std::map<std::string INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(IInferencePlugin*& plugin, ResponseDesc *resp) noexcept { try { plugin = make_ie_compatible_plugin( - {{1, 4}, + {{1, 5}, #ifdef MKL_VERSION MKL_VERSION, #else diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h index 482405a16..383feaa21 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -8,11 +7,59 @@ #include "mkldnn_graph.h" #include <string> #include <map> +#include <unordered_map> #include <memory> +#include <functional> #include <cpp_interfaces/impl/ie_plugin_internal.hpp> namespace MKLDNNPlugin { +class SimpleDataHash { +public: + SimpleDataHash() { + for (int i = 0; i < kTableSize; i++) { + uint64_t c = i; + for (int j = 0; j < 8; j++) + c = ((c & 1) ? 0xc96c5795d7870f42 : 0) ^ (c >> 1); + table[i] = c; + } + } + // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182 + uint64_t hash(const unsigned char* data, size_t size) const { + uint64_t crc = 0; + for (size_t idx = 0; idx < size; idx++) + crc = table[(unsigned char)crc ^ data[idx]] ^ (crc >> 8); + + return ~crc; + } + +protected: + static const int kTableSize = 256; + uint64_t table[kTableSize]; +}; + +class MKLDNNWeightsSharing { +public: + MKLDNNMemoryPtr findOrCreate(const std::string& name_hash, + std::function<MKLDNNMemoryPtr(void)> create) { + std::unique_lock<std::mutex> lock(guard); + auto found = sharedWeights.find(name_hash); + + MKLDNNMemoryPtr ptr; + if (found == sharedWeights.end() || !(ptr = found->second.lock())) { + ptr = create(); + sharedWeights[name_hash] = ptr; + } + return ptr; + } + static const SimpleDataHash& GetHashFunc () { return simpleCRC; } + +protected: + std::unordered_map<std::string, std::weak_ptr<MKLDNNMemory>> sharedWeights; + std::mutex guard; + static const SimpleDataHash simpleCRC; +}; + class Engine : public InferenceEngine::InferencePluginInternal { public: Engine() = default; @@ -30,16 +77,20 @@ public: void SetConfig(const std::map<std::string, std::string> &config) override; /** - * @depricated Use the version with config parameter + * @deprecated Use the version with config parameter */ void QueryNetwork(const InferenceEngine::ICNNNetwork& network, InferenceEngine::QueryNetworkResult& res) const override; void QueryNetwork(const InferenceEngine::ICNNNetwork& network, const std::map<std::string, std::string>& config, InferenceEngine::QueryNetworkResult& res) const override; + static MKLDNNWeightsSharing& GetWeightsSharing() { return weightsSharing; } private: Config engConfig; MKLDNNExtensionManager::Ptr extensionManager = std::make_shared<MKLDNNExtensionManager>(); + +protected: + static MKLDNNWeightsSharing weightsSharing; }; } // namespace 
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp index 6fa73c52f..f9e59f2cc 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h index 5bf983496..075afff9e 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_primitive.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp new file mode 100644 index 000000000..a5198377c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp @@ -0,0 +1,372 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <string> +#include <map> +#include <vector> +#include <limits> +#include <chrono> +#include <climits> +#include <memory> + +#include "mkldnn_graph.h" +#include "ie_parallel.hpp" +#include "mkldnn_streams.h" + +using namespace mkldnn; +using namespace MKLDNNPlugin; +using namespace InferenceEngine; +using namespace InferenceEngine::details; + +namespace MKLDNNPlugin { + +thread_local MultiWorkerTaskContext MultiWorkerTaskExecutor::ptrContext; + +bool check_env_variables() { +#if IE_THREAD == IE_THREAD_OMP + return MKLDNNPlugin::cpu::checkOpenMpEnvVars(false); +#else + return false; +#endif +} + +#if !(defined(__APPLE__) || defined(_WIN32)) +/* Get the cores affinity mask for the current process */ +bool get_process_mask(int& ncpus, cpu_set_t*& mask) { + for (ncpus = sizeof(cpu_set_t) / CHAR_BIT; ncpus < 1024 /* reasonable limit of #cores*/; ncpus <<= 1) { + mask = CPU_ALLOC(ncpus); + if (!mask) return false; + + const size_t size = CPU_ALLOC_SIZE(ncpus); + CPU_ZERO_S(size, mask); + const int err = sched_getaffinity(getpid(), size, mask); + // the result fits the mask + if (!err) break; + // mask size is not enough + CPU_FREE(mask); + mask = NULL; + // other error + if (errno != EINVAL) break; + } + if (!mask) { + return false; + } + return true; +} +/* Pin current thread to a set of cores determined by the mask. */ +bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask) { + return 0 == sched_setaffinity(0, ncores, proc_mask); +} +/* Pin thread to a spare core in the round-robin scheme, while respecting the given process mask.
+ * The function can also handle the hyper-threading (by populating the physical cores first) */ +bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask) { + const size_t size = CPU_ALLOC_SIZE(ncores); + const int num_cpus = CPU_COUNT_S(size, proc_mask); + thr_idx %= num_cpus; // To limit the unique number to the [0; num_cpus-1] range + + // Place threads with specified step + int cpu_idx = 0; + for (int i = 0, offset = 0; i < thr_idx; ++i) { + cpu_idx += hyperthreads; + if (cpu_idx >= num_cpus) + cpu_idx = ++offset; + } + + // Find index of 'cpu_idx'-th bit that equals to 1 + int mapped_idx = -1; + while (cpu_idx >= 0) { + if (CPU_ISSET_S(++mapped_idx, size, proc_mask)) + --cpu_idx; + } + + cpu_set_t *target_mask = CPU_ALLOC(ncores); + CPU_ZERO_S(size, target_mask); + CPU_SET_S(mapped_idx, size, target_mask); + bool res = pin_current_thread_by_mask(size, target_mask); + CPU_FREE(target_mask); + return res; +} +#else // no threads pinning/binding on Win/MacOS +bool get_process_mask(int& ncpus, cpu_set_t*& mask) { + ncpus = 0; + mask = nullptr; + return false; +} +bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask) { + return false; +} +bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask) { + return false; +} +#endif // !(defined(__APPLE__) || defined(_WIN32)) + +MultiWorkerTaskExecutor::MultiWorkerTaskExecutor(const std::vector<Task::Ptr>& init_tasks, std::string name) : + _isStopped(false), _name(name), _initCount(0) { + for (auto t : init_tasks) { + _threads.push_back(std::thread([&, t] { + // initialization (no contention, every worker thread is doing its own task) + t->runNoThrowNoBusyCheck(); + _initCount++; + + while (!_isStopped) { + bool isQueueEmpty; + Task::Ptr currentTask = nullptr; + { // waiting for the new task or for stop signal + std::unique_lock<std::mutex> lock(_queueMutex); + _queueCondVar.wait(lock, [&]() { return !_taskQueue.empty() || _isStopped; }); + isQueueEmpty = _taskQueue.empty(); + if (!isQueueEmpty) { + currentTask = _taskQueue.front(); + _taskQueue.pop(); + isQueueEmpty = _taskQueue.empty(); + } + } + if (currentTask) + currentTask->runNoThrowNoBusyCheck(); + if (_isStopped) + break; + if (isQueueEmpty) // notify dtor, that all tasks were completed + _queueCondVar.notify_all(); + } + })); + } + while (_initCount != init_tasks.size()) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } +} + +MultiWorkerTaskExecutor::~MultiWorkerTaskExecutor() { + { + std::unique_lock<std::mutex> lock(_queueMutex); + if (!_taskQueue.empty()) { + _queueCondVar.wait(lock, [this]() { return _taskQueue.empty(); }); + } + _isStopped = true; + _queueCondVar.notify_all(); + } + for (auto& thread : _threads) { + if (thread.joinable()) { + thread.join(); + } + } +} + +bool MultiWorkerTaskExecutor::startTask(Task::Ptr task) { + if (!task->occupy()) return false; + std::unique_lock<std::mutex> lock(_queueMutex); + _taskQueue.push(task); + _queueCondVar.notify_one(); + return true; +} + +MKLDNNPlugin::MKLDNNGraphlessInferRequest::MKLDNNGraphlessInferRequest(InferenceEngine::InputsDataMap networkInputs, + InferenceEngine::OutputsDataMap networkOutputs) + : InferRequestInternal(networkInputs, networkOutputs), m_curBatch(-1) { + // Allocate all input blobs + for (const auto& it : networkInputs) { + InferenceEngine::Blob::Ptr blob; + GetBlob(it.first.c_str(), blob); + } + // Allocate all output blobs + for (const auto& it : networkOutputs) { + InferenceEngine::Blob::Ptr blob; + GetBlob(it.first.c_str(), blob); + } +}
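[Editor's note] The pin_thread_to_vacant_core() logic above first strides across physical cores (step = hyperthreads) and only wraps onto the hyper-thread siblings once the physical cores are used up. A pure-arithmetic sketch of that index mapping (hypothetical helper, no affinity syscalls), with a worked example in the comments:

    // Map a worker index to a slot in [0; num_cpus-1], striding by `step` so
    // that physical cores are populated before their hyper-thread siblings.
    // Example: num_cpus = 8, step = 2 -> workers 0..7 land on 0,2,4,6,1,3,5,7.
    int vacant_slot(int thr_idx, int step, int num_cpus) {
        thr_idx %= num_cpus;          // keep the index in [0; num_cpus-1]
        int slot = 0;
        for (int i = 0, offset = 0; i < thr_idx; ++i) {
            slot += step;
            if (slot >= num_cpus)     // wrapped: start the next sibling lane
                slot = ++offset;
        }
        return slot;                  // the real code then translates this to
    }                                 // the slot-th set bit of the process mask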
+ + +void MKLDNNPlugin::MKLDNNGraphlessInferRequest::InferImpl() { + IE_PROFILING_AUTO_SCOPE(MKLDNN_INFER) + + auto infer = [this] { + IE_ASSERT(MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph != nullptr); + MKLDNNGraph::Ptr graph = MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph; + if (!graph->IsReady()) + THROW_IE_EXCEPTION << "Network not loaded."; + if (m_curBatch > 0 && !graph->getProperty().enableDynamicBatch) + THROW_IE_EXCEPTION << "Dynamic batch is not enabled."; + + if (m_curBatch > graph->getProperty().batchLimit) + THROW_IE_EXCEPTION << "Invalid dynamic batch size " << m_curBatch << + " for this request."; + + // execute input pre-processing. + execDataPreprocessing(_inputs); + + // need to retain converted blobs until infer finish + std::vector<InferenceEngine::Blob::Ptr> convertedInputs; + for (auto input : _inputs) { + if (!_networkInputs[input.first]) { + THROW_IE_EXCEPTION << + "input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " + << input.first; + } + InferenceEngine::Blob::Ptr iconv; + InferenceEngine::TBlob<float> *in_f = nullptr; + switch (input.second->precision()) { + case InferenceEngine::Precision::FP32: + graph->PushInputData(input.first, input.second); + break; + case InferenceEngine::Precision::U16: + // U16 is unsupported by mkldnn, so here we convert the blob and send FP32 + iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( + InferenceEngine::Precision::FP32, + input.second->getTensorDesc().getLayout(), input.second->dims()); + convertedInputs.push_back(iconv); + iconv->allocate(); + in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); + InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get()); + graph->PushInputData(input.first, iconv); + break; + case InferenceEngine::Precision::I16: + if (graph->hasMeanImageFor(input.first)) { + // If a mean image exists, we convert the blob and send FP32 + iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( + InferenceEngine::Precision::FP32, + input.second->getTensorDesc().getLayout(), input.second->dims()); + convertedInputs.push_back(iconv); + iconv->allocate(); + in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); + InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get()); + graph->PushInputData(input.first, iconv); + } else { + // Instead we can send I16 directly + graph->PushInputData(input.first, input.second); + } + break; + case InferenceEngine::Precision::U8: + if (graph->hasMeanImageFor(input.first)) { + // If a mean image exists, we convert the blob and send FP32 + iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>( + InferenceEngine::Precision::FP32, + input.second->getTensorDesc().getLayout(), input.second->dims()); + convertedInputs.push_back(iconv); + iconv->allocate(); + in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get()); + InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get()); + graph->PushInputData(input.first, iconv); + } else { + // Instead we can send U8 directly + graph->PushInputData(input.first, input.second); + } + break; + default: + THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision(); + } + } + graph->Infer(m_curBatch); + graph->PullOutputData(_outputs); + if (graph->getProperty().collectPerfCounters) { + m_perfMap.clear(); + graph->GetPerfData(m_perfMap); + } + };
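[Editor's note] The precision switch above either forwards a blob as-is or materializes a temporary FP32 copy (retained in convertedInputs until the inference finishes, since mkldnn reads the buffer during execution). A minimal sketch of that widen-to-float pattern under the same idea, using plain vectors instead of IE blobs (hypothetical helper):

    #include <cstdint>
    #include <vector>

    // U16 -> FP32 widening analogous to the copyToFloat<uint16_t> path above;
    // the returned buffer must outlive the inference call that consumes it.
    std::vector<float> widen_to_fp32(const std::vector<uint16_t>& src) {
        std::vector<float> dst(src.size());
        for (size_t i = 0; i < src.size(); ++i)
            dst[i] = static_cast<float>(src[i]);
        return dst;
    }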
+#if IE_THREAD == IE_THREAD_TBB + auto_scope_observing observer(MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph->ptrObserver); + // a TBB arena is made "this" for Infer call via executing lambda for the arena + MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph->ptrArena->execute([&] { infer(); }); +#else + infer(); +#endif +} + +void MKLDNNPlugin::MKLDNNGraphlessInferRequest::GetPerformanceCounts( + std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const { + perfMap = m_perfMap; +} + +void MKLDNNPlugin::MKLDNNGraphlessInferRequest::GetBlob(const char *name, InferenceEngine::Blob::Ptr &data) { + // ROI blob is returned only if it was set previously. + auto it = _preProcData.find(name); + if (it != _preProcData.end()) { + data = it->second.getRoiBlob(); + return; + } + + if (_inputs.find(name) != _inputs.end()) { + data = _inputs[name]; + checkBlob(data, name, true); + return; + } else if (_networkInputs.find(name) != _networkInputs.end()) { + InferenceEngine::Layout l = _networkInputs[name]->getLayout(); + InferenceEngine::Precision p = _networkInputs[name]->getPrecision(); + InferenceEngine::SizeVector dims = _networkInputs[name]->getTensorDesc().getDims(); + + InferenceEngine::TensorDesc desc = InferenceEngine::TensorDesc(p, dims, l); + _inputs[name] = data = make_blob_with_precision(desc); + _inputs[name]->allocate(); + checkBlob(data, name, true); + return; + } + + if (_outputs.find(name) != _outputs.end()) { + data = _outputs[name]; + checkBlob(data, name, false); + return; + } else if (_networkOutputs.find(name) != _networkOutputs.end()) { + InferenceEngine::Layout l = _networkOutputs[name]->getLayout(); + InferenceEngine::Precision p = _networkOutputs[name]->getPrecision(); + InferenceEngine::SizeVector dims = _networkOutputs[name]->getTensorDesc().getDims(); + + InferenceEngine::TensorDesc desc = InferenceEngine::TensorDesc(p, dims, l); + _outputs[name] = data = make_blob_with_precision(desc); + _outputs[name]->allocate(); + checkBlob(data, name, false); + return; + } + + THROW_IE_EXCEPTION << "Cannot find blob with name: " << name; +} + +void MKLDNNPlugin::MKLDNNGraphlessInferRequest::SetBlob(const char *name, const InferenceEngine::Blob::Ptr &data) { + if (!data) + THROW_IE_EXCEPTION << NOT_ALLOCATED_str << "Failed to set empty blob with name: \'" << name << "\'"; + if (data->buffer() == nullptr) + THROW_IE_EXCEPTION << "Input data was not allocated. Input name: \'" << name << "\'"; + if (name == nullptr) { + THROW_IE_EXCEPTION << NOT_FOUND_str + "Failed to set blob with empty name"; + } + InferenceEngine::InputInfo::Ptr foundInput; + InferenceEngine::DataPtr foundOutput; + size_t dataSize = data->size(); + if (findInputAndOutputBlobByName(name, foundInput, foundOutput)) { + if (foundInput->getInputPrecision() != data->precision()) { + THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Failed to set Blob with precision " + << data->precision(); + } + + if (foundInput->getPreProcess().getResizeAlgorithm() != InferenceEngine::ResizeAlgorithm::NO_RESIZE) { + // Stores the given blob as ROI blob. It will be used to fill in network input during pre-processing. 
+ _preProcData[name].setRoiBlob(data); + } else { + size_t inputSize = InferenceEngine::details::product(foundInput->getDims()); + if (dataSize != inputSize) { + THROW_IE_EXCEPTION << "Input blob size is not equal to the network input size (" + << dataSize << "!=" << inputSize << ")."; + } + _inputs[name] = data; + } + } else { + size_t outputSize = InferenceEngine::details::product(foundOutput->getDims()); + if (dataSize != outputSize) { + THROW_IE_EXCEPTION << "Output blob size is not equal to the network output size (" + << dataSize << "!=" << outputSize << ")."; + } + if (foundOutput->getPrecision() != data->precision()) { + THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str + << "Failed to set Blob with precision not corresponding to user output precision"; + } + _outputs[name] = data; + } +} + +void MKLDNNPlugin::MKLDNNGraphlessInferRequest::SetBatch(int new_batch) { + if (new_batch < 1) { + THROW_IE_EXCEPTION << "Invalid dynamic batch size " << new_batch << + " for this request."; + } + m_curBatch = new_batch; +} + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_streams.h b/inference-engine/src/mkldnn_plugin/mkldnn_streams.h new file mode 100644 index 000000000..31558fee2 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/mkldnn_streams.h @@ -0,0 +1,177 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include <string> +#include <vector> +#include <atomic> +#include <map> +#include <queue> +#include <memory> +#include <climits> +#include <cpp_interfaces/impl/ie_infer_request_internal.hpp> +#include <cpp_interfaces/ie_task_executor.hpp> +#include "ie_parallel.hpp" +#include "mkldnn/omp_manager.h" + +/* CPU "streams" implement a feature that allows multiple Infer Requests to be efficiently run simultaneously. + * To avoid potential oversubscription the CPU execution resources are divided accordingly. + * The feature enables much better performance for networks that originally do not scale well with #threads, + * even for large batches. Examples are lightweight topologies or topologies with many sequential/mem-bound/etc. or + * otherwise non-scalable layers. This is especially pronounced for many-core (e.g. server) machines. + * This is rather a throughput-oriented feature, because running multiple requests in parallel might increase the latency + * of each request. + * Additionally, the streams relax the need for a large batch to improve the throughput and simplify the + * application logic, helping to saturate the CPU with multiple requests instead. + * Implementation-wise, the "streams" constitute the following: + * - Pure "graph-less" Infer Requests that are not connected to a specific MKLDNNGraph (which is the regular/legacy approach) + * - Just like regular requests, the graph-less requests go to the common (per-ExecutableNetwork) queue + * - But unlike the conventional case, there are multiple threads that grab the requests (see MultiWorkerTaskExecutor) + * - So every stream is in fact an independent "worker" thread that monitors the queue. + * - Every worker thread (stream) has its own copy of the graph (which handles the intermediate data required for execution) + * - While the Infer Requests keep only input/output data +*/
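[Editor's note] The block comment above is the design summary of CPU streams: N identical workers, one shared FIFO of requests, one graph copy per worker. A compact sketch of that shape (hypothetical types; the real executor additionally handles per-thread initialization tasks, core pinning, and occupancy checks):

    #include <condition_variable>
    #include <functional>
    #include <mutex>
    #include <queue>
    #include <thread>
    #include <vector>

    class StreamPool {                                  // hypothetical, mirrors MultiWorkerTaskExecutor
    public:
        explicit StreamPool(size_t nstreams) {
            for (size_t i = 0; i < nstreams; ++i)       // one graph copy per worker in the real code
                workers_.emplace_back([this] { run(); });
        }
        ~StreamPool() {
            { std::lock_guard<std::mutex> lk(m_); stop_ = true; }
            cv_.notify_all();
            for (auto& t : workers_) t.join();
        }
        void submit(std::function<void()> request) {    // analogous to startTask()
            { std::lock_guard<std::mutex> lk(m_); q_.push(std::move(request)); }
            cv_.notify_one();
        }
    private:
        void run() {
            for (;;) {
                std::function<void()> task;
                {
                    std::unique_lock<std::mutex> lk(m_);
                    cv_.wait(lk, [this] { return stop_ || !q_.empty(); });
                    if (stop_ && q_.empty()) return;
                    task = std::move(q_.front());
                    q_.pop();
                }
                task();  // runs the infer request on this stream's graph
            }
        }
        std::vector<std::thread> workers_;
        std::queue<std::function<void()>> q_;
        std::mutex m_;
        std::condition_variable cv_;
        bool stop_ = false;
    };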
+namespace MKLDNNPlugin { + +using namespace InferenceEngine; +class MKLDNNGraph; +class pinning_observer; + +/* This structure handles an "execution context" - data required to execute an Infer Request. + * This includes graph (which handles the intermediate data) and arena/observer for the TBB */ +struct MultiWorkerTaskContext { + std::shared_ptr<MKLDNNGraph> ptrGraph; +}; + +#if defined(__APPLE__) || defined(_WIN32) +typedef void cpu_set_t; +#define CPU_FREE(cpuset) +// notice that functions below are just stubs for OSs other than Linux +#endif +/* Check whether any affinity-related env variables are set (relevant for the OpenMP) */ +bool check_env_variables(); +/* Get the cores affinity mask for the current process */ +bool get_process_mask(int& ncpus, cpu_set_t*& mask); +/* Pin current thread to a set of cores determined by the mask. */ +bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask); +/* Pin thread to a spare core in the round-robin scheme, while respecting the given process mask. + * The function can also handle the hyper-threading (by populating the physical cores first) */ +bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask); + +#if IE_THREAD == IE_THREAD_TBB +/* Simple observer that handles pinning threads to the cores; it serves as a callback for threads entering the arena. */ +class pinning_observer: public tbb::task_scheduler_observer { + cpu_set_t *mask; + int ncpus; + int stream_id, threads_per_stream; + const int pinning_step; + +public: + pinning_observer(tbb::task_arena& _arena, int _stream_id, int _threads_per_stream, int _pinning_step = 1) : + tbb::task_scheduler_observer(_arena), + stream_id(_stream_id), threads_per_stream(_threads_per_stream), pinning_step(_pinning_step) { + get_process_mask(ncpus, mask); + } + + void on_scheduler_entry(bool) override { + if (!mask) return; + int thread_idx = tbb::task_arena::current_thread_index(); + int thr_idx = stream_id * threads_per_stream + thread_idx; + // pin thread to the vacant slot + pin_thread_to_vacant_core(thr_idx, pinning_step, ncpus, mask); + } + + void on_scheduler_exit(bool) override { + if (!mask) return; + // reset the thread's mask (to the original process mask) + pin_current_thread_by_mask(ncpus, mask); + } + + ~pinning_observer() { + if (mask) + CPU_FREE(mask); + } +}; + +class auto_scope_observing { +public: + explicit auto_scope_observing(std::unique_ptr<tbb::task_scheduler_observer>& _p) : p(_p) { + if (p) + p->observe(true); + } + ~auto_scope_observing() { + if (p) + p->observe(false); + } + +protected: + std::unique_ptr<tbb::task_scheduler_observer>& p; +}; +#endif // IE_THREAD == IE_THREAD_TBB + +/* Class wrapping multiple worker threads that monitor the same queue with Infer Requests. */ +class MultiWorkerTaskExecutor : public ITaskExecutor { +public: + typedef std::shared_ptr<MultiWorkerTaskExecutor> Ptr; + + explicit MultiWorkerTaskExecutor(const std::vector<Task::Ptr>&, std::string name = "Default"); + + ~MultiWorkerTaskExecutor(); + + /** + * @brief Adds task for execution and notifies one of the working threads about the new task. + * @note can be called from multiple threads - tasks will be added to the queue and executed one-by-one in FIFO mode.
+ * @param task - shared pointer to the task + * @return true if the task was added successfully, otherwise false + */ + bool startTask(Task::Ptr task) override; + + static thread_local MultiWorkerTaskContext ptrContext; + +private: + std::vector<std::thread> _threads; + std::mutex _queueMutex; + std::condition_variable _queueCondVar; + std::queue<Task::Ptr> _taskQueue; + std::atomic<bool> _isStopped; + std::string _name; + std::atomic<int> _initCount; +}; + +/* Pure Infer Requests - just input and output data. */ +class MKLDNNGraphlessInferRequest : public InferenceEngine::InferRequestInternal { +public: + typedef std::shared_ptr<MKLDNNGraphlessInferRequest> Ptr; + explicit MKLDNNGraphlessInferRequest(InferenceEngine::InputsDataMap networkInputs, + InferenceEngine::OutputsDataMap networkOutputs); + + void InferImpl() override; + + void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const override; + + /** + * @brief Given optional implementation of setting blob to avoid need for it to be implemented by plugin + * @param name - a name of input or output blob. + * @param data - a reference to input or output blob. The type of Blob must correspond to the network input precision and size. + */ + void SetBlob(const char *name, const InferenceEngine::Blob::Ptr &data) override; + + /** + * @brief Given optional implementation of getting blob to avoid need for it to be implemented by plugin + * @param name - a name of input or output blob. + * @param data - a reference to input or output blob. The type of Blob must correspond to the network input precision and size. + */ + void GetBlob(const char *name, InferenceEngine::Blob::Ptr &data) override; + + + void SetBatch(int batch = -1) override; + +private: + int m_curBatch; + std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> m_perfMap; +}; + + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp index f48ada45e..d23b12e3b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -16,6 +15,7 @@ using namespace MKLDNNPlugin; using namespace InferenceEngine; using namespace InferenceEngine::details; +// TODO: (ichuraev) I am not fully sure that the names of types and parameters are correct for square, abs, sqrt, linear, bounded_relu and soft_relu caseless_map<std::string, std::function<void(GenericLayer*, mkldnn::algorithm&, float&, float&)>> MKLDNNActivationNode::initializers = { {"relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) { alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h index 508d8c796..9dac1507c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp index 502a804d3..173df1c24 100644 ---
a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp @@ -1,11 +1,11 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // #include "mkldnn_batchnorm_node.h" #include "mkldnn_depthwise_node.h" #include <mkldnn_extension_utils.h> +#include "ie_memcpy.h" using namespace mkldnn; using namespace MKLDNNPlugin; @@ -77,7 +77,7 @@ void MKLDNNBatchNormalizationNode::getSupportedDescriptors() { THROW_IE_EXCEPTION << "Cannot get weights blob for node " << getName() << "."; size_t weightsByteSize = blb->byteSize(); - memcpy(data, blb->buffer(), weightsByteSize); + ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), weightsByteSize); data += blb->size(); blb = scshLayer->_biases; @@ -86,7 +86,7 @@ void MKLDNNBatchNormalizationNode::getSupportedDescriptors() { } else { if (weightsByteSize != blb->byteSize()) THROW_IE_EXCEPTION << "ScaleShift has incorrect weights!"; - memcpy(data, blb->buffer(), weightsByteSize); + ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), weightsByteSize); } internalBlobs.push_back(internalBlob); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h index ef948b70f..c7d9d3e17 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp index 1da5d57f5..fd2893e95 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -59,16 +58,11 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); - auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - precision = getCnnLayer()->outData[0]->getPrecision(); + InferenceEngine::Precision iIEPrecision = getCnnLayer()->insData[0].lock()->getPrecision(); + auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(iIEPrecision); + InferenceEngine::Precision precision = getCnnLayer()->outData[0]->getPrecision(); auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - if (getCnnLayer()->precision == Precision::I8) { - inputDataType = memory::data_type::u8; - outputDataType = memory::data_type::u8; - } - MKLDNNDims dstDims = getChildEdgeAt(0)->getDims(); InferenceEngine::LayerConfig config; config.dynBatchSupport = true; @@ -103,6 +97,16 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref); } } + } else if (dims.ndims() == 5) { + if (dims[1] % 8 == 0) { + config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw8c)); + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref); + + if (dims[1] % 16 == 0) { + config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw16c)); 
+ supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref); + } + } } if (axis != 1 || hasEltwise) @@ -110,12 +114,11 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { auto numOfDim = static_cast<size_t>(dstDims.ndims()); - SizeVector order; - SizeVector offsets; + SizeVector order(numOfDim); + SizeVector offsets(numOfDim, 0lu); size_t offset = std::numeric_limits<size_t>::max(); for (size_t i = 0; i < numOfDim; i++) { - order.push_back(i); - offsets.push_back(0); + order[i] = i; } if (this->getCnnLayer()->precision == Precision::I8) { @@ -135,7 +138,9 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { strides[i] = std::numeric_limits<size_t>::max(); } - config.outConfs[0].desc = TensorDesc(Precision::U8, dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides}); + config.outConfs[0].desc = TensorDesc(this->getCnnLayer()->outData[0]->getPrecision(), + dstDims.ToSizeVector(), + { blkDims, order, offset, offsets, strides }); for (size_t i = 0; i < getParentEdges().size(); i++) { auto parentEdge = getParentEdgeAt(i); @@ -144,7 +149,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NHWC in mkldnn - config.inConfs[i].desc = TensorDesc(Precision::U8, parentEdge->getDims().ToSizeVector(), + config.inConfs[i].desc = TensorDesc(iIEPrecision, parentEdge->getDims().ToSizeVector(), {blkDims, order, offset, offsets, strides}); } @@ -174,26 +179,30 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); - if (numOfDim == 4) { - order = {0, 1, 2, 3, 1}; - offsets = {0, 0, 0, 0, 0}; - numOfDim = 5; + if (numOfDim == 4lu || numOfDim == 5lu) { + size_t blkDimsLen = numOfDim + 1; + order.resize(blkDimsLen); + for (size_t i = 0; i < numOfDim; i++) { + order[i] = i; + } + order[numOfDim] = 1lu; + offsets = SizeVector(blkDimsLen, 0lu); - // nChw8c and nChw16c - for (int sizeS : {8, 16}) { + // nChw8c, nChw16c, nCdhw8c, nCdhw16c + for (size_t sizeS : {8lu, 16lu}) { SizeVector blkDims = dstDims.ToSizeVector(); if (blkDims[1] % sizeS) continue; - blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0); + blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu); blkDims.push_back(sizeS); - strides.resize(numOfDim); - strides[numOfDim - 1] = 1; - for (size_t i = 2; i <= numOfDim; i++) { - if (numOfDim - i < axis) { - strides[numOfDim - i] = std::numeric_limits<size_t>::max(); + strides.resize(blkDimsLen); + strides[blkDimsLen - 1] = 1; + for (size_t i = 2lu; i <= blkDimsLen; i++) { + if (blkDimsLen - i < axis) { + strides[blkDimsLen - i] = std::numeric_limits<size_t>::max(); } else { - strides[numOfDim - i] = strides[numOfDim - i + 1] * blkDims[numOfDim - i + 1]; + strides[blkDimsLen - i] = strides[blkDimsLen - i + 1] * blkDims[blkDimsLen - i + 1]; } } config.outConfs[0].desc = TensorDesc( @@ -201,13 +210,13 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides}); bool canInplace = true; - for (size_t i = 0; canInplace && i < getParentEdges().size(); i++) { + for (size_t i = 0lu; canInplace && i < getParentEdges().size(); i++) { auto parentEdge = getParentEdgeAt(i); blkDims = parentEdge->getDims().ToSizeVector(); if (blkDims[1] % sizeS) canInplace = false; - blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0); + blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 
1lu : 0lu); blkDims.push_back(sizeS); config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(), {blkDims, order, offset, offsets, strides}); @@ -225,11 +234,6 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() { precision = getCnnLayer()->outData[0]->getPrecision(); auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - if (getCnnLayer()->precision == Precision::I8) { - inputDataType = memory::data_type::u8; - outputDataType = memory::data_type::u8; - } - bool hasUnknown = false; std::vector<size_t> canSelectPrimitive; for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h index 2b5fa898c..9aa51d7cd 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp index 109a87fe5..ea1aee821 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -37,18 +36,18 @@ MKLDNNConvolutionNode::MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr& wScale = ws->second; } - // Trying to find oi-scale - lastInInt8Chain = true; - auto ois = layer->blobs.find("oi-scale"); - if (ois != layer->blobs.end()) { - // If we can find an o-scale, then the next layer has to be an INT8. - lastInInt8Chain = false; - oScale = ois->second; - } else { - // If we can't find an oi-scale then the next layer has to be - // an FP32, so we are the last layer in the INT8-chain - lastInInt8Chain = true; + if (getCnnLayer()->type == "Convolution" && getCnnLayer()->precision == Precision::I8) { + auto ois = layer->blobs.find("oi-scale"); + if ((getCnnLayer()->outData[0]->getPrecision() == Precision::I8 || getCnnLayer()->outData[0]->getPrecision() == Precision::U8) + && ois == layer->blobs.end()) { + THROW_IE_EXCEPTION << "Internal error of graph quantization - mismatch of intermediate scales and next layer type for convolution " + << getCnnLayer()->name; + } + if (ois != layer->blobs.end()) { + // If we can find an oi-scale, then the next layer has to be an INT8. + oScale = ois->second; + } } } @@ -99,6 +98,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() { groupOC /= groupNum; } + weightDims.clear(); weightDims.push_back(groupOC); weightDims.push_back(groupIC); for (int i = 1; i <= convLayer->_kernel.size(); i++) { @@ -141,13 +141,13 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() { dilation.push_back(static_cast<int>(convLayer->_dilation[convLayer->_dilation.size() - i]) - 1); } - auto allPads = getConvPaddings(*convLayer); + auto allPads = getPaddings(*convLayer); invertVectorCopyUtoI(allPads.begin, paddingL); invertVectorCopyUtoI(allPads.end, paddingR); MKLDNNDims weightsDims = MKLDNNDims(weightDims); - for (int i = 0; i < 2; i++) { + for (int i = 0; i < paddingR.size(); i++) { int with_group = (isGrouped || isMerged) ?
1 : 0; int krn = weightsDims[with_group + 2 + i]; int src = getParentEdgeAt(0)->getDims()[2 + i]; @@ -176,26 +176,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() { } } - if (weights->precision() == Precision::I8) { - inputDataType = memory::u8; - if (lastInInt8Chain) { - outputDataType = memory::f32; - } else { - // Searching for the last fused node and taking the precision from there - Precision p = getCnnLayer()->precision; - if (fusedWith.size() > 0 && fusedWith[fusedWith.size() - 1]->getCnnLayer()->type == "ReLU") { - p = fusedWith[fusedWith.size() - 1]->getCnnLayer()->precision; - } - - if (p == Precision::I8) { - outputDataType = memory::s8; - } else if (p == Precision::U8) { - outputDataType = memory::u8; - } else { - THROW_IE_EXCEPTION << "Invalid layer precision for " << getName(); - } - } - + if (this->getCnnLayer()->precision == Precision::I8) { MKLDNNMemoryDesc in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nhwc); MKLDNNMemoryDesc out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nhwc); createDescriptor({in_candidate}, {out_candidate}); @@ -204,22 +185,48 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() { inputDataType = memory::f32; outputDataType = memory::f32; - MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, memory::nchw); - MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, memory::nchw); - createDescriptor({in_candidate}, {out_candidate}); + Layout layout = convLayer->input()->getLayout(); - if (IC == 3 || IC == 1) { - out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c); - createDescriptor({in_candidate}, {out_candidate}); - out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c); + if (layout == NCHW || layout == NHWC) { + MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, + layout == NCHW ? memory::nchw : memory::nhwc); + MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, + layout == NCHW ? 
memory::nchw : memory::nhwc); createDescriptor({in_candidate}, {out_candidate}); + + if (IC == 3 || IC == 1) { + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c); + createDescriptor({in_candidate}, {out_candidate}); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c); + createDescriptor({in_candidate}, {out_candidate}); + } else { + in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c); + createDescriptor({in_candidate}, {out_candidate}); + in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw8c); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c); + createDescriptor({in_candidate}, {out_candidate}); + } + } else if (layout == NCDHW || layout == NDHWC) { + MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, + layout == NCDHW ? memory::ncdhw : memory::ndhwc); + MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, + layout == NCDHW ? memory::ncdhw : memory::ndhwc); createDescriptor({in_candidate}, {out_candidate}); + + if (IC == 3 || IC == 1) { + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c); + createDescriptor({in_candidate}, {out_candidate}); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c); + createDescriptor({in_candidate}, {out_candidate}); + } else { + in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c); + createDescriptor({in_candidate}, {out_candidate}); + in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c); + out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c); + createDescriptor({in_candidate}, {out_candidate}); + } } } } @@ -231,7 +238,15 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe for (auto &node : fusedWith) { auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get()); if (eltwiseNode) { - ops.append_sum(1.0); + if (eltwiseNode->getCnnLayer()->precision == Precision::I8) { + auto it = eltwiseNode->getCnnLayer()->blobs.find("eltwise-sum-scale"); + if (it != eltwiseNode->getCnnLayer()->blobs.end()) { + // currently there is only one scale, while we need a per-channel scale :( + ops.append_sum(it->second->buffer().as<float*>()[0]); + } + } else { + ops.append_sum(1.0); + } continue; } @@ -252,11 +267,10 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine()))); PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x); - int bufferSize = depthwiseNode->isBroadcast() ? 1 : depthwiseDims[0];
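[Editor's note] setPostOps() above chains fused operations onto the convolution through mkldnn's primitive_attr: a sum post-op (optionally scaled by the quantization blob "eltwise-sum-scale") accumulates into the destination, and eltwise/depthwise post-ops follow. A hedged sketch of building such an attribute with the mkldnn 0.x C++ API, with placeholder values:

    #include "mkldnn.hpp"

    // Hypothetical attribute setup mirroring the fusing above:
    //   dst = relu(conv(src, w) + sum_scale * dst_prev)
    mkldnn::primitive_attr make_fused_attr(float sum_scale) {
        mkldnn::post_ops ops;
        ops.append_sum(sum_scale);  // fused eltwise-add into the destination
        ops.append_eltwise(1.0f, mkldnn::algorithm::eltwise_relu,
                           0.0f /* alpha: negative slope */, 0.0f /* beta: unused */);
        mkldnn::primitive_attr attr;
        attr.set_post_ops(ops);
        return attr;
    }

The attribute is then passed when the convolution primitive descriptor is created, so the fused ops run inside the convolution kernel instead of as separate graph nodes.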
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x, depthwiseLayer->_weights->buffer(), - bufferSize * MKLDNNExtensionUtils::sizeOfDataType( - memory::data_type::f32)); + depthwiseLayer->_weights->size() * + MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); if (depthwiseNode->isBroadcast()) { float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0]; @@ -271,9 +285,8 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe memory::format::x); PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, depthwiseLayer->_biases->buffer(), - bufferSize * - MKLDNNExtensionUtils::sizeOfDataType( - memory::data_type::f32)); + depthwiseLayer->_biases->size() * + MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); if (depthwiseNode->isBroadcast()) { float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0]; @@ -450,14 +463,15 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine:: bdt = memory::s32; Precision outPrec; - if (lastInInt8Chain) { + if (getCnnLayer()->outData[0]->getPrecision() == Precision::FP32) { outPrec = Precision::FP32; } else { // define precision according to the normalizer + // TODO(amalyshe) do we need to have separate flow for last in int8 chain or not? outPrec = outDesc.getPrecision(); } - inDesc = TensorDesc(Precision::U8, inputDesc[0].getDims(), inputDesc[0].getBlockingDesc()); + inDesc = TensorDesc(inDesc.getPrecision() , inputDesc[0].getDims(), inputDesc[0].getBlockingDesc()); outDesc = TensorDesc(outPrec, outputDesc[0].getDims(), outputDesc[0].getBlockingDesc()); } @@ -502,8 +516,8 @@ void MKLDNNConvolutionNode::addScaleToPrimitiveAttr(mkldnn::primitive_attr attr) float* wScaleData = static_cast<float*>(wScale->buffer()); std::vector<float> oScaleDataVector; - if (!lastInInt8Chain) { - float* oScaleData = static_cast<float*>(oScale->buffer()); + if (getCnnLayer()->precision == Precision::I8 && getCnnLayer()->outData[0]->getPrecision() != Precision::FP32) { + float *oScaleData = static_cast<float *>(oScale->buffer()); for (size_t c = 0; c < wScale->size(); c++) { oScaleDataVector.push_back(wScaleData[c] / oScaleData[c]); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h index aa2424194..19191ee45 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -57,8 +56,6 @@ private: InferenceEngine::ConvolutionLayer* convLayer; InferenceEngine::Blob::Ptr wScale, oScale; - - bool lastInInt8Chain; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp index aafa4aec0..8b11c296f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h index 2895e81e8..f74ab297e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h +++
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp index 1295e05a5..38ca06ce8 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +10,7 @@ #include <vector> #include <mkldnn_types.h> #include <mkldnn_extension_utils.h> +#include <ie_layers_internal.hpp> #include "ie_parallel.hpp" using namespace mkldnn; @@ -67,18 +67,17 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() { deconvLayer->_group, deconvLayer->input()->getTensorDesc().getDims()[1] / deconvLayer->_group, deconvLayer->_out_depth / deconvLayer->_group, - deconvLayer->_kernel[Y_AXIS], - deconvLayer->_kernel[X_AXIS] }; groupNum = deconvLayer->_group; } else { weightDims = { deconvLayer->input()->getTensorDesc().getDims()[1], - deconvLayer->_out_depth, - deconvLayer->_kernel[Y_AXIS], - deconvLayer->_kernel[X_AXIS] + deconvLayer->_out_depth }; } + for (int i = 1; i <= deconvLayer->_kernel.size(); i++) { + weightDims.push_back(deconvLayer->_kernel[deconvLayer->_kernel.size() - i]); + } internalBlobs.push_back(createInternalBlob(weightDims, true)); @@ -86,12 +85,13 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() { for (int i = 1; i <= deconvLayer->_dilation.size(); i++) { dilation.push_back(static_cast<int>(deconvLayer->_dilation[deconvLayer->_dilation.size() - i]) - 1); } - invertVectorCopyUtoI(deconvLayer->_padding, paddingL); - invertVectorCopyUtoI(deconvLayer->_pads_end, paddingR); + auto allPads = getPaddings(*deconvLayer); + invertVectorCopyUtoI(allPads.begin, paddingL); + invertVectorCopyUtoI(allPads.end, paddingR); weightsDims = MKLDNNDims(weightDims); - for (int i = 0; i < 2; i++) { + for (int i = 0; i < paddingR.size(); i++) { int with_group = (withGroups) ? 1 : 0; int krn = weightsDims[with_group + 2 + i]; int src = getChildEdgeAt(0)->getDims()[2 + i]; @@ -115,28 +115,46 @@ void MKLDNNDeconvolutionNode::execute(mkldnn::stream strm) { } if (withBiases) { const auto *bias = biases->buffer().as<const float*>(); + auto biasSize = biases->size(); auto dst = getChildEdgeAt(0)->getBlob(); float *output = dst->buffer().as<float *>() + dst->getTensorDesc().getBlockingDesc().getOffsetPadding(); + auto dims_size = dst->getTensorDesc().getDims().size(); + auto layout = dst->layout(); const size_t N = dst->getTensorDesc().getDims()[0]; - const size_t C = dst->getTensorDesc().getBlockingDesc().getBlockDims()[1] / groupNum; - const size_t H = dst->getTensorDesc().getDims()[2]; - const size_t W = dst->getTensorDesc().getDims()[3]; - const size_t blkC = - dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 4 ? - dst->getTensorDesc().getBlockingDesc().getBlockDims()[4] : - 1; + size_t C = dst->getTensorDesc().getBlockingDesc().getBlockDims()[1] / groupNum; + if (C < 1) C = 1; + const size_t D = dims_size > 4 ? 
dst->getTensorDesc().getDims()[dims_size - 3] : 1lu; + const size_t H = dst->getTensorDesc().getDims()[dims_size - 2]; + const size_t W = dst->getTensorDesc().getDims()[dims_size - 1]; + size_t blkC = 1lu; + if (layout == BLOCKED && dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 5) { + blkC = dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 5 ? + dst->getTensorDesc().getBlockingDesc().getBlockDims()[5] : + 1lu; + } else if (layout == BLOCKED && dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 4) { + blkC = dst->getTensorDesc().getBlockingDesc().getBlockDims()[4]; + } auto strides = dst->getTensorDesc().getBlockingDesc().getStrides(); + int output_size = strides[0] * N - dst->getTensorDesc().getBlockingDesc().getOffsetPadding(); - parallel_for4d(N, C, H, W, [&](size_t n, size_t c, size_t h, size_t w) { + parallel_for5d(N, C, D, H, W, [&](size_t n, size_t c, size_t d, size_t h, size_t w) { for (size_t g = 0; g < groupNum; g++) { - const size_t off = n * strides[0] + (g * C + c) * strides[1] + h * strides[2] + w * strides[3]; + const size_t off = n * strides[0] + + (g * C + c) * strides[1] + + d * strides[dims_size - 3] + + h * strides[dims_size - 2] + + w * strides[dims_size - 1]; + if (off >= output_size) continue; auto o = &output[off]; + int gcb = g * C * blkC + c * blkC; for (int bc = 0; bc < blkC; ++bc) { - o[bc] += bias[c * blkC + bc]; + int index = gcb + bc; + if (index < biasSize) + o[bc] += bias[index]; } } }); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h index 244054c26..e32a66a73 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp index 8eadcf824..6b1097a62 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -39,9 +38,20 @@ void MKLDNNDepthwiseNode::getSupportedDescriptors() { SizeVector weightDims = { (long unsigned int)parentOutDims[1] }; MKLDNNDims blocked_weightDims(weightDims); + auto * wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(getCnnLayer().get()); + if (wLayer == nullptr) + THROW_IE_EXCEPTION << "Cannot get weightable layer for node " << getName() << "."; + + InferenceEngine::Blob::Ptr blb = wLayer->_weights; + if (blb) + realWeightSize = blb->size(); internalBlobs.push_back(createInternalBlob(weightDims, true)); - if (isWithBiases()) + if (isWithBiases()) { + InferenceEngine::Blob::Ptr blb = wLayer->_biases; + if (blb) + realBiasSize = blb->size(); internalBlobs.push_back(createInternalBlob(weightDims, false)); + } for (auto format : getAvailableFormatsForDims(parentOutDims)) { MKLDNNMemoryDesc in_candidate{parentOutDims, inputDataType, format}; @@ -66,13 +76,15 @@ void MKLDNNDepthwiseNode::createPrimitive() { if (isBroadcast()) { float broadcastValue = static_cast<float*>(internalBlobMemory[0]->GetData())[0]; - for (int i = 1; i < internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { + int blbSize = 
internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0]; + for (int i = 1; i < blbSize && realWeightSize != blbSize; i++) { static_cast<float*>(internalBlobMemory[0]->GetData())[i] = broadcastValue; } if (isWithBiases()) { + blbSize = internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0]; broadcastValue = static_cast<float*>(internalBlobMemory[1]->GetData())[0]; - for (int i = 1; i < internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) { + for (int i = 1; i < blbSize && realBiasSize != blbSize; i++) { static_cast<float*>(internalBlobMemory[1]->GetData())[i] = broadcastValue; } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h index 78ef529f5..16bd3a505 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -50,6 +49,8 @@ private: static Register<MKLDNNDepthwiseNode> reg; mkldnn::algorithm algorithm; + size_t realWeightSize = 0; + size_t realBiasSize = 0; bool withBiases; bool broadcast; }; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp index 0a051dc52..111196817 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -99,15 +98,9 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() { mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::FP32); supportedPrimitiveDescriptors.push_back(same(inputDT, outputDT, format)); } else { - THROW_IE_EXCEPTION << "Invalid Eltwise layer precision"; + THROW_IE_EXCEPTION << "Invalid Eltwise layer precision: " << getCnnLayer()->name; } } - - if (getCnnLayer()->precision == Precision::I8) { - mkldnn::memory::data_type inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::U8); - mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::U8); - supportedPrimitiveDescriptors.push_back(same(inputDT, outputDT, mkldnn::memory::format::nhwc)); - } } void MKLDNNEltwiseNode::createPrimitive() { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h index e206799f4..0395cd432 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp index 20b60c62c..75b814e81 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -60,8 +59,11 @@ void MKLDNNFullyConnectedNode::getSupportedDescriptors() { } else if (inDims.ndims() == 4) { weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]), 
static_cast<size_t>(inDims[3])}; + } else if (inDims.ndims() == 5) { + weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]), + static_cast<size_t>(inDims[3]), static_cast<size_t>(inDims[4])}; } else { - THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 4 or 2, got: " + THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 5, 4 or 2, got: " << inDims.ndims() << " dims."; } @@ -113,10 +115,16 @@ memory::format MKLDNNFullyConnectedNode::weightsFormatForSrcFormat(memory::forma return memory::format::oi; case memory::format::nchw: return memory::format::oihw; + case memory::format::ncdhw: + return memory::format::oidhw; case memory::format::nChw8c: return memory::format::oIhw8i; + case memory::format::nCdhw8c: + return memory::format::oIdhw8i; case memory::format::nChw16c: return memory::format::oIhw16i; + case memory::format::nCdhw16c: + return memory::format::oIdhw16i; default: THROW_IE_EXCEPTION << "Unsupported source format for node " << getName(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h index 88259a265..73c06f7ce 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp new file mode 100644 index 000000000..2874d9dfe --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp @@ -0,0 +1,234 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_gemm_node.h" +#include <ie_layers.h> +#include <string> +#include <vector> +#include <memory> +#include <algorithm> +#include <cmath> +#include <mkldnn_types.h> +#include <mkldnn_extension_utils.h> + +using namespace mkldnn; +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +MKLDNNGemmNode::MKLDNNGemmNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {} + +void MKLDNNGemmNode::getSupportedDescriptors() { + auto* gemmLayer = dynamic_cast<GemmLayer*>(getCnnLayer().get()); + + if (gemmLayer == nullptr) + THROW_IE_EXCEPTION << "Cannot convert gemm layer."; + + if (getParentEdges().size() != 2 && getParentEdges().size() != 3) + THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName(); + if (getChildEdges().size() != 1) + THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName(); + + auto inDims0 = getParentEdgeAt(0)->getDims(); + auto inDims1 = getParentEdgeAt(1)->getDims(); + auto outDims = getChildEdgeAt(0)->getDims(); + + alpha = gemmLayer->alpha; + beta = gemmLayer->beta; + transposeA = gemmLayer->transpose_a; + transposeB = gemmLayer->transpose_b; + + if ((inDims0.ndims() < 2 || inDims0.ndims() > 4) || + (inDims1.ndims() < 2 || inDims1.ndims() > 4)) + THROW_IE_EXCEPTION << "Unsupported input dims count for layer " << getName(); + + if (outDims.ndims() < 2 || outDims.ndims() > 4) + THROW_IE_EXCEPTION << "Unsupported output dims count for layer " << getName(); + + if (inDims0.ndims() != inDims1.ndims() || inDims0.ndims() != outDims.ndims()) + THROW_IE_EXCEPTION << "Invalid dims count for layer " << getName(); + + int nDims = 
inDims0.ndims(); + xAxis = nDims - 1; + yAxis = nDims - 2; + + if (inDims0[xAxis] != inDims1[yAxis] || inDims0[yAxis] != outDims[yAxis] || inDims1[xAxis] != outDims[xAxis]) + THROW_IE_EXCEPTION << "Spatial input and output dimensions are incorrect for layer " << getName(); + + isThreeInputs = getParentEdges().size() == 3; + + if (isThreeInputs) { + auto inDims2 = getParentEdgeAt(2)->getDims(); + + if (inDims2.ndims() < 2 || inDims2.ndims() > 4) + THROW_IE_EXCEPTION << "Unsupported third input dims count for layer " << getName(); + + if (inDims2.ndims() != outDims.ndims()) + THROW_IE_EXCEPTION << "Invalid dims count for layer " << getName(); + + if (inDims2[yAxis] != outDims[yAxis] || inDims2[xAxis] != outDims[xAxis]) + THROW_IE_EXCEPTION << "Spatial input and output dimensions are incorrect for layer " << getName(); + } + + for (int dim_idx = nDims - 3; dim_idx >= 0; dim_idx--) { + if (isThreeInputs) { + auto inDims2 = getParentEdgeAt(2)->getDims(); + + if (inDims2[dim_idx] != outDims[dim_idx] && inDims2[dim_idx] != 1) + THROW_IE_EXCEPTION << "Input batch dimensions are incorrect for layer " << getName(); + + int cOffset = 1; + for (int i = dim_idx + 1; i < nDims; i++) + cOffset *= inDims2[i]; + cOffsets.push_back(inDims2[dim_idx] == outDims[dim_idx] ? cOffset : 0); + } + + if ((inDims0[dim_idx] != outDims[dim_idx] && inDims0[dim_idx] != 1) || + (inDims1[dim_idx] != outDims[dim_idx] && inDims1[dim_idx] != 1)) { + THROW_IE_EXCEPTION << "Input batch dimensions are incorrect for layer " << getName(); + } + + int aOffset = 1; + for (int i = dim_idx + 1; i < nDims; i++) + aOffset *= inDims0[i]; + aOffsets.push_back(inDims0[dim_idx] == outDims[dim_idx] ? aOffset : 0); + + int bOffset = 1; + for (int i = dim_idx + 1; i < nDims; i++) + bOffset *= inDims1[i]; + bOffsets.push_back(inDims1[dim_idx] == outDims[dim_idx] ? bOffset : 0); + }
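[Editor's note] The loop above precomputes, for each leading (batch) dimension, the element stride of A, B and C in that dimension, storing 0 whenever the input's extent is 1 so the same slice is re-read on every iteration: that is how the node broadcasts batch dimensions in execute(). A small self-contained sketch of the same computation (hypothetical helper), with a worked example in the comments:

    #include <vector>

    // Per-batch-dimension element stride; 0 marks a broadcast dimension.
    // Example: in = {1, 8, M, K}, out = {4, 8, M, K} -> {M*K, 0}:
    // index 0 (inner batch) advances by a full slice, index 1 (outer batch)
    // re-reads the single available slice.
    std::vector<int> batch_offsets(const std::vector<int>& in, const std::vector<int>& out) {
        const int nDims = static_cast<int>(in.size());
        std::vector<int> offsets;
        for (int d = nDims - 3; d >= 0; --d) {
            int stride = 1;
            for (int i = d + 1; i < nDims; ++i) stride *= in[i];
            offsets.push_back(in[d] == out[d] ? stride : 0);
        }
        return offsets;
    }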
bOffset : 0); + } + + for (unsigned long dim_idx = aOffsets.size(); dim_idx < 2; dim_idx++) + aOffsets.push_back(0); + for (unsigned long dim_idx = bOffsets.size(); dim_idx < 2; dim_idx++) + bOffsets.push_back(0); + for (unsigned long dim_idx = cOffsets.size(); dim_idx < 2; dim_idx++) + cOffsets.push_back(0); +} + +void MKLDNNGemmNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32); + auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32); + + auto same = [&] (memory::format fmt) -> PrimitiveDescInfo { + InferenceEngine::LayerConfig config; + config.dynBatchSupport = true; + for (size_t i = 0; i < getParentEdges().size(); i++) { + InferenceEngine::DataConfig dataConfig; + dataConfig.inPlace = -1; + dataConfig.constant = false; + dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, fmt); + config.inConfs.push_back(dataConfig); + } + + InferenceEngine::DataConfig dataConfig; + dataConfig.inPlace = -1; + dataConfig.constant = false; + dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt); + config.outConfs.push_back(dataConfig); + return {config, impl_desc_type::gemm_any}; + }; + + supportedPrimitiveDescriptors.push_back(same(memory::any)); +} + +void MKLDNNGemmNode::createPrimitive() { + auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); + auto& src0MemPtr = getParentEdgeAt(0)->getMemoryPtr(); + auto& src1MemPtr = getParentEdgeAt(1)->getMemoryPtr(); + if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) + THROW_IE_EXCEPTION << "Destination memory isn't allocated."; + if (!src0MemPtr || !src0MemPtr->GetPrimitivePtr() || !src1MemPtr || !src1MemPtr->GetPrimitivePtr()) + THROW_IE_EXCEPTION << "Input memory isn't allocated."; + if (getSelectedPrimitiveDescriptor() == nullptr) + THROW_IE_EXCEPTION << "Preferable primitive descriptor isn't set."; + + if (isThreeInputs) { + auto& src2MemPtr = getParentEdgeAt(2)->getMemoryPtr(); + if (!src2MemPtr || !src2MemPtr->GetPrimitivePtr()) + THROW_IE_EXCEPTION << "Input memory isn't allocated."; + } +} + +void MKLDNNGemmNode::execute(mkldnn::stream strm) { + auto inDims0 = getParentEdgeAt(0)->getDims(); + auto inDims1 = getParentEdgeAt(1)->getDims(); + auto outDims = getChildEdgeAt(0)->getDims(); + + auto& srcMemory0 = getParentEdgeAt(0)->getMemory(); + auto& srcMemory1 = getParentEdgeAt(1)->getMemory(); + const float *src0_ptr = reinterpret_cast<const float*>(srcMemory0.GetData()) + + srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding; + const float *src1_ptr = reinterpret_cast<const float*>(srcMemory1.GetData()) + + srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding; + float *dst_ptr = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) + + getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + + int MB1 = outDims.ndims() == 4 ? batchToProcess() : 1; + int MB2 = outDims.ndims() == 3 ? batchToProcess() : outDims.ndims() > 3 ? outDims[outDims.ndims() - 3] : 1; + int M = inDims0[yAxis]; + int N = inDims1[xAxis]; + int K = inDims0[xAxis]; + + const char transa = transposeA ? 'T' : 'N'; + const char transb = transposeB ? 'T' : 'N'; + + int lda = transposeA ? M : K; + int ldb = transposeB ? 
K : N; + int ldc = N; + + const float *src2_ptr; + if (isThreeInputs) { + auto& srcMemory2 = getParentEdgeAt(2)->getMemory(); + src2_ptr = reinterpret_cast<const float *>(srcMemory2.GetData()) + + srcMemory2.GetDescriptor().data.layout_desc.blocking.offset_padding; + } else { + src2_ptr = dst_ptr; + } + + if (!isThreeInputs) { + beta = 0.f; + } + + for (int b1 = 0; b1 < MB1; b1++) { + const float *a_ptr = src0_ptr; + const float *b_ptr = src1_ptr; + const float *c_ptr = src2_ptr; + float *d_ptr = dst_ptr; + + for (int b2 = 0; b2 < MB2; b2++) { + if (isThreeInputs) { + memcpy(d_ptr, c_ptr, M * N * sizeof(float)); + c_ptr += cOffsets[0]; + } + + mkldnn_sgemm(&transb, &transa, &N, &M, &K, &alpha, b_ptr, &ldb, a_ptr, &lda, &beta, d_ptr, &ldc); + + a_ptr += aOffsets[0]; + b_ptr += bOffsets[0]; + d_ptr += M * N; + } + + src0_ptr += aOffsets[1]; + src1_ptr += bOffsets[1]; + dst_ptr += MB2 * M * N; + + if (isThreeInputs) { + src2_ptr += cOffsets[1]; + } + } +} + +bool MKLDNNGemmNode::created() const { + return getType() == Gemm; +} + +int MKLDNNGemmNode::getMaxBatch() { + if (!outDims.empty()) + return outDims[0][0]; + return 0; +} diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h new file mode 100644 index 000000000..da171a0da --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h @@ -0,0 +1,44 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include <ie_common.h> +#include <mkldnn_node.h> +#include <string> +#include <vector> + +namespace MKLDNNPlugin { + +class MKLDNNGemmNode : public MKLDNNNode { +public: + MKLDNNGemmNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng); + ~MKLDNNGemmNode() override = default; + + void getSupportedDescriptors() override; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override; + void execute(mkldnn::stream strm) override; + bool created() const override; + int getMaxBatch() override; + +private: + static Register<MKLDNNGemmNode> reg; + float alpha; + float beta; + bool transposeA; + bool transposeB; + + int xAxis; + int yAxis; + + bool isThreeInputs; + + std::vector<int> aOffsets; + std::vector<int> bOffsets; + std::vector<int> cOffsets; +}; + +} // namespace MKLDNNPlugin + diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp index 04cb400e1..b31b491e1 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -83,8 +82,7 @@ bool MKLDNNGenericNode::created(const MKLDNNExtensionManager::Ptr &extMgr) { if (getCnnLayer() && extMgr) { // We should save extension manager in otder to avoid situation when // it will destroyed before extensibility primitives - extensionManager = extMgr; - extFactory.reset(extensionManager->CreateExtensionFactory(getCnnLayer())); + extFactory.reset(extMgr->CreateExtensionFactory(getCnnLayer())); if (extFactory) setType(Generic); @@ -147,11 +145,6 @@ void MKLDNNGenericNode::execLayer() { } } -MKLDNNGenericNode::~MKLDNNGenericNode() { - extFactory.reset(); - extensionManager.reset(); -} - void MKLDNNGenericNode::initDescriptor(const InferenceEngine::LayerConfig &config) { InferenceEngine::LayerConfig rightConfig = config; 
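A minimal, self-contained sketch of the batching scheme used by the GEMM node's execute() loop above: two outer batch loops whose per-iteration pointer advances are the precomputed aOffsets/bOffsets, with an offset of zero meaning "reuse the same matrix", i.e. broadcast that input across the batch dimension. naive_gemm stands in for mkldnn_sgemm (no transposes, alpha = 1, beta = 0); all names here are illustrative, not plugin API.

#include <vector>

// Plain row-major GEMM: C[M x N] = A[M x K] * B[K x N].
static void naive_gemm(int M, int N, int K, const float* A, const float* B, float* C) {
    for (int m = 0; m < M; m++)
        for (int n = 0; n < N; n++) {
            float acc = 0.f;
            for (int k = 0; k < K; k++)
                acc += A[m * K + k] * B[k * N + n];
            C[m * N + n] = acc;
        }
}

// aOff/bOff = {inner-batch stride, outer-batch stride}; a stride of 0
// re-reads the same matrix every iteration, which implements broadcasting.
static void batched_gemm(const float* A, const float* B, float* C,
                         int MB1, int MB2, int M, int N, int K,
                         const std::vector<int>& aOff, const std::vector<int>& bOff) {
    for (int b1 = 0; b1 < MB1; b1++) {
        const float *a = A, *b = B;
        float *c = C;
        for (int b2 = 0; b2 < MB2; b2++) {
            naive_gemm(M, N, K, a, b, c);
            a += aOff[0];
            b += bOff[0];
            c += M * N;      // the output is never broadcast
        }
        A += aOff[1];
        B += bOff[1];
        C += MB2 * M * N;
    }
}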
InferenceEngine::StatusCode rc; @@ -206,11 +199,3 @@ void MKLDNNGenericNode::initDescriptor(const InferenceEngine::LayerConfig &confi constant = ConstantType::Const; } } - -void MKLDNNGenericNode::initOptimalPrimitiveDescriptor() { - auto descriptor = getSelectedPrimitiveDescriptor(); - if (descriptor != nullptr) { - auto config = descriptor->getConfig(); - initDescriptor(config); - } -} diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h index 5cc8b0014..7bdd4a0f3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -17,7 +16,7 @@ namespace MKLDNNPlugin { class MKLDNNGenericNode : public MKLDNNNode { public: MKLDNNGenericNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng); - ~MKLDNNGenericNode() override; + ~MKLDNNGenericNode() = default; void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -30,7 +29,6 @@ public: } void initDescriptor(const InferenceEngine::LayerConfig& config) override; - void initOptimalPrimitiveDescriptor() override; void execLayer(); void cleanup() override; @@ -42,7 +40,6 @@ protected: private: static Register<MKLDNNGenericNode> reg; - MKLDNNExtensionManager::Ptr extensionManager; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp index 0a17a1442..9b42bee6b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h index 134ce8f61..99b4c8657 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp index 32594e315..4b1192b85 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h index b2a5c1829..9d85dabd3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp index b60177cb2..a37a2530b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff 
--git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp index 53ab16c39..ebc67748f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp index aa395a130..c23ce6ee5 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -69,6 +68,21 @@ void MKLDNNPermuteNode::initSupportedPrimitiveDescriptors() { config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c); supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown}); } + } else if (getParentEdgeAt(0)->getDims().ndims() == 5) { + config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::ncdhw); + config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::ncdhw); + supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown}); + + auto srcDims = getParentEdgeAt(0)->getDims(); + if (srcDims[1] % 8 == 0) { + config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c); + supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown}); + } + + if (srcDims[1] % 16 == 0) { + config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c); + supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown}); + } } else { config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::any); config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, @@ -221,6 +235,70 @@ static void permute_to_3012(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& } } +static void permute_to_021(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { + auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData()); + auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData()); + src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding; + dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding; + + const int C = srcMemPtr->GetDims()[1]; + const int S = srcMemPtr->GetDims()[2]; + + parallel_for2d(MB, S, [&](int n, int s) { + int src_off = 0; + int dst_off = 0; + + for (int c = 0; c < C; c++) { + src_off = n * C * S + + c * S + + s; + dst_off = n * S * C + + s * C + + c; + + dst_data[dst_off] = src_data[src_off]; + } + }); +} + +static void permute_to_034152(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { + auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData()); + auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData()); + src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding; + dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding; + + const int DIM1 = srcMemPtr->GetDims()[1]; + const int DIM2 = srcMemPtr->GetDims()[2]; + const int DIM3 = srcMemPtr->GetDims()[3]; + const int DIM4 = 
srcMemPtr->GetDims()[4]; + const int DIM5 = srcMemPtr->GetDims()[5]; + + int src_off = 0; + int dst_off = 0; + + for (int n = 0; n < MB; n++) { + for (int dim3 = 0; dim3 < DIM3; dim3++) { + for (int dim4 = 0; dim4 < DIM4; dim4++) { + for (int dim1 = 0; dim1 < DIM1; dim1++) { + for (int dim5 = 0; dim5 < DIM5; dim5++) { + for (int dim2 = 0; dim2 < DIM2; dim2++) { + src_off = n * DIM1 * DIM2 * DIM3 * DIM4 * DIM5 + + dim1 * DIM2 * DIM3 * DIM4 * DIM5 + + dim2 * DIM3 * DIM4 * DIM5 + + dim3 * DIM4 * DIM5 + + dim4 * DIM5 + + dim5; + + dst_data[dst_off] = src_data[src_off]; + dst_off++; + } + } + } + } + } + } +} + std::map<InferenceEngine::SizeVector, MKLDNNPermuteNode::PermuteImpl> MKLDNNPermuteNode::OptimizedCases = { {{0, 2, 3, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_0231, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { return true; @@ -237,6 +315,12 @@ std::map<InferenceEngine::SizeVector, MKLDNNPermuteNode::PermuteImpl> MKLDNNPerm {{0, 2, 1, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_0213, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()); })}, // shufflenet + {{0, 2, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_021, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { + return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()); + })}, // self attention block + {{0, 3, 4, 1, 5, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_034152, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) { + return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()); + })}, // learning-to-see-in-the-dark-sony }; void MKLDNNPermuteNode::execute(mkldnn::stream strm) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h index 5b69b4475..9c0ce0d49 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp index 0ec7c0a26..82e3eac50 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +10,7 @@ #include <vector> #include <mkldnn_types.h> #include <mkldnn_extension_utils.h> +#include <ie_layers_internal.hpp> using namespace mkldnn; using namespace MKLDNNPlugin; @@ -23,12 +23,8 @@ void MKLDNNPoolingNode::getSupportedDescriptors() { return; InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); precision = getCnnLayer()->outData[0]->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); auto * poolingLayer = dynamic_cast<PoolingLayer*>(getCnnLayer().get()); @@ -45,15 +41,16 @@ void MKLDNNPoolingNode::getSupportedDescriptors() { invertVectorCopyUtoI(poolingLayer->_stride, stride); invertVectorCopyUtoI(poolingLayer->_kernel, kernel); - 
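Of the two optimized permute cases registered above, {0, 2, 1} is the one used by self-attention blocks and is just a per-batch 2-D transpose. A reference version on a plain dense layout, independent of the plugin's memory wrappers (names illustrative):

// [MB, C, S] -> [MB, S, C]
static void permute_021_ref(const float* src, float* dst, int MB, int C, int S) {
    for (int n = 0; n < MB; n++)
        for (int s = 0; s < S; s++)
            for (int c = 0; c < C; c++)
                dst[(n * S + s) * C + c] = src[(n * C + c) * S + s];
}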
invertVectorCopyUtoI(poolingLayer->_padding, paddingL); - invertVectorCopyUtoI(poolingLayer->_pads_end, paddingR); + auto allPads = getPaddings(*poolingLayer); + invertVectorCopyUtoI(allPads.begin, paddingL); + invertVectorCopyUtoI(allPads.end, paddingR); auto parentDims = getParentEdgeAt(0)->getDims(); auto childDims = getChildEdgeAt(0)->getDims(); if ((parentDims.ndims() < 4) || (parentDims.ndims() > 5)) THROW_IE_EXCEPTION << "Pooling layer. Unsupported mode. Only 4D and 5D blobs are supported as input."; - for (int i = 0; i < 2; i++) { + for (int i = 0; i < paddingR.size(); i++) { int krn = kernel[i]; int src = getParentEdgeAt(0)->getDims()[2 + i]; int dst = getChildEdgeAt(0)->getDims()[2 + i]; @@ -61,11 +58,11 @@ int calc_dst = (src - krn + paddingL[i]) / stride[i] + 1; paddingR[i] = (dst - calc_dst) * stride[i]; } - if (this->getCnnLayer()->precision == Precision::I8) { - MKLDNNMemoryDesc in_candidate{parentDims, memory::data_type::u8, memory::format::nhwc}; - MKLDNNMemoryDesc out_candidate{childDims, memory::data_type::u8, memory::format::nhwc}; - createDescriptor({in_candidate}, {out_candidate}); + // i8 layers support only the nhwc layout + MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, memory::format::nhwc}; + MKLDNNMemoryDesc out_candidate{childDims, outputDataType, memory::format::nhwc}; + createDescriptor({ in_candidate }, { out_candidate }); } else { // It doesn't support any format for (auto format : getAvailableFormatsForDims(parentDims)) { @@ -97,7 +94,14 @@ void MKLDNNPoolingNode::createDescriptor(const std::vector<InferenceEngine::Tens algorithm alg; if (type == PoolingLayer::PoolType::AVG) { - if (!exclude_pad && (paddingL[0] != 0 || paddingL[1] != 0)) + bool not_zero_l = false; + for (auto lr : paddingL) { + if (lr) { + not_zero_l = true; + break; + } + } + if (!exclude_pad && not_zero_l) alg = pooling_avg_include_padding; else alg = pooling_avg_exclude_padding; @@ -114,7 +118,14 @@ void MKLDNNPoolingNode::createDescriptor(const std::vector<InferenceEngine::Tens stride, kernel, paddingL, paddingR, mkldnn::padding_kind::zero)); - if (alg == pooling_avg_include_padding && (paddingR[0] || paddingR[1])) { + bool not_zero_r = false; + for (auto pr : paddingR) { + if (pr) { + not_zero_r = true; + break; + } + } + if (alg == pooling_avg_include_padding && not_zero_r) { // In case of AVG including paddings the norm coeff should be calculated // taking the original pads into account. So we need to restore // original values (R_padding = L_padding). 
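A worked instance of the paddingR recomputation above, with illustrative numbers rather than values from a real network:

constexpr int src = 224, krn = 3, stride = 2, padL = 0, dst = 112;  // assumed sizes
constexpr int calc_dst = (src - krn + padL) / stride + 1;           // 111
constexpr int padR = (dst - calc_dst) * stride;                     // (112 - 111) * 2 = 2
static_assert(padR == 2, "two extra rows/cols of right padding are implied by the IR");

In other words, the right padding is recomputed to be exactly what is needed so the last kernel window fits and the output reaches the size the IR declares.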
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h index 0af8a8ae5..e5309f494 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp index 360f3459c..01ae0e6fd 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h index 370d694d9..a6fce5cbd 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp index 3b1678079..345b21536 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -71,6 +70,17 @@ void MKLDNNReorderNode::createPrimitive() { if (getSelectedPrimitiveDescriptor() == nullptr) THROW_IE_EXCEPTION << "Preferable primitive descriptor does not set."; + createReorderPrimitive(srcMemPtr->GetDescriptor(), srcMemPtr->GetPrimitive().get_data_handle(), + dstMemPtr->GetDescriptor(), dstMemPtr->GetPrimitive().get_data_handle()); +} + +void MKLDNNReorderNode::createReorderPrimitive(mkldnn::memory::desc srcDesc, void* srcPtr, mkldnn::memory::desc dstDesc, void* dstPtr) { + src_blocked = std::make_shared<MKLDNNMemory>(getEngine()); + src_blocked->Create(srcDesc, srcPtr); + + dst_blocked = std::make_shared<MKLDNNMemory>(getEngine()); + dst_blocked->Create(dstDesc, dstPtr); + mkldnn::primitive_attr attr; if (_scales) { @@ -90,52 +100,12 @@ void MKLDNNReorderNode::createPrimitive() { attr.set_int_output_round_mode(round_nearest); } - if (srcMemPtr->GetSize() == dstMemPtr->GetSize()) { - InferenceEngine::Precision dstPrec = getChildEdgeAt(0)->getDesc().getPrecision(); - InferenceEngine::Precision srcPrec = getParentEdgeAt(0)->getDesc().getPrecision(); - - if ((srcPrec == InferenceEngine::Precision::I8 && dstPrec == InferenceEngine::Precision::U8)) { - // This reorder actually does nothing so we declare it in-place. - dstMemPtr->GetPrimitive().set_data_handle(srcMemPtr->GetPrimitive().get_data_handle()); - } else { - try { - // No autoblocking. Reorder can be applied as is - - reorder::primitive_desc pd = reorder::primitive_desc(srcMemPtr->GetPrimitiveDescriptor(), dstMemPtr->GetPrimitiveDescriptor(), attr); - prim.reset(new mkldnn::reorder(srcMemPtr->GetPrimitive(), dstMemPtr->GetPrimitive())); - } catch (...) {} - } - } else { - // Autoblocking case. nchw<=>nChw8c are only supported, but memory descriptor - // should be with strides. 
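The reorder refactor above moves primitive construction into createReorderPrimitive() so that setDynamicBatchLim() can rebuild the reorder from resized descriptors, while execute() only rebinds data handles. A bare-bones sketch of that create-once / rebind-later pattern against the mkldnn 0.x C++ API this plugin uses; treat it as illustrative, not as the plugin's exact code:

#include <mkldnn.hpp>

// One-time setup; throws if no reorder kernel exists for this pair of
// memory objects (the plugin swallows that case and leaves prim empty).
mkldnn::reorder make_reorder(const mkldnn::memory& src, const mkldnn::memory& dst) {
    return mkldnn::reorder(src, dst);
}

// Per-inference: buffers may move between calls, so only handles are rebound.
void run_reorder(mkldnn::reorder& rdr, mkldnn::memory& src, mkldnn::memory& dst,
                 void* src_ptr, void* dst_ptr) {
    src.set_data_handle(src_ptr);
    dst.set_data_handle(dst_ptr);
    mkldnn::stream(mkldnn::stream::kind::eager).submit({rdr}).wait();
}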
Prepare it from enlarged blob - memory::dims dims = srcMemPtr->GetDims(); - memory::dims dims_dst = dstMemPtr->GetDims(); - - for (int i = 0; i < dims.size(); i++) // min dims is a logical dims - dims[i] = std::min(dims[i], dims_dst[i]); - - memory::desc src_d = srcMemPtr->GetDescriptor(); - void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle(); - - memory::desc dst_d = dstMemPtr->GetDescriptor(); - void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle(); - - for (int i = 0; i < dims.size(); i++) - src_d.data.dims[i] = dst_d.data.dims[i] = dims[i]; - - src_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - src_blocked->Create(src_d, src_data_hdl); - - dst_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - dst_blocked->Create(dst_d, dst_data_hdl); - - // output blob should be zeroed. NaN value can occur in untouched place. - dstMemPtr->FillZero(); - + try { + // No autoblocking. Reorder can be applied as is reorder::primitive_desc pd = reorder::primitive_desc(src_blocked->GetPrimitiveDescriptor(), dst_blocked->GetPrimitiveDescriptor(), attr); prim.reset(new mkldnn::reorder(pd, src_blocked->GetPrimitive(), dst_blocked->GetPrimitive())); - } + } catch (...) {} } const std::vector<impl_desc_type>& MKLDNNReorderNode::getPrimitivesPriority() { @@ -148,32 +118,9 @@ bool MKLDNNReorderNode::created() const { } void MKLDNNReorderNode::execute(mkldnn::stream strm) { - if (prim) { - if (src_blocked) - src_blocked->GetPrimitivePtr()->set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); - if (dst_blocked) - dst_blocked->GetPrimitivePtr()->set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); - MKLDNNNode::execute(strm); - } else { - InferenceEngine::Precision dstPrec = getChildEdgeAt(0)->getDesc().getPrecision(); - InferenceEngine::Precision srcPrec = getParentEdgeAt(0)->getDesc().getPrecision(); - if ((srcPrec == InferenceEngine::Precision::I8 && dstPrec == InferenceEngine::Precision::U8)) { - // Do nothing here - } else { - auto srcBlbPtr = getParentEdgeAt(0)->getBlob(); - auto dstBlbPtr = getChildEdgeAt(0)->getBlob(); - - assert(srcBlbPtr->size() == dstBlbPtr->size()); - int data_size = srcBlbPtr->size(); - - const auto* src_data = srcBlbPtr->cbuffer().as<const float *>(); - auto* dst_data = dstBlbPtr->buffer().as<float *>(); - - InferenceEngine::parallel_for(data_size, [&](int i) { - dst_data[dstBlbPtr->getTensorDesc().offset(i)] = src_data[srcBlbPtr->getTensorDesc().offset(i)]; - }); - } - } + src_blocked->GetPrimitivePtr()->set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); + dst_blocked->GetPrimitivePtr()->set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); + MKLDNNNode::execute(strm); } void MKLDNNReorderNode::setDynamicBatchLim(int lim) { @@ -186,21 +133,12 @@ void MKLDNNReorderNode::setDynamicBatchLim(int lim) { void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle(); void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle(); - if (src_blocked && dst_blocked) { - src_d = src_blocked->GetDescriptor(); - dst_d = dst_blocked->GetDescriptor(); - src_data_hdl = src_blocked->GetPrimitive().get_data_handle(); - dst_data_hdl = dst_blocked->GetPrimitive().get_data_handle(); - } - src_blocked = std::make_shared<MKLDNNMemory>(getEngine()); src_d.data.dims[0] = batchToProcess(); src_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess(); - src_blocked->Create(src_d, src_data_hdl); - dst_blocked = 
std::make_shared<MKLDNNMemory>(getEngine()); dst_d.data.dims[0] = batchToProcess(); dst_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess(); - dst_blocked->Create(dst_d, dst_data_hdl); - prim.reset(new mkldnn::reorder(src_blocked->GetPrimitive(), dst_blocked->GetPrimitive())); + + createReorderPrimitive(src_d, src_data_hdl, dst_d, dst_data_hdl); } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h index 3d74c2000..7a228ecec 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -51,6 +50,8 @@ private: MKLDNNMemoryPtr dst_blocked; MKLDNNMemoryPtr src_blocked; + + void createReorderPrimitive(mkldnn::memory::desc srcDesc, void* srcPtr, mkldnn::memory::desc dstDesc, void* dstPtr); }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp index cfd51bf36..d959aa5f9 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -49,15 +48,6 @@ void MKLDNNReshapeNode::initSupportedPrimitiveDescriptors() { config.outConfs[0].constant = false; config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat); supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); - if (inDims.ndims() == 4 && inDims[1] % 8 == 0 && outDims.ndims() == 4 &&outDims[1] % 8 == 0) { - outFormat = memory::format::any; - } - config.inConfs[0].inPlace = -1; - config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format::any); - config.outConfs[0].inPlace = -1; - config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat); - - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); } void MKLDNNReshapeNode::createPrimitive() { @@ -69,107 +59,6 @@ void MKLDNNReshapeNode::createPrimitive() { THROW_IE_EXCEPTION << "Input memory didn't allocate."; if (getSelectedPrimitiveDescriptor() == nullptr) THROW_IE_EXCEPTION << "Preferable primitive descriptor does not set."; - - if (srcMemPtr->GetData() != dstMemPtr->GetData()) { - InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; - auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - precision = getCnnLayer()->outData[0]->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; - auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - - auto dims = getParentEdgeAt(0)->getDims(); - - srcMem.reset(new MKLDNNMemory(getEngine())); - srcMem->Create(dims, inputDataType, MKLDNNMemory::GetPlainFormat(dims)); - - dstMem.reset(new MKLDNNMemory(getEngine())); - dstMem->Create(getChildEdgeAt(0)->getDims(), outputDataType, - MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims()), srcMem->GetData()); - - if (srcMemPtr->GetSize() == srcMem->GetSize()) { - srcPrim.reset(new 
mkldnn::reorder(srcMemPtr->GetPrimitive(), srcMem->GetPrimitive())); - } else { - // Autoblocking mode - memory::dims dims = srcMem->GetDims(); // contains logical dims - - memory::desc src_d = srcMemPtr->GetDescriptor(); - void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle(); - - for (int i = 0; i < dims.size(); i++) - src_d.data.dims[i] = dims[i]; - - memory::primitive_desc tmp_src_pd(src_d, getEngine()); - src_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - src_blocked->Create(src_d, src_data_hdl); - - srcPrim.reset(new mkldnn::reorder(src_blocked->GetPrimitive(), srcMem->GetPrimitive())); - } - - if (dstMemPtr->GetSize() == dstMem->GetSize()) { - dstPrim.reset(new mkldnn::reorder(dstMem->GetPrimitive(), dstMemPtr->GetPrimitive())); - } else { - // Autoblocking mode - memory::dims dims = srcMem->GetDims(); - - memory::desc dst_d = dstMemPtr->GetDescriptor(); - void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle(); - - for (int i = 0; i < dims.size(); i++) - dst_d.data.dims[i] = dims[i]; - - dst_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - dst_blocked->Create(dst_d, dst_data_hdl); - - dstPrim.reset(new mkldnn::reorder(dst_blocked->GetPrimitive(), dstMemPtr->GetPrimitive())); - } - } -} - -void MKLDNNReshapeNode::setDynamicBatchLim(int lim) { - dynBatchLim = lim; - if (srcPrim && dstPrim) { - auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); - auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr(); - memory::desc src_d = srcMemPtr->GetDescriptor(); - memory::desc dst_d = dstMemPtr->GetDescriptor(); - void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle(); - void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle(); - srcMem = std::make_shared<MKLDNNMemory>(getEngine()); - src_d.data.dims[0] = batchToProcess(); - srcMem->Create(src_d, src_data_hdl); - dstMemPtr = std::make_shared<MKLDNNMemory>(getEngine()); - src_d.data.dims[0] = batchToProcess(); - dstMemPtr->Create(src_d, src_data_hdl); - - if (src_blocked && dst_blocked) { - src_d = src_blocked->GetDescriptor(); - dst_d = dst_blocked->GetDescriptor(); - src_data_hdl = src_blocked->GetPrimitive().get_data_handle(); - dst_data_hdl = dst_blocked->GetPrimitive().get_data_handle(); - } - src_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - src_d.data.dims[0] = batchToProcess(); - src_blocked->Create(src_d, src_data_hdl); - - dst_blocked = std::make_shared<MKLDNNMemory>(getEngine()); - dst_d.data.dims[0] = batchToProcess(); - dst_blocked->Create(dst_d, dst_data_hdl); - srcPrim = std::make_shared<mkldnn::reorder>(src_blocked->GetPrimitive(), srcMem->GetPrimitive()); - dstPrim = std::make_shared<mkldnn::reorder>(dst_blocked->GetPrimitive(), dstMemPtr->GetPrimitive()); - } -} - -void MKLDNNReshapeNode::execute(mkldnn::stream strm) { - if (srcPrim && dstPrim) { - if (src_blocked) - src_blocked->GetPrimitive().set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); - if (dst_blocked) - dst_blocked->GetPrimitive().set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle()); - strm.submit({*srcPrim, *dstPrim}); - } } bool MKLDNNReshapeNode::created() const { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h index eeb666008..bb30099c9 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // 
SPDX-License-Identifier: Apache-2.0 // @@ -21,19 +20,10 @@ public: void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; void createPrimitive() override; - void execute(mkldnn::stream strm) override; bool created() const override; - void setDynamicBatchLim(int lim) override; private: static Register<MKLDNNReshapeNode> reg; - std::shared_ptr<mkldnn::primitive> srcPrim; - std::shared_ptr<mkldnn::primitive> dstPrim; - MKLDNNMemoryPtr srcMem; - MKLDNNMemoryPtr dstMem; - - MKLDNNMemoryPtr dst_blocked; - MKLDNNMemoryPtr src_blocked; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp index a474ca926..ba3228543 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp @@ -1,12 +1,11 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // #include "mkldnn_rnn.h" #include "mkldnn_extension_utils.h" #include "desc_iterator.hpp" -#include <ie_layers.h> +#include <ie_layers_prv.h> #include <string> #include <utility> @@ -16,39 +15,143 @@ using namespace InferenceEngine; namespace MKLDNNPlugin { -MKLDNNRNN::MKLDNNRNN(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {} +template <typename T, typename P> +inline bool one_of(T val, P item) { return val == item; } +template <typename T, typename P, typename... Args> +inline bool one_of(T val, P item, Args... item_others) { + return val == item || one_of(val, item_others...); +} + +rnn_direction ie2mkl(RNNLayer::Direction &direction) { + return direction == RNNLayer::RNN_FWD ? unidirectional_left2right + : direction == RNNLayer::RNN_BWD ? unidirectional_right2left + : direction == RNNLayer::RNN_BDR ? bidirectional_concat + : unidirectional; +} + +MKLDNNRNN::MKLDNNRNN(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) { + is_cell = layer->type == "LSTMCell"; +} bool MKLDNNRNN::created() const { - return getType() == RNN; + return getType() == (is_cell ? LSTMCell : RNN); } void MKLDNNRNN::getSupportedDescriptors() { + if (is_cell) + fillCellDesc(); + else + fillSeqDesc(); +} + +void MKLDNNRNN::fillCellDesc() { + if (!descs.empty()) return; + auto cellLayer = std::dynamic_pointer_cast<InferenceEngine::LSTMCell>(getCnnLayer()); + + if (!cellLayer) + THROW_IE_EXCEPTION << "Wrong RNN layer representation. 
Cannot cast to LSTMCell."; + + auto &ins = cellLayer->insData; + auto &outs = cellLayer->outData; + + if (ins.size() != 3) + THROW_IE_EXCEPTION << "Incorrect number of input ports for layer " << getName(); + if (outs.size() != 2) + THROW_IE_EXCEPTION << "Incorrect number of output ports for layer " << getName(); + + auto in_data_dims = getParentEdgeAt(0)->getDims(); + auto in_h_state_dims = getParentEdgeAt(1)->getDims(); + auto in_c_state_dims = getParentEdgeAt(2)->getDims(); + + auto out_h_state_dims = getChildEdgeAt(0)->getDims(); + auto out_c_state_dims = getChildEdgeAt(1)->getDims(); + + if (in_data_dims.ndims() != 2 + || in_h_state_dims.ndims() != 2 + || in_c_state_dims.ndims() != 2 + || out_h_state_dims.ndims() != 2 + || out_c_state_dims.ndims() != 2) + THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); + + T = 1; + N = in_data_dims[0]; + DC = in_data_dims[1]; + SC = in_h_state_dims[1]; + + // Expected shapes + MKLDNNDims D_shape {N, DC}, S_shape {N, SC}; + + if (in_data_dims != D_shape + || in_h_state_dims != S_shape + || in_c_state_dims != S_shape + || out_h_state_dims != S_shape + || out_c_state_dims != S_shape) + THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); + + auto blobs = cellLayer->blobs; + Blob::Ptr weights, bias; + if (blobs.find("weights") != blobs.end()) weights = blobs["weights"]; + if (blobs.find("biases") != blobs.end()) bias = blobs["biases"]; + + if (!weights) + THROW_IE_EXCEPTION << "RNN Layer. Weights are not present."; + + if (weights->size() != G*SC*(SC+DC)) + THROW_IE_EXCEPTION << "RNN Layer. Weights size is not correct. Expected size:" << G*SC*(SC+DC); + + if (bias && bias->size() != G*SC) + THROW_IE_EXCEPTION << "RNN Layer. Biases size is not correct. Expected size:" << G*SC; + + // Shapes and attributes are correct. Can start internal initialization. + + in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc}; + out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc}; + + in_data_d = {{T, N, DC}, memory::f32, memory::tnc}; + out_data_d = {{T, N, SC}, memory::f32, memory::tnc}; + + w_data_d = {{L, D, DC, G, SC}, memory::f32, memory::ldigo}; + w_state_d = {{L, D, SC, G, SC}, memory::f32, memory::ldigo}; + + if (bias) + w_bias_d = {{L, D, G, SC}, memory::f32, memory::ldgo}; + + std::vector<TensorDesc> in_candidate; + in_candidate.emplace_back(MKLDNNMemoryDesc {D_shape, memory::f32, memory::nc}); + in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + + std::vector<TensorDesc> out_candidate; + out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + + createDescriptor(in_candidate, out_candidate); +} + +void MKLDNNRNN::fillSeqDesc() { if (!descs.empty()) return; auto rnnLayer = std::dynamic_pointer_cast<RNNLayer>(getCnnLayer()); if (!rnnLayer) THROW_IE_EXCEPTION << "Wrong RNN layer representation. 
Cannot cast to RNNLayer."; - if (rnnLayer->cellType == LSTM) - cellr_type = LSTM; - else + if (!one_of(rnnLayer->cellType, "LSTM")) THROW_IE_EXCEPTION << "RNN layer supports only LSTM like cell"; - swap_state = rnnLayer->params["swap_state"] == "YES"; + if (!one_of(rnnLayer->axis, 0, 1)) + THROW_IE_EXCEPTION << "RNN layer supports only sequence axis 0 or 1"; + nativeOrder = rnnLayer->axis == 0; - if (rnnLayer->_axis == 0) - nativeOrder = true; - else if (rnnLayer->_axis == 1) - nativeOrder = false; - else - THROW_IE_EXCEPTION << "RNN layer supports only sequence axis == 1"; + if (!one_of(rnnLayer->direction, RNNLayer::RNN_FWD, RNNLayer::RNN_BWD)) + THROW_IE_EXCEPTION << "RNN layer supports only unidirectional RNN layer"; + direction = ie2mkl(rnnLayer->direction); auto &ins = rnnLayer->insData; auto &outs = rnnLayer->outData; - if (ins.size() != 3 && ins.size() != 1) + if (!one_of(ins.size(), 3, 1)) THROW_IE_EXCEPTION << "Incorrect number of input ports for layer " << getName(); - if (outs.size() != 3 && outs.size() !=1) + if (!one_of(outs.size(), 3, 1)) THROW_IE_EXCEPTION << "Incorrect number of output ports for layer " << getName(); auto in_data_dims = getParentEdgeAt(0)->getDims(); @@ -62,31 +165,21 @@ void MKLDNNRNN::getSupportedDescriptors() { std::swap(out_data_dims[0], out_data_dims[1]); } - // IE specific order - seq = in_data_dims[0]; - batch = in_data_dims[1]; - data_len = in_data_dims[2]; - state_len = out_data_dims[2]; - - const int N = batch; - const int T = seq; - const int G = num_gates; - const int DC = data_len; - const int SC = state_len; - const int L = 1; // What is a L ?? - const int D = 1; - const int S = 2; - - if (out_data_dims != MKLDNNDims {T, N, SC}) - THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); + T = in_data_dims[0]; + N = in_data_dims[1]; + DC = in_data_dims[2]; + SC = out_data_dims[2]; - MKLDNNDims state_dims {batch, state_len}; + MKLDNNDims ID_shape {T, N, DC}, OD_shape {T, N, SC}, S_shape {N, SC}; + + if (out_data_dims != OD_shape) + THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName(); if (ins.size() == 3) { auto state_dims1 = getParentEdgeAt(1)->getDims(); auto stats_dims2 = getParentEdgeAt(2)->getDims(); - if (state_dims1 != state_dims || stats_dims2 != state_dims) + if (state_dims1 != S_shape || stats_dims2 != S_shape) THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName(); in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc}; @@ -96,7 +189,7 @@ void MKLDNNRNN::getSupportedDescriptors() { auto state_dims1 = getChildEdgeAt(1)->getDims(); auto stats_dims2 = getChildEdgeAt(2)->getDims(); - if (state_dims1 != state_dims || stats_dims2 != state_dims) + if (state_dims1 != S_shape || stats_dims2 != S_shape) THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName(); out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc}; @@ -133,8 +226,8 @@ void MKLDNNRNN::getSupportedDescriptors() { in_candidate.push_back(MKLDNNMemoryDesc{{N, T, DC}, memory::f32, memory::ntc}); if (ins.size() == 3) { - in_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc}); - in_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc}); + in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); } std::vector<TensorDesc> out_candidate; @@ -144,8 +237,8 @@ void MKLDNNRNN::getSupportedDescriptors() { 
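A worked instance of the weight-size checks in fillCellDesc() above, with illustrative channel counts:

constexpr int G = 4;                            // LSTM gates
constexpr int DC = 512, SC = 256;               // assumed input/state channels
constexpr int weights_sz = G * SC * (SC + DC);  // 4 * 256 * 768 = 786432 floats
constexpr int bias_sz = G * SC;                 // 1024 floats
static_assert(weights_sz == 786432 && bias_sz == 1024, "worked example");

The single IE weights blob carries both parts: the repack loop in createPrimitive() below walks it once, splitting each output row into its DC-wide data part (w_data_d) and SC-wide recurrent part (w_state_d).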
out_candidate.push_back(MKLDNNMemoryDesc{{N, T, SC}, memory::f32, memory::ntc}); if (outs.size() == 3) { - out_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc}); - out_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc}); + out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); + out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc}); } createDescriptor(in_candidate, out_candidate); @@ -156,7 +249,7 @@ void MKLDNNRNN::createDescriptor(const std::vector<TensorDesc> &inputDesc, MKLDNNDescriptor desc(std::shared_ptr<rnn_forward::desc>( new rnn_forward::desc(forward_scoring, {algorithm::vanilla_lstm, algorithm::eltwise_tanh }, - unidirectional, + direction, /* In Data */ in_data_d, /* In State */ in_state_d, /* Weights data */ w_data_d, @@ -194,13 +287,8 @@ void MKLDNNRNN::createPrimitive() { std::shared_ptr<rnn_forward::desc> d = descs[0]; rnn_forward::primitive_desc pd(*d, getEngine()); - auto src_data_mem = std::make_shared<MKLDNNMemory>(getEngine()); - src_data_mem->Create(in_data_d, getParentEdgeAt(0)->getMemoryPtr()->GetData()); - internalBlobMemory.push_back(src_data_mem); - - auto dst_data_mem = std::make_shared<MKLDNNMemory>(getEngine()); - dst_data_mem->Create(out_data_d, getChildEdgeAt(0)->getMemoryPtr()->GetData()); - internalBlobMemory.push_back(dst_data_mem); + auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr(); + auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr(); // create weight blobs (data and state part) auto w_data_mem = std::make_shared<MKLDNNMemory>(getEngine()); @@ -229,28 +317,27 @@ void MKLDNNRNN::createPrimitive() { * * Gate order * Caffe - IFOC, ONNX - IOFC - * IE - FICO, mkldnn - FIOC - * + * IE - FICO, mkldnn - IFCO */ - // FICO -> FIOC - const int gate_map[] = {0, 1, 3, 2}; + // FICO -> IFCO + const int gate_map[] = {1, 0, 2, 3}; auto ie_w_ptr = getCnnLayer()->blobs["weights"]->buffer().as<const float*>(); auto w_ptr = static_cast<float*>(w_data_mem->GetData()); auto r_ptr = static_cast<float*>(w_state_mem->GetData()); - const int step = state_len * num_gates; + const int step = SC * G; - for (int g = 0; g < num_gates; g++) { - for (int out_i = 0; out_i < state_len; out_i++) { - float *l_w_ptr = w_ptr + gate_map[g]*state_len + out_i; - float *l_r_ptr = r_ptr + gate_map[g]*state_len + out_i; - for (int in_i = 0; in_i < data_len; in_i++) { + for (int g = 0; g < G; g++) { + for (int out_i = 0; out_i < SC; out_i++) { + float *l_w_ptr = w_ptr + gate_map[g]*SC + out_i; + float *l_r_ptr = r_ptr + gate_map[g]*SC+ out_i; + for (int in_i = 0; in_i < DC; in_i++) { *l_w_ptr = *ie_w_ptr; ie_w_ptr++; l_w_ptr += step; } - for (int in_i = 0; in_i < state_len; in_i++) { + for (int in_i = 0; in_i < SC; in_i++) { *l_r_ptr = *ie_w_ptr; ie_w_ptr++; l_r_ptr += step; @@ -261,9 +348,9 @@ void MKLDNNRNN::createPrimitive() { if (w_bias_d) { auto ie_b_ptr = getCnnLayer()->blobs["biases"]->buffer().as<const float*>(); auto b_ptr = static_cast<float*>(w_bias_mem->GetData()); - for (int g = 0; g < num_gates; g++) { - float *l_b_ptr = b_ptr + gate_map[g]*state_len; - for (int out_i = 0; out_i < state_len; out_i++) { + for (int g = 0; g < G; g++) { + float *l_b_ptr = b_ptr + gate_map[g]*SC; + for (int out_i = 0; out_i < SC; out_i++) { *l_b_ptr = *ie_b_ptr; ie_b_ptr++; l_b_ptr++; @@ -293,37 +380,35 @@ void MKLDNNRNN::createPrimitive() { src_stat_1.get_primitive_desc().get_size()); internalBlobMemory.push_back(high_half_state_mem); - if (!swap_state) { - 
exec_before.emplace_back(src_stat_1, low_half_state_mem->GetPrimitive()); - exec_before.emplace_back(src_stat_2, high_half_state_mem->GetPrimitive()); - } else { - exec_before.emplace_back(src_stat_2, low_half_state_mem->GetPrimitive()); - exec_before.emplace_back(src_stat_1, high_half_state_mem->GetPrimitive()); - } + exec_before.emplace_back(src_stat_1, low_half_state_mem->GetPrimitive()); + exec_before.emplace_back(src_stat_2, high_half_state_mem->GetPrimitive()); } auto dst_state_mem = std::make_shared<MKLDNNMemory>(getEngine()); dst_state_mem->Create(out_state_d); internalBlobMemory.push_back(dst_state_mem); if (out_state_d) { + int idx_H = is_cell ? 0 : 1; + int idx_C = is_cell ? 1 : 2; /* create copy/split primitive */ - auto dst_stat_1 = getChildEdgeAt(1)->getMemory().GetPrimitive(); - auto dst_stat_2 = getChildEdgeAt(2)->getMemory().GetPrimitive(); + auto dst_stat_1 = getChildEdgeAt(idx_H)->getMemory().GetPrimitive(); + auto dst_stat_2 = getChildEdgeAt(idx_C)->getMemory().GetPrimitive(); auto low_half_state_mem = std::make_shared<MKLDNNMemory>(getEngine()); low_half_state_mem->Create( dst_stat_1.get_primitive_desc().desc(), - src_state_mem->GetPrimitive().get_data_handle()); + dst_state_mem->GetPrimitive().get_data_handle()); internalBlobMemory.push_back(low_half_state_mem); auto high_half_state_mem = std::make_shared<MKLDNNMemory>(getEngine()); high_half_state_mem->Create( dst_stat_2.get_primitive_desc().desc(), - static_cast<uint8_t*>(src_state_mem->GetPrimitive().get_data_handle()) + + static_cast<uint8_t*>(dst_state_mem->GetPrimitive().get_data_handle()) + dst_stat_1.get_primitive_desc().get_size()); internalBlobMemory.push_back(high_half_state_mem); - exec_after.emplace_back(low_half_state_mem->GetPrimitive(), dst_stat_1); + + if (!is_cell) exec_after.emplace_back(low_half_state_mem->GetPrimitive(), dst_stat_1); exec_after.emplace_back(high_half_state_mem->GetPrimitive(), dst_stat_2); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h index a47fdf41c..4399c306a 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -28,18 +27,30 @@ public: void execute(mkldnn::stream strm) override; private: + void fillCellDesc(); + void fillSeqDesc(); + +private: static Register<MKLDNNRNN> reg; - InferenceEngine::CellType cellr_type = InferenceEngine::CellType::LSTM; + /** Specify mode Cell or Seq. true - Cell, false - Seq */ + bool is_cell = false; + /** Native order if [batch, seq, data], other case is [seq, batch, data] */ bool nativeOrder = true; - bool swap_state = false; - int batch = 0; - int seq = 0; - int data_len = 0; - int state_len = 0; - const size_t num_gates = 4; + /** Direction of iteration through sequence dimension */ + mkldnn::rnn_direction direction = mkldnn::unidirectional; + + // Internal attributes + int N = 0; /**< Batch value */ + int T = 0; /**< Sequence value */ + int DC = 0; /**< Input data channel size */ + int SC = 0; /**< State channel size value */ + const int G = 4; /**< Gate size. 4 for LSTM */ + const int L = 1; /**< What is it??. Constant for mkldnn impl */ + const int D = 1; /**< Num of direction. 1 or 2 */ + const int S = 2; /**< Num of state. 2 for LSTM (hidden and sell state). 
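The gate_map above converts IE's FICO gate order into the IFCO order mkldnn expects: source gate g lands at destination slot gate_map[g]. A minimal version of that permutation on a bias-shaped [G, SC] buffer, detached from the plugin's strided pointer arithmetic:

static void repack_gates_fico_to_ifco(const float* src, float* dst, int SC) {
    const int gate_map[4] = {1, 0, 2, 3};  // F->1, I->0, C->2, O->3
    for (int g = 0; g < 4; g++)
        for (int c = 0; c < SC; c++)
            dst[gate_map[g] * SC + c] = src[g * SC + c];
}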
*/ MKLDNNMemoryDesc in_data_d; MKLDNNMemoryDesc out_data_d; @@ -51,6 +62,7 @@ private: MKLDNNMemoryDesc w_state_d; MKLDNNMemoryDesc w_bias_d; + // List of in/out reorders if required std::vector<mkldnn::reorder> exec_before; std::vector<mkldnn::reorder> exec_after; }; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp index 7d76243f9..4088a1f7a 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h index 401a1c7d3..ca2bafd4f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp index 0738f0054..752172733 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h index 792a634c9..8e199f377 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp index 618479c22..90cf4f401 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -24,16 +23,15 @@ void MKLDNNSplitNode::getSupportedDescriptors() { if (splitLayer == nullptr) THROW_IE_EXCEPTION << "Cannot convert split layer."; - axis = splitLayer->_axis; - - if (axis != 1) - THROW_IE_EXCEPTION << "Split support only axis 1."; - if (getParentEdges().size() != 1) THROW_IE_EXCEPTION << "Incorrect number of input nodes."; if (getChildEdges().empty()) THROW_IE_EXCEPTION << "Incorrect number of output nodes."; + axis = splitLayer->_axis; + if (axis >= getParentEdgeAt(0)->getDims().ndims()) + THROW_IE_EXCEPTION << "Invalid value of axis parameter in split layer"; + // WA. 
Check applicability and limitations for (size_t i = 1; i < getCnnLayer()->outData.size(); i++) { int num_port_connection = getCnnLayer()->outData[i]->inputTo.size(); @@ -72,7 +70,7 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { if (srcDims.ndims() < 2) THROW_IE_EXCEPTION << "Split " << getName() << " isn't supported 1d blobs"; - auto num_chanels = 0; + auto axis_size = 0; auto dstFirstDims = getChildEdgeAt(0)->getDims(); for (size_t i = 0; i < outDims.size(); i++) { auto o_Dims = outDims[i]; @@ -83,15 +81,15 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { config.outConfs[i].inPlace = -1; config.outConfs[i].constant = false; config.outConfs[i].desc = MKLDNNMemoryDesc(o_Dims, outputDataType, memory::format::any); - num_chanels += o_Dims[1]; + axis_size += o_Dims[axis]; for (size_t j = 0; j < dstFirstDims.ndims(); j++) { if (j == axis) continue; if (o_Dims[j] != dstFirstDims[j]) - THROW_IE_EXCEPTION << "Split " << getName() << "has incorrect output dimensions"; + THROW_IE_EXCEPTION << "Split " << getName() << " has incorrect output dimensions"; } } - dstFirstDims[1] = num_chanels; + dstFirstDims[axis] = axis_size; if (dstFirstDims.size() != srcDims.size()) THROW_IE_EXCEPTION << "The sizes of input blob and sum of output blobs are not equal."; supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref); @@ -99,11 +97,10 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { auto numOfDim = static_cast<size_t>(srcDims.ndims()); SizeVector order; - SizeVector offsets; + SizeVector offsets(numOfDim, 0lu); size_t offset = std::numeric_limits<size_t>::max(); for (size_t i = 0; i < numOfDim; i++) { order.push_back(i); - offsets.push_back(0); } SizeVector strides(numOfDim); @@ -125,23 +122,23 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { } supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); - if (numOfDim != 4) + if ((numOfDim != 4 && numOfDim != 5) || axis != 1) return; - order = {0, 1, 2, 3, 1}; - offsets = {0, 0, 0, 0, 0}; - numOfDim = 5; + order.push_back(1); + numOfDim = order.size(); + offsets = SizeVector(numOfDim, 0lu); // nChw8c and nChw16c - for (int sizeS : {8, 16}) { + for (size_t sizeS : {8lu, 16lu}) { SizeVector blkDims = srcDims.ToSizeVector(); if (blkDims[1] % sizeS) continue; - blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0); + blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu); blkDims.push_back(sizeS); strides.resize(numOfDim); - strides[numOfDim - 1] = 1; + strides[numOfDim - 1] = 1lu; for (size_t i = 2; i <= numOfDim; i++) { if (numOfDim - i < axis) { strides[numOfDim - i] = std::numeric_limits<size_t>::max(); @@ -160,9 +157,9 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() { canInplace = false; break; } - blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0); + blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 
1lu : 0lu); blkDims.push_back(sizeS); - config.outConfs[i].desc = TensorDesc(Precision::FP32, outDims, {blkDims, order, offset, offsets, strides}); + config.outConfs[i].desc = TensorDesc(Precision::FP32, outDims, {blkDims, order, offset, offsets, strides}); } if (canInplace) supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); @@ -190,18 +187,32 @@ void MKLDNNSplitNode::execute(mkldnn::stream strm) { int MB = batchToProcess(); auto srcBlob = getParentEdgeAt(0)->getBlob(); const auto *srcData = srcBlob->cbuffer().as<const float *>(); + + size_t outerSize = 1; + for (int i = 0; i < axis; i++) { + if (i == 0) + outerSize *= MB; + else + outerSize *= srcBlob->dims()[srcBlob->dims().size() - i - 1]; + } + size_t srcSize = getParentEdgeAt(0)->getMemory().GetSize(); - size_t src_batch_off = srcBlob->getTensorDesc().offset(srcBlob->size() / srcBlob->getTensorDesc().getDims()[0]) + size_t src_batch_off = srcBlob->getTensorDesc().offset(srcBlob->size() / outerSize) - srcBlob->getTensorDesc().offset(0); for (size_t i = 0, sIdx = 0; i < getChildEdges().size(); i++) { auto dstBlob = getChildEdgeAt(i)->getBlob(); auto *dstData = dstBlob->buffer().as<float *>(); - size_t dst_slice_size = dstBlob->size() / dstBlob->getTensorDesc().getDims()[0]; - size_t dst_batch_off = dstBlob->getTensorDesc().offset(dst_slice_size) - dstBlob->getTensorDesc().offset(0); - for (size_t dIdx = 0; dIdx < dst_slice_size; dIdx++, sIdx++) { - for (unsigned b = 0; b < MB; b++) { + size_t innerSize = 1; + for (size_t j = axis; j < dstBlob->dims().size(); j++) { + innerSize *= dstBlob->dims()[dstBlob->dims().size() - j - 1]; + } + + size_t dst_batch_off = dstBlob->getTensorDesc().offset(innerSize) - dstBlob->getTensorDesc().offset(0); + + for (size_t dIdx = 0; dIdx < innerSize; dIdx++, sIdx++) { + for (unsigned b = 0; b < outerSize; b++) { if (sIdx + b*src_batch_off >= srcSize) THROW_IE_EXCEPTION << "Incorrect configuration of split layer " << getName() << "!"; dstData[b * dst_batch_off + dstBlob->getTensorDesc().offset(dIdx)] = @@ -436,3 +447,13 @@ void MKLDNNSplitNode::initOptimalPrimitiveDescriptor() { } initDescriptor(config); } + +void MKLDNNSplitNode::setDynamicBatchLim(int lim) { + if (axis == 0) + THROW_IE_EXCEPTION << "Dynamic batch is not supported by split layer with axis == 0 parameter"; + + dynBatchLim = lim; + if (prim) { + prim.setBatchLimit(batchToProcess(), getParentEdges().size(), getChildEdges().size()); + } +} diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h index 7d4157768..905f8069c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -26,6 +25,8 @@ public: bool isOptimized(); void initOptimalPrimitiveDescriptor() override; + void setDynamicBatchLim(int lim) override; + private: static Register<MKLDNNSplitNode> reg; size_t axis = 1; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp index 204ea868d..122671681 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp @@ -1,5 +1,4 @@ // Copyright (C) 2018 Intel Corporation -// // SPDX-License-Identifier: Apache-2.0 // @@ -49,9 +48,11 @@ void MKLDNNTileNode::initSupportedPrimitiveDescriptors() 
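The generalized split above stops hard-coding axis 1 and instead views the blob as [outer, axis, inner], where outer is the product of the dimensions before the split axis and inner the product of those after it. A compact dense-layout sketch of that decomposition (names illustrative):

#include <cstring>
#include <vector>

// Copy one output slice [axis_off, axis_off + axis_len) of a split along `axis`.
static void split_slice(const float* src, float* dst, const std::vector<int>& dims,
                        int axis, int axis_off, int axis_len) {
    int outer = 1, inner = 1;
    for (int i = 0; i < axis; i++) outer *= dims[i];
    for (size_t i = axis + 1; i < dims.size(); i++) inner *= dims[i];
    for (int o = 0; o < outer; o++)
        std::memcpy(dst + o * axis_len * inner,
                    src + (o * dims[axis] + axis_off) * inner,
                    sizeof(float) * axis_len * inner);
}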
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
index 204ea868d..122671681 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -49,9 +48,11 @@ void MKLDNNTileNode::initSupportedPrimitiveDescriptors() {
         fmt = memory::format::nc;
     } else if (inDims.ndims() == 4) {
         fmt = memory::format::nchw;
+    } else if (inDims.ndims() == 5) {
+        fmt = memory::format::ncdhw;
     }
     if (fmt == memory::format::any) {
-        THROW_IE_EXCEPTION << "Tile " << getName() << " supports only 2d and 4d dimensions!";
+        THROW_IE_EXCEPTION << "Tile " << getName() << " supports only 2D, 4D and 5D dimensions!";
     }
 
     InferenceEngine::LayerConfig config;
@@ -101,14 +102,16 @@ void MKLDNNTileNode::execute(mkldnn::stream strm) {
         m_inner_dim *= batchToProcess();
     }
 
-    if (m_inner_dim == 1 && inDims.size() == 4 && m_outer_dim%8 == 0 && srcMemory.GetFormat() == memory::nChw8c) {
+    if (m_inner_dim == 1 && m_outer_dim % 8 == 0 && ((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw8c) ||
+            (inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw8c))) {
         /*
          * We may enable tile processing directly to appropriate output format (nChw8c)
          */
         m_inner_dim *= 8;
         m_outer_dim /= 8;
-    } else if (m_inner_dim == 1 && inDims.size() == 4 && m_outer_dim%16 == 0
-               && srcMemory.GetFormat() == memory::nChw16c) {
+    } else if (m_inner_dim == 1 && m_outer_dim % 16 == 0 &&
+            ((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw16c) ||
+             (inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw16c))) {
         /*
          * We may enable tile processing directly to appropriate output format (nChw16c)
          */
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
index 464c15017..d6a75941f 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
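The folding in execute() above relies on blocked layouts keeping a group of 8 (or 16) channels contiguous in memory, so a whole channel block can be absorbed into the tile's inner dimension. A hypothetical offset helper (not from the commit; shape and names invented) showing why channels c..c+7 of the same spatial position are adjacent in nChw8c:

```cpp
// Why a block of 8 channels is contiguous in nChw8c: the channel
// remainder (c % 8) is the innermost index in the layout.
#include <cstddef>
#include <iostream>

// Offset of logical element (n, c, h, w) in an nChw8c-laid-out buffer.
size_t offset_nChw8c(size_t n, size_t c, size_t h, size_t w,
                     size_t C, size_t H, size_t W) {
    const size_t block = 8;
    const size_t Cb = (C + block - 1) / block;  // number of channel blocks
    return (((n * Cb + c / block) * H + h) * W + w) * block + c % block;
}

int main() {
    // Channels 0..7 of the same (n, h, w) occupy 8 consecutive elements:
    std::cout << offset_nChw8c(0, 0, 0, 0, 16, 4, 4) << " "    // 0
              << offset_nChw8c(0, 7, 0, 0, 16, 4, 4) << "\n";  // 7
    return 0;
}
```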
diff --git a/inference-engine/src/mkldnn_plugin/perf_count.h b/inference-engine/src/mkldnn_plugin/perf_count.h
index 87f0c5ffc..3770a2435 100644
--- a/inference-engine/src/mkldnn_plugin/perf_count.h
+++ b/inference-engine/src/mkldnn_plugin/perf_count.h
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp b/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
new file mode 100644
index 000000000..24d2931af
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
@@ -0,0 +1,370 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "blob_dump.h"
+#include "blob_factory.hpp"
+#include "mkldnn_memory.h"
+
+// It's so bad to include by relative path :-(
+#include "../../thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp"
+
+#include <fstream>
+
+using namespace InferenceEngine;
+
+namespace MKLDNNPlugin {
+
+// IEB file format routines
+static unsigned char IEB_MAGIC[4] = {'I', 'E', 'B', '0'};
+static unsigned char NO_SCALES = 0xFF;
+
+struct IEB_HEADER {
+    unsigned char magic[4];
+    unsigned char ver[2];
+
+    unsigned char precision;  // 0-8
+    unsigned char ndims;
+    unsigned int  dims[7];    // max is 7-D blob
+
+    unsigned char scaling_axis;  // FF - no scaling
+    unsigned char reserved[3];
+
+    unsigned long data_offset;
+    unsigned long data_size;
+    unsigned long scaling_data_offset;
+    unsigned long scaling_data_size;
+};
+
+static IEB_HEADER prepare_header(const TensorDesc& desc) {
+    IEB_HEADER header;
+
+    header.magic[0] = IEB_MAGIC[0];
+    header.magic[1] = IEB_MAGIC[1];
+    header.magic[2] = IEB_MAGIC[2];
+    header.magic[3] = IEB_MAGIC[3];
+
+    // IEB file format version 0.1
+    header.ver[0] = 0;
+    header.ver[1] = 1;
+
+    header.precision = desc.getPrecision();
+
+    if (desc.getDims().size() > 7)
+        THROW_IE_EXCEPTION << "Dumper supports at most 7D blobs";
+
+    header.ndims = desc.getDims().size();
+    for (int i = 0; i < header.ndims; i++)
+        header.dims[i] = desc.getDims()[i];
+
+    header.scaling_axis = NO_SCALES;
+
+    return header;
+}
+
+static TensorDesc parse_header(IEB_HEADER &header) {
+    if (header.magic[0] != IEB_MAGIC[0] ||
+        header.magic[1] != IEB_MAGIC[1] ||
+        header.magic[2] != IEB_MAGIC[2] ||
+        header.magic[3] != IEB_MAGIC[3])
+        THROW_IE_EXCEPTION << "Dumper cannot parse file. Wrong format.";
+
+    if (header.ver[0] != 0 ||
+        header.ver[1] != 1)
+        THROW_IE_EXCEPTION << "Dumper cannot parse file. Unsupported IEB format version.";
+
+    Precision prc = Precision(static_cast<Precision::ePrecision>(header.precision));
+    SizeVector dims(header.ndims);
+    for (int i = 0; i < header.ndims; i++)
+        dims[i] = header.dims[i];
+
+    return TensorDesc {prc, dims, plain_layout(dims)};
+}
+
+
+bool is_plain(Blob::Ptr blob) {
+    bool res = true;
+
+    auto orig_strides = blob->getTensorDesc().getBlockingDesc().getStrides();
+    auto orig_order = blob->getTensorDesc().getBlockingDesc().getOrder();
+    auto dims = blob->getTensorDesc().getDims();
+
+    for (int stride = 1, i = dims.size()-1; i >= 0; --i) {
+        if (stride != orig_strides[i] || i != orig_order[i]) res = false;
+        stride *= dims[i];
+    }
+
+    return res;
+}
+
+static Blob::Ptr prepare_plain_data(Blob::Ptr blob) {
+    // check if it is already plain
+    if (is_plain(blob)) return blob;
+
+    Blob::Ptr pln_blob = make_plain_blob(blob->precision(), blob->getTensorDesc().getDims());
+    pln_blob->allocate();
+
+    // Copy to plain
+    MKLDNNMemoryDesc mdesc(blob->getTensorDesc());
+    mkldnn::memory::desc desc = mdesc;
+    mkldnn::impl::memory_desc_wrapper blob_wrp(desc.data);
+
+    size_t data_size = blob->size();
+
+    // TODO: make it with blob_copy utility
+    switch (blob->precision()) {
+        case Precision::FP32:
+        case Precision::I32: {
+            int32_t *pln_blob_ptr = pln_blob->buffer().as<int32_t*>();
+            int32_t *blob_ptr = blob->buffer().as<int32_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+            break;
+        }
+        case Precision::I16:
+        case Precision::U16: {
+            int16_t *pln_blob_ptr = pln_blob->buffer().as<int16_t*>();
+            int16_t *blob_ptr = blob->buffer().as<int16_t *>();
+            for (size_t i = 0; i < data_size; i++)
+                pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+            break;
+        }
+        case Precision::I8:
+        case Precision::U8: {
+            int8_t *pln_blob_ptr = pln_blob->buffer().as<int8_t*>();
+            int8_t *blob_ptr = blob->buffer().as<int8_t *>();
+            for (size_t i = 0; i < data_size; i++)
+                pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+            break;
+        }
+        default:
+            THROW_IE_EXCEPTION << "Dumper. Unsupported precision";
+    }
+
+    return pln_blob;
+}
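The is_plain() test above amounts to checking that the strides are the dense row-major strides and the dimension order is the identity permutation. A self-contained illustration with invented NCHW numbers:

```cpp
// A blob is "plain" when walking the dims from innermost to outermost
// reproduces its strides exactly and no dimension is permuted.
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const std::vector<size_t> dims    = {2, 3, 4, 5};    // NCHW shape
    const std::vector<size_t> strides = {60, 20, 5, 1};  // dense strides
    const std::vector<size_t> order   = {0, 1, 2, 3};    // identity order

    bool plain = true;
    for (size_t stride = 1, i = dims.size(); i-- > 0;) {
        if (stride != strides[i] || i != order[i]) plain = false;
        stride *= dims[i];
    }
    std::cout << (plain ? "plain" : "not plain") << "\n";  // prints "plain"
    return 0;
}
```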
+void BlobDumper::dump(std::ostream &stream) {
+    if (!_blob)
+        THROW_IE_EXCEPTION << "Dumper cannot dump empty Blob";
+
+    if (_blob->buffer().as<float*>() == nullptr)
+        THROW_IE_EXCEPTION << "Dumper cannot dump. Blob is not allocated.";
+
+    IEB_HEADER header = prepare_header(_blob->getTensorDesc());
+    Blob::Ptr pln_blob = prepare_plain_data(_blob);
+
+    header.data_offset = sizeof(header);
+    header.data_size = pln_blob->byteSize();
+    header.scaling_data_offset = 0;
+    header.scaling_data_size = 0;
+
+    if (_scales) {
+        header.scaling_axis = 1;
+        header.scaling_data_offset = header.data_offset + header.data_size;
+        header.scaling_data_size = _scales->byteSize();
+    }
+
+    stream.write(reinterpret_cast<char*>(&header), sizeof(header));
+    stream.write(pln_blob->buffer().as<char*>(), pln_blob->byteSize());
+
+    if (_scales) {
+        stream.write(_scales->buffer().as<char*>(), _scales->byteSize());
+    }
+}
+
+void BlobDumper::dumpAsTxt(std::ostream &stream) {
+    if (!_blob)
+        THROW_IE_EXCEPTION << "Dumper cannot dump empty Blob";
+
+    if (_blob->buffer().as<float*>() == nullptr)
+        THROW_IE_EXCEPTION << "Dumper cannot dump. Blob is not allocated.";
+
+    SizeVector dims = _blob->getTensorDesc().getDims();
+
+    // Header like "U8 4D shape: 2 3 224 224 (<size>)"
+    stream << _blob->precision().name() << " "
+           << dims.size() << "D "
+           << "shape: ";
+    for (size_t d : dims) stream << d << " ";
+    stream << "(" << _blob->size() << ")" << std::endl;
+
+    // Dump data
+    MKLDNNMemoryDesc mdesc(_blob->getTensorDesc());
+    mkldnn::memory::desc desc = mdesc;
+    mkldnn::impl::memory_desc_wrapper blob_wrp(desc.data);
+
+    size_t data_size = _blob->size();
+    switch (_blob->precision()) {
+        case Precision::FP32: {
+            auto *blob_ptr = _blob->buffer().as<float*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << blob_ptr[blob_wrp.off_l(i)] << std::endl;
+            break;
+        }
+        case Precision::I32: {
+            auto *blob_ptr = _blob->buffer().as<int32_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << blob_ptr[blob_wrp.off_l(i)] << std::endl;
+            break;
+        }
+        case Precision::I16: {
+            auto *blob_ptr = _blob->buffer().as<int16_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        case Precision::U16: {
+            auto *blob_ptr = _blob->buffer().as<uint16_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        case Precision::I8: {
+            auto *blob_ptr = _blob->buffer().as<int8_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        case Precision::U8: {
+            auto *blob_ptr = _blob->buffer().as<uint8_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        default:
+            THROW_IE_EXCEPTION << "Dumper. Unsupported precision";
+    }
+}
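From the formatting code above, the text dump for, say, a 1x2x2x2 FP32 blob of zeros and ones would start roughly like this (exact float formatting depends on the stream's defaults; values here are invented):

```
FP32 4D shape: 1 2 2 2 (8)
0
1
0
1
...
```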
+BlobDumper BlobDumper::read(std::istream &stream) {
+    IEB_HEADER header;
+    stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+
+    TensorDesc desc = parse_header(header);
+    Blob::Ptr blob = make_blob_with_precision(desc);
+    blob->allocate();
+
+    stream.seekg(header.data_offset, stream.beg);
+    stream.read(blob->buffer().as<char*>(), header.data_size);
+
+    BlobDumper res(blob);
+
+    // Parse scales fields.
+    if (header.scaling_axis != NO_SCALES) {
+        if (header.scaling_axis != 1)
+            THROW_IE_EXCEPTION << "Dumper supports scaling only for channel dims.";
+
+        size_t scl_size = header.scaling_data_size / sizeof(float);
+        auto scl = make_blob_with_precision({Precision::FP32, {scl_size}, C});
+        scl->allocate();
+
+        stream.seekg(header.scaling_data_offset, stream.beg);
+        stream.read(scl->buffer().as<char*>(), header.scaling_data_size);
+
+        res._scales = scl;
+    }
+    return res;
+}
+
+BlobDumper BlobDumper::read(const std::string &file_path) {
+    std::ifstream file;
+    file.open(file_path);
+    if (!file.is_open())
+        THROW_IE_EXCEPTION << "Dumper cannot open file " << file_path;
+
+    auto res = read(file);
+    file.close();
+    return res;
+}
+
+void BlobDumper::dump(const std::string &dump_path) {
+    std::ofstream dump_file;
+    dump_file.open(dump_path);
+    if (!dump_file.is_open())
+        THROW_IE_EXCEPTION << "Dumper cannot create dump file";
+
+    dump(dump_file);
+    dump_file.close();
+}
+
+void BlobDumper::dumpAsTxt(const std::string dump_path) {
+    std::ofstream dump_file;
+    dump_file.open(dump_path);
+    if (!dump_file.is_open())
+        THROW_IE_EXCEPTION << "Dumper cannot create dump file";
+
+    dumpAsTxt(dump_file);
+    dump_file.close();
+}
+
+Blob::Ptr BlobDumper::get() {
+    return _blob;
+}
+
+template <typename data_t>
+static void plain_copy(const Blob::Ptr &from, const Blob::Ptr &scls, Blob::Ptr &to) {
+    auto dims = from->getTensorDesc().getDims();
+
+    size_t data_size = from->size();
+    size_t outer_size = dims[0];
+    size_t c_size = dims.size() > 1 ? dims[1] : 1;
+    size_t inner_size = dims.size() == 4 ? dims[2]*dims[3] :
+                        dims.size() == 3 ? dims[2] : 1;
+
+    auto to_data = to->buffer().as<float*>();
+    auto from_data = from->buffer().as<data_t*>();
+
+    if (scls) {
+        auto scls_data = scls->buffer().as<float*>();
+
+        for (size_t o=0; o < outer_size; o++)
+            for (size_t c=0; c < c_size; c++)
+                for (size_t i=0; i < inner_size; i++)
+                    *to_data++ = static_cast<float>(*from_data++) * scls_data[c];
+    } else {
+        for (size_t i=0; i < data_size; i++)
+            *to_data++ = static_cast<float>(*from_data++);
+    }
+}
+
+Blob::Ptr BlobDumper::getRealValue() {
+    if (_blob->precision() == Precision::FP32 && !_scales)
+        return _blob;
+
+    auto res = make_plain_blob(Precision::FP32, _blob->getTensorDesc().getDims());
+    res->allocate();
+
+    switch (_blob->precision()) {
+        case Precision::U8:   plain_copy<uint8_t>(_blob, _scales, res); break;
+        case Precision::FP32: plain_copy<float>(_blob, _scales, res); break;
+        case Precision::I8:   plain_copy<int8_t>(_blob, _scales, res); break;
+        default: THROW_IE_EXCEPTION << "Unsupported precision for getRealValue method.";
+    }
+
+    return res;
+}
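The scaled branch of plain_copy() above is a per-channel dequantization: real[n][c][i] = quantized[n][c][i] * scales[c]. A minimal standalone sketch with made-up values:

```cpp
// Per-channel rescaling: every element of channel c is multiplied
// by scales[c]; shapes and values are invented for illustration.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const size_t N = 1, C = 2, HW = 3;
    std::vector<uint8_t> quantized = {10, 20, 30, 40, 50, 60};  // N*C*HW
    std::vector<float> scales = {0.5f, 0.1f};                   // one per channel
    std::vector<float> real(quantized.size());

    for (size_t n = 0, i = 0; n < N; n++)
        for (size_t c = 0; c < C; c++)
            for (size_t s = 0; s < HW; s++, i++)
                real[i] = static_cast<float>(quantized[i]) * scales[c];

    std::cout << real[0] << " " << real[3] << "\n";  // 5 and 4
    return 0;
}
```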
+BlobDumper& BlobDumper::withScales(InferenceEngine::Blob::Ptr scales) {
+    if (_blob->getTensorDesc().getDims().size() < 2 ||
+        scales->getTensorDesc().getDims().size() != 1 ||
+        scales->getTensorDesc().getDims()[0] != _blob->getTensorDesc().getDims()[1] ||
+        scales->getTensorDesc().getPrecision() != Precision::FP32)
+        THROW_IE_EXCEPTION << "Dumper cannot use passed scales. Blob has incompatible shape.";
+
+    _scales = scales;
+    return *this;
+}
+
+BlobDumper& BlobDumper::withoutScales() {
+    _scales.reset();
+    return *this;
+}
+
+const InferenceEngine::Blob::Ptr& BlobDumper::getScales() const {
+    return _scales;
+}
+
+}  // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/utils/blob_dump.h b/inference-engine/src/mkldnn_plugin/utils/blob_dump.h
new file mode 100644
index 000000000..4130d53a7
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/utils/blob_dump.h
@@ -0,0 +1,50 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_blob.h"
+
+#include <string>
+
+namespace MKLDNNPlugin {
+
+/**
+ * Utility class to dump blob content in plain format.
+ * All layout information is lost.
+ *
+ * For low-precision blobs it allows storing the data
+ * together with per-channel scaling factors.
+ * NB! Channel is the second dimension for all blob types.
+ */
+class BlobDumper {
+    InferenceEngine::Blob::Ptr _blob;
+    InferenceEngine::Blob::Ptr _scales;
+
+public:
+    BlobDumper() = default;
+    BlobDumper(const BlobDumper&) = default;
+    BlobDumper& operator = (BlobDumper&&) = default;
+
+    explicit BlobDumper(const InferenceEngine::Blob::Ptr blob) : _blob(blob) {}
+
+    static BlobDumper read(const std::string &file_path);
+    static BlobDumper read(std::istream &stream);
+
+    void dump(const std::string &file_path);
+    void dump(std::ostream &stream);
+
+    void dumpAsTxt(const std::string file_path);
+    void dumpAsTxt(std::ostream &stream);
+
+    BlobDumper& withScales(InferenceEngine::Blob::Ptr scales);
+    BlobDumper& withoutScales();
+
+    const InferenceEngine::Blob::Ptr& getScales() const;
+
+    InferenceEngine::Blob::Ptr get();
+    InferenceEngine::Blob::Ptr getRealValue();
+};
+
+}  // namespace MKLDNNPlugin
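A hedged usage sketch of the API declared above (not part of the commit; assumes an allocated InferenceEngine blob and a writable working directory):

```cpp
// Dump a blob in both IEB and text form, then read it back and
// recover FP32 values (applying stored per-channel scales, if any).
#include "blob_dump.h"

void dump_example(const InferenceEngine::Blob::Ptr &blob) {
    using MKLDNNPlugin::BlobDumper;

    BlobDumper dumper(blob);
    dumper.dump("blob.ieb");       // binary IEB file
    dumper.dumpAsTxt("blob.txt");  // human-readable text file

    // Round-trip through the IEB file.
    BlobDumper restored = BlobDumper::read("blob.ieb");
    InferenceEngine::Blob::Ptr real = restored.getRealValue();
}
```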