diff options
Diffstat (limited to 'inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.cpp')
-rw-r--r-- | inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.cpp | 44 |
1 file changed, 30 insertions, 14 deletions
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.cpp index 093ec49dd..c00f8ad0f 100644 --- a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.cpp +++ b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.cpp @@ -47,7 +47,13 @@ void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward() { int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking); const size_t work_amount = MB * jcp.ngroups * ocb_work * jcp.oh; - auto ker = [&](const int ithr, const int nthr) { + if (conf_.want_padded_bias()) { + for (int oc = 0; oc < jcp.oc_without_padding; ++oc) + padded_bias_[oc] = bias[oc]; + bias = padded_bias_; + } + + parallel(0, [&](const int ithr, const int nthr) { size_t start{ 0 }, end{ 0 }; balance211(work_amount, nthr, ithr, start, end); @@ -59,7 +65,7 @@ void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward() { icb_step = icb_step_rem; size_t n{0}, g{0}, ocbb{0}, oh{0}; - nd_iterator_init(start, n, MB, g, jcp.ngroups, ocbb, ocb_work, + nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); for (size_t iwork = start; iwork < end; ++iwork) { int ocb = ocbb * jcp.nb_oc_blocking; @@ -98,7 +104,7 @@ void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward() { par_conv.flags |= FLAG_IC_FIRST; } - if (jcp.with_eltwise && icb + 1 == jcp.nb_ic) { + if (icb + 1 == jcp.nb_ic) { par_conv.flags |= FLAG_IC_LAST; } @@ -110,6 +116,9 @@ void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward() { - div_up(i_t_overflow, (jcp.dilate_h + 1)) - div_up(i_b_overflow, (jcp.dilate_h + 1)); par_conv.kh_padding = nstl::max(0, kh_padding); + + par_conv.oc_off = _oc * jcp.oc_block * sizeof(float); + kernel_->jit_ker(&par_conv); } nd_iterator_step(n, MB, g, jcp.ngroups, ocbb, ocb_work, @@ -117,12 +126,7 @@ void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward() { } icbb += icb_step; } - }; - -#pragma omp parallel - { - 
ker(omp_get_thread_num(), omp_get_num_threads()); - } + }); } template <bool with_relu> @@ -140,6 +144,8 @@ void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward_fusing() { const auto &jcp_dw = kernel_dw_->jcp; int MB = conf_.MB(); + auto dw_bias = jcp.dw_conv_biases; + int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking); const size_t work_amount = MB * jcp.ngroups * ocb_work * jcp.oh; @@ -187,7 +193,7 @@ void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward_fusing() { par_conv.flags |= FLAG_IC_FIRST; } - if (jcp.with_eltwise && icb + 1 == jcp.nb_ic) { + if (icb + 1 == jcp.nb_ic) { par_conv.flags |= FLAG_IC_LAST; } @@ -199,6 +205,9 @@ void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward_fusing() { - div_up(i_t_overflow, (jcp.dilate_h + 1)) - div_up(i_b_overflow, (jcp.dilate_h + 1)); par_conv.kh_padding = nstl::max(0, kh_padding); + + par_conv.oc_off = _oc * jcp.oc_block * sizeof(float); + kernel_->jit_ker(&par_conv); } } @@ -222,7 +231,7 @@ void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward_fusing() { par_conv_dw.kh_padding = jcp_dw.kh; par_conv_dw.filt = &jcp.dw_conv_weights[chb * jcp_dw.kh * jcp_dw.kw * jcp_dw.ch_block]; - par_conv_dw.bias = &jcp.dw_conv_biases[chb * jcp_dw.ch_block]; + par_conv_dw.bias = &dw_bias[chb * jcp_dw.ch_block]; par_conv_dw.ur_w = (size_t)(jcp_dw.ow); kernel_dw_->jit_ker(&par_conv_dw); @@ -261,10 +270,17 @@ void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward_fusing() { } }; - #pragma omp parallel - { - ker(omp_get_thread_num(), omp_get_num_threads()); + if (conf_.want_padded_bias()) { + for (int oc = 0; oc < jcp.oc_without_padding; ++oc) + padded_bias_[oc] = bias[oc]; + bias = padded_bias_; + + for (int oc = 0; oc < jcp.oc_without_padding; ++oc) + dw_padded_bias_[oc] = dw_bias[oc]; + dw_bias = dw_padded_bias_; } + + parallel(0, ker); } template void _jit_sse42_convolution_fwd_t<true>::execute_forward(); |