diff options
Diffstat (limited to 'runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc')
-rw-r--r-- | runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc | 137 |
1 files changed, 65 insertions, 72 deletions
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc index 089c783c1..ae740bb10 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #include "internal/layers/SimpleEmbeddingLookup.h" #include <arm_compute/runtime/CL/CLScheduler.h> @@ -6,7 +21,8 @@ void SimpleEmbeddingLookup::configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *values, ::arm_compute::ITensor *output) { - // Assume that verification of operands are already done at Planner::visit() + assert(values->info()->num_dimensions() == output->info()->num_dimensions()); + assert(values->info()->num_dimensions() > 1 && values->info()->num_dimensions() <= 4); _lookups = lookups; _values = values; _output = output; @@ -25,85 +41,62 @@ void SimpleEmbeddingLookup::run() // type of elements of lookups is always integer const int32_t *lookups_buf = reinterpret_cast<int32_t *>(_lookups->buffer()); - const auto values_buf = _values->buffer(); - auto output_buf = _output->buffer(); const auto lookups_info = _lookups->info(); const auto values_info = _values->info(); const auto output_info = _output->info(); - // TODO Refactor below duplicated code! - const auto values_rank = values_info->num_dimensions(); - switch (values_rank) + // NOTE The first dimension's position is always at the end of dimensions. 
+ const auto first_dim_pos = values_info->num_dimensions() - 1; + + const size_t first_dim = values_info->dimension(first_dim_pos); + for (size_t i = 0; i < lookups_info->dimension(0); ++i) { - case 2: - // (H,W) in nnapi -> (W,H) in acl - { - const size_t row_size = values_info->dimension(1); - const size_t row_bytes = values_info->total_size() / row_size; - for (size_t i = 0; i < lookups_info->dimension(0); ++i) - { - if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) - throw std::runtime_error("Embedding Lookup: index out of bounds."); - - size_t idx = lookups_buf[i]; - size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, idx}); - size_t row_offset_by_i = output_info->offset_element_in_bytes({0, i}); - - unsigned char *sink_addr = output_buf + row_offset_by_i; - unsigned char *source_addr = values_buf + row_offset_by_idx; - memcpy(sink_addr, source_addr, row_bytes); - } - } - break; - case 3: - // (B,H,W) in nnapi -> (W,H,B) in acl - { - const size_t row_size = values_info->dimension(2); - const size_t row_bytes = values_info->total_size() / row_size; - for (size_t i = 0; i < lookups_info->dimension(0); ++i) - { - if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) - throw std::runtime_error("Embedding Lookup: index out of bounds."); - - size_t idx = lookups_buf[i]; - size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, idx}); - size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, i}); - - unsigned char *sink_addr = output_buf + row_offset_by_i; - unsigned char *source_addr = values_buf + row_offset_by_idx; - memcpy(sink_addr, source_addr, row_bytes); - } - } - break; - case 4: - // (N,H,W,C) in nnapi -> (N,C,H,W) in acl - { - const size_t row_size = values_info->dimension(3); - const size_t row_bytes = values_info->total_size() / row_size; - for (size_t i = 0; i < lookups_info->dimension(0); ++i) - { - if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) - throw std::runtime_error("Embedding Lookup: index 
out of bounds."); - - size_t idx = lookups_buf[i]; - size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, 0, idx}); - size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, 0, i}); - - unsigned char *sink_addr = output_buf + row_offset_by_i; - unsigned char *source_addr = values_buf + row_offset_by_idx; - memcpy(sink_addr, source_addr, row_bytes); - } - } - break; - case 1: - // In this case, shape of values actually is matrix but the height(row size) is 1 in acl. If - // row size is 1, this op is not needed and it means this situtation could be wrong. - throw std::runtime_error("Wrong usage of EmbeddingLookup op!"); - default: - throw std::runtime_error("Not supported rank!"); + if (lookups_buf[i] < 0 || lookups_buf[i] >= first_dim) + throw std::runtime_error("Embedding Lookup: index out of bounds."); } + // If each strides of values and output are different, applied padding size of the two tensors are + // different, therefore, it can not be copied at once. 
+ auto can_copy_at_once = [&]() -> bool { + const auto &values_strides = values_info->strides_in_bytes(); + const auto &output_strides = output_info->strides_in_bytes(); + + for (size_t i = 0; i < first_dim_pos; ++i) + { + if (values_strides[i] != output_strides[i]) + return false; + } + + return true; + }; + + using ::arm_compute::Window; + using ::arm_compute::Iterator; + + size_t copy_bytes; + Window window; + if (can_copy_at_once()) + { + copy_bytes = values_info->total_size() / first_dim; + window.use_tensor_dimensions(output_info->tensor_shape(), first_dim_pos); + } + else + { + copy_bytes = values_info->dimension(0) * values_info->element_size(); + window.use_tensor_dimensions(output_info->tensor_shape(), Window::DimY); + } + + Iterator it(_output, window); + execute_window_loop(window, + [&](const ::arm_compute::Coordinates &id) { + ::arm_compute::Coordinates values_id = id; + const int idx = id[first_dim_pos]; + values_id.set(first_dim_pos, lookups_buf[idx]); + memcpy(it.ptr(), _values->ptr_to_element(values_id), copy_bytes); + }, + it); + if (::internal::arm_compute::isGpuMode()) { auto &q = ::arm_compute::CLScheduler::get().queue(); |