1 files changed, 307 insertions, 0 deletions
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
new file mode 100644
index 000000000..cd576cec1
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "arm_compute/core/utils/misc/Utility.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+#include <vector>
+
+using namespace arm_compute;
+
+static const int32_t maxDims = 4;
+
+// Return the index for the first element along that axis. This index will be a
+// positive integer between [0, axisSize - 1] that can be used to index
+// directly into the data.
+inline int32_t StartForAxis(int32_t beginMask, std::vector<int32_t> const &startIndices,
+                            std::vector<int32_t> const &strides, const TensorShape &inputShape,
+                            int32_t axis)
+{
+  // Begin with the specified index
+  int32_t start = startIndices[axis];
+
+  // beginMask override
+  if (beginMask & 1 << axis)
+  {
+    if (strides[axis] > 0)
+    {
+      // Forward iteration - use the first element. These values will get
+      // clamped below (Note: We could have set them to 0 and axisSize-1, but
+      // use lowest() and max() to maintain symmetry with StopForAxis())
+      start = std::numeric_limits<int32_t>::lowest();
+    }
+    else
+    {
+      // Backward iteration - use the last element.
+      start = std::numeric_limits<int32_t>::max();
+    }
+  }
+
+  // Handle negative indices
+  int32_t axisSize = inputShape[axis];
+  if (start < 0)
+  {
+    start += axisSize;
+  }
+
+  // Clamping
+  start = arm_compute::utility::clamp(start, 0, axisSize - 1);
+
+  return start;
+}
+
+// Return the "real" index for the end of iteration along that axis. This is an
+// "end" in the traditional C sense, in that it points to one past the last
+// element. ie. So if you were iterating through all elements of a 1D array of
+// size 4, this function would return 4 as the stop, because it is one past the
+// "real" indices of 0, 1, 2 & 3.
+inline int32_t StopForAxis(int32_t endMask, std::vector<int32_t> const &stopIndices,
+                           std::vector<int32_t> const &strides, const TensorShape &inputShape,
+                           int32_t axis)
+{
+  // Begin with the specified index
+  int32_t stop = stopIndices[axis];
+
+  // endMask override
+  if (endMask & (1 << axis))
+  {
+    if (strides[axis] > 0)
+    {
+      // Forward iteration - use the last element. These values will get
+      // clamped below
+      stop = std::numeric_limits<int32_t>::max();
+    }
+    else
+    {
+      // Backward iteration - use the first element.
+      stop = std::numeric_limits<int32_t>::lowest();
+    }
+  }
+
+  // Handle negative indices
+  int32_t axisSize = inputShape[axis];
+  if (stop < 0)
+  {
+    stop += axisSize;
+  }
+
+  // Clamping
+  // Because the end index points one past the last element, we need slightly
+  // different clamping ranges depending on the direction.
+  if (strides[axis] > 0)
+  {
+    // Forward iteration
+    stop = arm_compute::utility::clamp(stop, 0, axisSize);
+  }
+  else
+  {
+    // Backward iteration
+    stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
+  }
+
+  return stop;
+}
+
+inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
+{
+  int32_t offset = b * shape[2] * shape[1] * shape[0];
+  offset += d * shape[1] * shape[0];
+  offset += h * shape[0];
+  offset += w;
+  return offset;
+}
+
+void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+                               ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
+                               int32_t endMask, int32_t shrinkAxisMask)
+{
+  auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
+  k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask);
+  _kernel = std::move(k);
+}
+
+void CLStridedSliceCPU::configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+                                  ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
+                                  int32_t endMask, int32_t shrinkAxisMask)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(CLStridedSliceKernel::validate(
+      input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(),
+      beginMask, endMask, shrinkAxisMask));
+
+  _input = input;
+  _output = output;
+  _beginData = beginData;
+  _endData = endData;
+  _stridesData = stridesData;
+  _beginMask = beginMask;
+  _endMask = endMask;
+  _shrinkAxisMask = shrinkAxisMask;
+}
+
+void CLStridedSliceCPU::run()
+{
+  run_on_cpu();
+
+  arm_compute::CLScheduler::get().sync();
+}
+
+inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
+{
+  if (stride > 0)
+  {
+    return ((stop - start - 1) / stride) + 1;
+  }
+  else
+  {
+    return ((stop - start + 1) / stride) + 1;
+  }
+}
+
+template <typename T>
+inline void StridedSlice(const T *inputData, const TensorShape &inputShape, int32_t beginMask,
+                         int32_t endMask, const std::vector<int32_t> &startIndices,
+                         const std::vector<int32_t> &stopIndices,
+                         const std::vector<int32_t> &strides, T *outputData)
+{
+  ARM_COMPUTE_ERROR_ON(startIndices.size() != maxDims);
+  ARM_COMPUTE_ERROR_ON(stopIndices.size() != maxDims);
+  ARM_COMPUTE_ERROR_ON(strides.size() != maxDims);
+
+  const int32_t start_b = StartForAxis(beginMask, startIndices, strides, inputShape, 3);
+  const int32_t stop_b = StopForAxis(endMask, stopIndices, strides, inputShape, 3);
+  const int32_t start_d = StartForAxis(beginMask, startIndices, strides, inputShape, 2);
+  const int32_t stop_d = StopForAxis(endMask, stopIndices, strides, inputShape, 2);
+  const int32_t start_h = StartForAxis(beginMask, startIndices, strides, inputShape, 1);
+  const int32_t stop_h = StopForAxis(endMask, stopIndices, strides, inputShape, 1);
+  const int32_t start_w = StartForAxis(beginMask, startIndices, strides, inputShape, 0);
+  const int32_t stop_w = StopForAxis(endMask, stopIndices, strides, inputShape, 0);
+
+  // The shape of outputData may collapse in one-dimension.
+  // Therefore, it is necessary to create a shape that matches the result of the outputData.
+  TensorShape outputShape(
+      getOutDim(start_w, stop_w, strides[0]), getOutDim(start_h, stop_h, strides[1]),
+      getOutDim(start_d, stop_d, strides[2]), getOutDim(start_b, stop_b, strides[3]));
+  for (int32_t in_b = start_b, b = 0; strides[3] > 0 ? in_b < stop_b : in_b > stop_b;
+       in_b += strides[3], b++)
+  {
+    for (int32_t in_d = start_d, d = 0; strides[2] > 0 ? in_d < stop_d : in_d > stop_d;
+         in_d += strides[2], d++)
+    {
+      for (int32_t in_h = start_h, h = 0; strides[1] > 0 ? in_h < stop_h : in_h > stop_h;
+           in_h += strides[1], h++)
+      {
+        for (int32_t in_w = start_w, w = 0; strides[0] > 0 ? in_w < stop_w : in_w > stop_w;
+             in_w += strides[0], w++)
+        {
+          outputData[offset4D(outputShape, b, d, h, w)] =
+              inputData[offset4D(inputShape, in_b, in_d, in_h, in_w)];
+        }
+      }
+    }
+  }
+}
+
+void CLStridedSliceCPU::run_on_cpu()
+{
+  // TODO: Support shrinkAxisMask
+  cl::CommandQueue q = CLScheduler::get().queue();
+
+  _input->map(q);
+  _output->map(q);
+  _beginData->map(q);
+  _endData->map(q);
+  _stridesData->map(q);
+
+  TensorShape inputShape = _input->info()->tensor_shape();
+  TensorShape outputShape = _output->info()->tensor_shape();
+
+  std::vector<int32_t> starts;
+  std::vector<int32_t> stops;
+  std::vector<int32_t> strides;
+
+  for (uint32_t idx = 0; idx <= _input->info()->num_dimensions() - 1; ++idx)
+  {
+    starts.emplace_back(reinterpret_cast<int32_t *>(_beginData->buffer())[idx]);
+    stops.emplace_back(reinterpret_cast<int32_t *>(_endData->buffer())[idx]);
+    strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[idx]);
+  }
+
+  for (uint32_t i = _input->info()->num_dimensions(); i < maxDims; i++)
+  {
+    starts.emplace_back(0);
+    stops.emplace_back(1);
+    strides.emplace_back(1);
+  }
+
+  switch (_input->info()->data_type())
+  {
+    case DataType::U8:
+    case DataType::QASYMM8:
+      StridedSlice(reinterpret_cast<const uint8_t *>(_input->buffer()), inputShape, _beginMask,
+                   _endMask, starts, stops, strides,
+                   reinterpret_cast<uint8_t *>(_output->buffer()));
+      break;
+    case DataType::S8:
+    case DataType::QS8:
+      StridedSlice(reinterpret_cast<const int8_t *>(_input->buffer()), inputShape, _beginMask,
+                   _endMask, starts, stops, strides, reinterpret_cast<int8_t *>(_output->buffer()));
+      break;
+    case DataType::U16:
+      StridedSlice(reinterpret_cast<const uint16_t *>(_input->buffer()), inputShape, _beginMask,
+                   _endMask, starts, stops, strides,
+                   reinterpret_cast<uint16_t *>(_output->buffer()));
+      break;
+    case DataType::S16:
+    case DataType::QS16:
+      StridedSlice(reinterpret_cast<const int16_t *>(_input->buffer()), inputShape, _beginMask,
+                   _endMask, starts, stops, strides,
+                   reinterpret_cast<int16_t *>(_output->buffer()));
+      break;
+    case DataType::F16:
+      // Not sure this works.
+      StridedSlice(reinterpret_cast<const half *>(_input->buffer()), inputShape, _beginMask,
+                   _endMask, starts, stops, strides, reinterpret_cast<half *>(_output->buffer()));
+      break;
+    case DataType::U32:
+      StridedSlice(reinterpret_cast<const uint32_t *>(_input->buffer()), inputShape, _beginMask,
+                   _endMask, starts, stops, strides,
+                   reinterpret_cast<uint32_t *>(_output->buffer()));
+      break;
+    case DataType::S32:
+      StridedSlice(reinterpret_cast<const int32_t *>(_input->buffer()), inputShape, _beginMask,
+                   _endMask, starts, stops, strides,
+                   reinterpret_cast<int32_t *>(_output->buffer()));
+      break;
+    case DataType::F32:
+      StridedSlice(reinterpret_cast<const float *>(_input->buffer()), inputShape, _beginMask,
+                   _endMask, starts, stops, strides, reinterpret_cast<float *>(_output->buffer()));
+      break;
+    default:
+      ARM_COMPUTE_ERROR("DataType not supported");
+      break;
+  }
+
+  _input->unmap(q);
+  _output->unmap(q);
+  _beginData->unmap(q);
+  _endData->unmap(q);
+  _stridesData->unmap(q);
+}