Diffstat (limited to 'compute/ARMComputeEx/src/runtime/CL')
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp                             20
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp                 120
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp               39
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp                          28
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp                  28
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp               29
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp  60
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp                      36
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp               29
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp  48
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp                           28
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp                         39
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp                   147
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp              151
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp                29
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp                  28
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp                       311
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp           238
-rw-r--r--  compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp    67
19 files changed, 1475 insertions(+), 0 deletions(-)
diff --git a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
new file mode 100644
index 000000000..158fe0b0c
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/CLFunctionsEx.h"
+
+// NOTE This otherwise-empty file exists to verify that "CLFunctionsEx.h" compiles on its own.
+// DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
new file mode 100644
index 000000000..ae64a6edd
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArgOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+
+CLArgOperation::CLArgOperation()
+{
+ // DO NOTHING
+}
+
+void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
+ ArgOperation op)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
+ _input = input;
+ _output = output;
+ _axis = axis;
+ _arg_op = op;
+ // NOTE The axis vector must contain no duplicate values.
+ _num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = _num_of_kernels - 1;
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _argop_kernels =
+ arm_compute::support::cpp14::make_unique<CLArgOperationKernel[]>(_num_of_kernels);
+
+ TensorShape shape{input->info()->tensor_shape()};
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(_axis[i], 1);
+ _interm_tensors[i].allocator()->init(
+ TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())
+ .set_data_layout(input->info()->data_layout()));
+ _interm_tensors[i].allocator()->allocate();
+ }
+
+ // Build a vector of ICLTensors ordered sequentially: input, intermediates, output.
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Apply ArgMinMax on all kernels
+ for (size_t i = 0; i < _num_of_kernels; i++)
+ {
+ _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op);
+ }
+}
+
+Status CLArgOperation::validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
+ const ITensorInfo *output, ArgOperation op)
+{
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(axis[i], 1);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ }
+
+ // Build a vector of ITensorInfo pointers ordered sequentially: input, intermediates, output.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Validate ArgMinMax for every kernel in the chain
+ for (size_t i = 0; i < num_of_kernels; i++)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op));
+ }
+
+ return Status{};
+}
+
+void CLArgOperation::run()
+{
+ for (size_t i = 0; i < _num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_argop_kernels[i]);
+ }
+}
+
+} // namespace arm_compute
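For orientation, a minimal usage sketch of the function above (the shapes, the S32 index type, and the ArgOperation::MAX enumerator are illustrative assumptions, not taken from this diff):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLArgOperation.h"

void example_arg_operation()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  CLTensor input;
  CLTensor output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
  // Reducing over axes {0, 1} yields shape (1, 1, 2); S32 indices assumed.
  output.allocator()->init(TensorInfo(TensorShape(1U, 1U, 2U), 1, DataType::S32));

  CLArgOperation argop;
  argop.configure(&input, &output, {0, 1}, ArgOperation::MAX); // enumerator assumed

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input via map()/unmap() ...

  argop.run();
  CLScheduler::get().sync();
}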
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
new file mode 100644
index 000000000..7c5fe5eda
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h"
+
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ k->configure(input1, input2, output, op);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
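For reference, a usage sketch of the broadcast path handled above (U8 data and the BinaryLogicalOperation::AND enumerator are assumptions for illustration):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h"

void example_binary_logical_op()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  CLTensor lhs, rhs, out;
  lhs.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::U8));
  // rhs has dimension(0) == 1, so it is the broadcast input for which
  // configure() above sets up a REPLICATE border handler.
  rhs.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::U8));
  out.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::U8));

  CLBinaryLogicalOp logical_and;
  logical_and.configure(&lhs, &rhs, &out, BinaryLogicalOperation::AND); // enumerator assumed

  lhs.allocator()->allocate();
  rhs.allocator()->allocate();
  out.allocator()->allocate();

  logical_and.run();
  CLScheduler::get().sync();
}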
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
new file mode 100644
index 000000000..742fc6f59
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+
+#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+
+using namespace arm_compute;
+
+void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>();
+ k->configure(input, output, input_subtype);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
new file mode 100644
index 000000000..c2e4ca9ff
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+using namespace arm_compute;
+
+void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
new file mode 100644
index 000000000..2781784ca
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ k->configure(input, output, lookups);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..c6b166163
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h"
+
+using namespace arm_compute;
+
+void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input,
+ const arm_compute::ICLTensor *weights,
+ const arm_compute::ICLTensor *biases,
+ arm_compute::ICLTensor *output, bool needs_reshape,
+ const arm_compute::TensorShape &reshape)
+{
+ _input = input;
+ _weights = weights;
+ _biases = biases;
+ _output = output;
+ _needs_reshape = needs_reshape;
+
+ if (_needs_reshape)
+ {
+ // reshape
+ auto_init_if_empty(*_cl_buffer.info(),
+ _input->info()->clone()->set_tensor_shape(reshape).set_data_layout(
+ _input->info()->data_layout()));
+ _cl_reshape.configure(_input, &_cl_buffer);
+
+ _cl_fc.configure(&_cl_buffer, _weights, _biases, _output);
+
+ // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _cl_buffer.allocator()->allocate();
+ }
+ else
+ {
+ _cl_fc.configure(_input, _weights, _biases, _output);
+ }
+}
+
+void CLFullyConnectedReshapingLayer::run(void)
+{
+ if (_needs_reshape)
+ _cl_reshape.run();
+
+ _cl_fc.run();
+}
+
+void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc.prepare(); }
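A usage sketch of the reshape-then-fully-connected path above (shapes are illustrative: a 2x2x4 feature map is flattened to a 16-element vector before the fully connected layer; the constructor is assumed to accept a memory manager like its sibling functions):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h"

void example_fc_reshaping()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  CLTensor input, weights, biases, output;
  input.allocator()->init(TensorInfo(TensorShape(2U, 2U, 4U), 1, DataType::F32));
  weights.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
  biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));

  CLFullyConnectedReshapingLayer fc(nullptr); // memory-manager ctor assumed
  // needs_reshape == true flattens the input to the given shape first;
  // the intermediate _cl_buffer is allocated inside configure().
  fc.configure(&input, &weights, &biases, &output,
               /*needs_reshape=*/true, TensorShape(16U));

  input.allocator()->allocate();
  weights.allocator()->allocate();
  biases.allocator()->allocate();
  output.allocator()->allocate();

  fc.prepare();
  fc.run();
  CLScheduler::get().sync();
}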
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
new file mode 100644
index 000000000..6cad9bd2e
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLGatherEx.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
+
+using namespace arm_compute;
+
+void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
+ int axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGatherExKernel>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+}
+
+Status CLGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ return CLGatherExKernel::validate(input, indices, output, axis);
+}
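A usage sketch, including the static validate() entry point exposed above (shapes are illustrative; indices are assumed to be S32):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGatherEx.h"

void example_gather_ex()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  CLTensor input, indices, output;
  input.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
  indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
  output.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));

  // Check the configuration before committing to it.
  Status s = CLGatherEx::validate(input.info(), indices.info(), output.info(), /*axis=*/0);
  ARM_COMPUTE_ERROR_THROW_ON(s);

  CLGatherEx gather;
  gather.configure(&input, &indices, &output, /*axis=*/0);

  input.allocator()->allocate();
  indices.allocator()->allocate();
  output.allocator()->allocate();

  gather.run();
  CLScheduler::get().sync();
}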
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
new file mode 100644
index 000000000..7180e9356
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
+ const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>();
+ k->configure(lookups, keys, input, output, hits);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
new file mode 100644
index 000000000..86ea5a66d
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
+
+void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+ ICLTensor *gamma, ICLTensor *beta, float epsilon)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+ k->configure(input, output, gamma, beta, epsilon);
+ _kernel = std::move(k);
+}
+
+Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma, const ITensorInfo *beta,
+ float epsilon)
+{
+ return CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon);
+}
+} // namespace arm_compute
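A usage sketch of the function above (the NCHW-style shape is illustrative; gamma and beta are both supplied since whether they may be omitted depends on the kernel, and epsilon is passed explicitly because any default lives in the header, not in this diff):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h"

#include <initializer_list>

void example_instance_norm_ex()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  CLTensor input, output, gamma, beta;
  input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));
  gamma.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32)); // per-channel scale
  beta.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));  // per-channel shift

  CLInstanceNormalizationLayerEx norm;
  norm.configure(&input, &output, &gamma, &beta, /*epsilon=*/1e-5f);

  for (CLTensor *t : {&input, &output, &gamma, &beta})
    t->allocator()->allocate();

  norm.run();
  CLScheduler::get().sync();
}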
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
new file mode 100644
index 000000000..be35ea732
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLNeg.h"
+
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+using namespace arm_compute;
+
+void CLNeg::configure(ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
new file mode 100644
index 000000000..38adedd10
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPReLU.h"
+
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>();
+ k->configure(input, alpha, output);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
new file mode 100644
index 000000000..2a34c0664
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
+ _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
+ _gemm_output(), _add_output(), _is_prepared(false)
+{
+}
+
+Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+ const ITensorInfo *hidden_state, const ITensorInfo *output,
+ const ActivationLayerInfo &info)
+{
+ const int idx_width = 0;
+ const int idx_height = 1;
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
+ output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
+ recurrent_weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
+ recurrent_weights->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ hidden_state->tensor_shape());
+
+ auto shape_info =
+ TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
+ input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(
+ ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+ return Status{};
+}
+
+void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
+ const ICLTensor *recurrent_weights, const ICLTensor *bias,
+ ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(),
+ recurrent_weights->info(), bias->info(),
+ hidden_state->info(), output->info(), info));
+
+ const int idx_height = 1;
+ TensorShape shape =
+ compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+ _is_prepared = false;
+
+ _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+ // Manage intermediate buffers and configure
+ _memory_group.manage(&_fully_connected_out);
+ _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+
+ _memory_group.manage(&_gemm_output);
+ _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+ _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_add_output);
+
+ _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output,
+ &_add_output, ConvertPolicy::SATURATE);
+
+ _fully_connected_out.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+
+ _activation_kernel.configure(&_add_output, hidden_state, info);
+ _add_output.allocator()->allocate();
+
+ _copy_kernel.configure(hidden_state, output);
+}
+
+void CLRNNLayerEx::run()
+{
+ prepare();
+
+ _memory_group.acquire();
+
+ _fully_connected_kernel.run();
+ _gemm_state_f.run();
+ CLScheduler::get().enqueue(_add_kernel);
+ CLScheduler::get().enqueue(_activation_kernel);
+
+ // Copy the hidden state to the output
+ CLScheduler::get().enqueue(_copy_kernel);
+
+ _memory_group.release();
+}
+
+void CLRNNLayerEx::prepare()
+{
+ if (!_is_prepared)
+ {
+ _fully_connected_kernel.prepare();
+ _gemm_state_f.prepare();
+
+ _is_prepared = true;
+ }
+}
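The function computes one vanilla RNN step, hidden_state = act(weights * input + recurrent_weights * hidden_state + bias), and copies the new hidden state to output. A usage sketch under illustrative dimensions (ACL tensor shapes put the feature dimension first, which is what validate() above checks):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"

#include <initializer_list>

void example_rnn_layer_ex()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  const unsigned int input_size = 32, num_units = 16, batch = 4;

  CLTensor input, weights, recurrent_weights, bias, hidden_state, output;
  input.allocator()->init(TensorInfo(TensorShape(input_size, batch), 1, DataType::F32));
  weights.allocator()->init(TensorInfo(TensorShape(input_size, num_units), 1, DataType::F32));
  recurrent_weights.allocator()->init(TensorInfo(TensorShape(num_units, num_units), 1, DataType::F32));
  bias.allocator()->init(TensorInfo(TensorShape(num_units), 1, DataType::F32));
  hidden_state.allocator()->init(TensorInfo(TensorShape(num_units, batch), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(num_units, batch), 1, DataType::F32));

  CLRNNLayerEx rnn(nullptr); // no memory manager for this sketch
  ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH);
  rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, act);

  for (CLTensor *t : {&input, &weights, &recurrent_weights, &bias, &hidden_state, &output})
    t->allocator()->allocate();

  rnn.run(); // hidden_state and output now hold the new state
  CLScheduler::get().sync();
}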
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
new file mode 100644
index 000000000..13a25c901
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(),
+ _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape()
+{
+}
+
+Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, bool keep_dims,
+ const ReduceOperation &op)
+{
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+ {
+ shape.set(*it, 1, false);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ interm_tensors[i].set_data_layout(input->data_layout());
+ interm_tensors[i].set_quantization_info(input->quantization_info());
+ }
+
+ // Build a vector of ITensorInfo pointers ordered sequentially: input, intermediates, output.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Validate ReduceOperation for every kernel in the chain
+ it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+ }
+
+ if (!keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output));
+ }
+
+ return Status{};
+}
+
+void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+ const std::set<uint32_t> &axis, bool keep_dims,
+ ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op));
+
+ _axis = axis;
+
+ _input = input;
+ _output = output;
+ _keep_dims = keep_dims;
+
+ // NOTE The axis values must contain no duplicates.
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels =
+ arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+
+ // Build a vector of ICLTensors ordered sequentially: input, intermediates, output.
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Apply ReduceOperation on all kernels
+ TensorShape shape{input->info()->tensor_shape()};
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ shape.set(*it, 1, false);
+ if (!keep_dims || i != (num_of_kernels - 1))
+ {
+ _interm_tensors[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+ _memory_group.manage(&_interm_tensors[i]);
+ }
+ _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op);
+ if (i != 0)
+ {
+ _interm_tensors[i - 1].allocator()->allocate();
+ }
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if (!keep_dims)
+ {
+ _reshape.configure(&_interm_tensors[num_of_interm_tensors - 1], output);
+ _interm_tensors[num_of_interm_tensors - 1].allocator()->allocate();
+ }
+}
+
+void CLReduceOperation::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ const size_t num_of_kernels = _axis.size();
+ for (size_t i = 0; i < num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_reduce_kernels[i]);
+ }
+
+ if (!_keep_dims)
+ {
+ _reshape.run();
+ }
+}
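A usage sketch of the chained reduction above (the ReduceOperation::MEAN enumerator is an assumption for illustration; with keep_dims == false the trailing CLReshapeLayer drops the reduced dimensions):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"

void example_reduce_operation()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
  // Reducing axes {0, 1} with keep_dims == false leaves shape (2).
  output.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::F32));

  CLReduceOperation reduce(nullptr); // no memory manager for this sketch
  reduce.configure(&input, &output, /*axis=*/{0, 1}, /*keep_dims=*/false,
                   ReduceOperation::MEAN); // enumerator assumed

  input.allocator()->allocate();
  output.allocator()->allocate();

  reduce.run();
  CLScheduler::get().sync();
}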
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
new file mode 100644
index 000000000..c03826891
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size,
+ const ICLTensor *padding_size, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>();
+ k->configure(input, block_size, padding_size, output);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
new file mode 100644
index 000000000..0f455f96f
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
new file mode 100644
index 000000000..80d50ad94
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "../../topk_v2.h"
+
+namespace arm_compute
+{
+
+CLTopKV2::CLTopKV2()
+ : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0),
+ _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
+ _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(),
+ _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr),
+ _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(),
+ _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(),
+ _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(),
+ _reorder_negatives_kernel(), _store_kernel()*/
+{
+}
+
+void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+ int total_bits, int bits)
+{
+ _total_bits = total_bits;
+ _bits = bits;
+ _n = input->info()->tensor_shape()[0];
+
+ // _total_bits must be divisible by _bits.
+ ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0);
+
+ _k = k;
+ _radix = 1 << bits;
+
+ _input = input;
+ _values = values;
+ _indices = indices;
+
+ std::string topk_env;
+
+// Disable the GPU implementation, which currently produces invalid results
+// on the GPU.
+// TODO Enable the GPU implementation after verification, or remove this code.
+#if 0
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ _qs_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _qs_temp_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n);
+ }
+ else if (topk_env == "GPU")
+ {
+ // _n must be divisible by (_GROUPS * _ITEMS)
+ ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0);
+
+ _hist_buf_size = _radix * _GROUPS * _ITEMS;
+ _glob_sum_buf_size = _HISTOSPLIT;
+
+ _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _hist_buf_size);
+ _glob_sum_buf =
+ cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int));
+ _in_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _out_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _in_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _out_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _p_in_key_buf = &_in_key_buf;
+ _p_out_key_buf = &_out_key_buf;
+ _p_in_ind_buf = &_in_ind_buf;
+ _p_out_ind_buf = &_out_ind_buf;
+
+ _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n);
+ _hist_kernel.configure(&_hist_buf, bits, _n);
+ _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits);
+ _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _reorder_kernel.configure(&_hist_buf, bits, _n);
+ _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n);
+ _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n);
+ _store_kernel.configure(values, indices, k, _n);
+ }
+ else
+#endif // Disable GPU implementation
+ {
+ // DO NOTHING for CPU.
+ }
+}
+
+void CLTopKV2::run()
+{
+ std::string topk_env;
+#if 0
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ run_on_gpu_single_quicksort();
+ }
+ else if (topk_env == "GPU")
+ {
+ run_on_gpu();
+ }
+ else
+#endif
+ {
+ run_on_cpu();
+ }
+}
+
+#if 0
+void CLTopKV2::run_on_gpu_single_quicksort()
+{
+ // This is a single threaded quick sort implementation.
+ CLScheduler::get().enqueue(_qs_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+}
+
+void CLTopKV2::run_on_gpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ // 1. CLTopKV2Init sets up the key buffer and the index buffer.
+ // - The key buffer is filled with the layer's input values.
+ // - The index buffer is filled with each element's index.
+ CLScheduler::get().enqueue(_init_kernel, false);
+
+ int n_passes = _total_bits / _bits;
+
+ // 2. Repeat (total_bits / bits) times.
+ // - total_bits is the number of bits of the data type (e.g., 32 for float)
+ // - bits defines the number of buckets (e.g., 4 bits gives 16 buckets)
+ for (int pass = 0; pass < n_passes; ++pass)
+ {
+ arm_compute::CLScheduler::get().sync();
+
+ // 2.1. Calculate histogram with _GROUPS * _ITEMS threads
+ _hist_kernel.setPass(pass, _p_in_key_buf);
+ CLScheduler::get().enqueue(_hist_kernel, false);
+
+ // 2.2. Calculate prefix sum locally with multiple threads
+ CLScheduler::get().enqueue(_scan_hist_kernel, false);
+ // 2.3. Calculate prefix sum within a work group
+ CLScheduler::get().enqueue(_glob_scan_hist_kernel, false);
+ // 2.4. Calculate global prefix sum
+ CLScheduler::get().enqueue(_paste_hist_kernel, false);
+
+ // 2.5. Reorder keys and indices based on the global prefix sum
+ _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_kernel, false);
+
+ cl::Buffer *tmp;
+ // swap key buffers
+ tmp = _p_in_key_buf;
+ _p_in_key_buf = _p_out_key_buf;
+ _p_out_key_buf = tmp;
+
+ // swap index buffers
+ tmp = _p_in_ind_buf;
+ _p_in_ind_buf = _p_out_ind_buf;
+ _p_out_ind_buf = tmp;
+ }
+
+ // 3. Get the first negative index.
+ // Because we swap the in and out buffers at the end of the loop above,
+ // the sorted results now live in the 'in' buffers.
+ _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf);
+ CLScheduler::get().enqueue(_find_first_negative_kernel, false);
+
+ // 4. Correct the ordering of negatives
+ // - Since radix sort compares raw bit patterns, it does not handle negatives
+ // specially, so negative values sort as if they were larger than positives.
+ // The reordered data will be stored in _p_out_key_buf and _p_out_ind_buf.
+ _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf,
+ _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_negatives_kernel, false);
+
+ // 5. Extract top k values from sorted keys and indices.
+ _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_store_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+
+#if 0
+ // The code below is left for debugging.
+ int first_neg;
+ q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg);
+ std::cout << "first neg = " << first_neg << std::endl;
+
+ float in_key[_n];
+ q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl;
+ }
+
+ float out_key[_n];
+ q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl;
+ }
+
+ int in_ind[_n];
+ q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl;
+ }
+
+ int out_ind[_n];
+ q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl;
+ }
+
+ int hist_buf[_hist_buf_size];
+ q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf);
+ for(uint32_t i = 0 ; i < _hist_buf_size; ++i) {
+ std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl;
+ }
+
+ int glob_sum_buf[_glob_sum_buf_size];
+ q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf);
+ for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) {
+ std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl;
+ }
+
+#endif
+}
+#endif // Disable GPU implementation
+
+void CLTopKV2::run_on_cpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+ // const Window& w = _topkv2_kernel.window();
+
+ _input->map(q);
+ _values->map(q);
+ _indices->map(q);
+
+ // int row_size = (w[0].end() - w[0].start()) / w[0].step();
+ int row_size = _input->info()->tensor_shape()[0];
+ int rank = _input->info()->num_dimensions();
+
+ if (rank > 2)
+ throw std::runtime_error("Not supported rank.");
+
+ int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1);
+
+ if (_input->info()->data_type() == DataType::F32)
+ {
+ nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(), (float *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::S32)
+ {
+ nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (int32_t *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::QASYMM8)
+ {
+ nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (uint8_t *)_values->buffer());
+ }
+ else
+ {
+ throw std::runtime_error("Not supported type.");
+ }
+
+ _input->unmap(q);
+ _values->unmap(q);
+ _indices->unmap(q);
+}
+
+} // namespace arm_compute
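Since the GPU paths are compiled out, only the CPU path is reachable, and it only supports rank <= 2 inputs as checked in run_on_cpu(). A usage sketch (k, shapes, and the S32 index type are illustrative):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLTopKV2.h"

void example_topkv2()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  const int k = 3;
  CLTensor input, values, indices;
  input.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
  values.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));
  indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));

  CLTopKV2 topk;
  // total_bits/bits only matter for the (disabled) GPU radix sort.
  topk.configure(&input, k, &values, &indices, /*total_bits=*/32, /*bits=*/4);

  input.allocator()->allocate();
  values.allocator()->allocate();
  indices.allocator()->allocate();

  topk.run(); // falls through to run_on_cpu()
  CLScheduler::get().sync();
}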
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
new file mode 100644
index 000000000..40e21671d
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _scale_f(),
+ _conv_f(),
+ _flip_weights(),
+ _scaled_output(),
+ _original_weights(nullptr),
+ _weights_flipped(),
+ _is_prepared(false)
+{
+}
+
+Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+
+ const unsigned int kernel_x = weights->dimension(idx_w);
+ const unsigned int kernel_y = weights->dimension(idx_h);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1,
+ "invalid_right must be smaller than kernel_x");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1,
+ "inner_border_top must be smaller than kernel_y");
+
+ // NOTE Compared to the existing CLDeconvolutionLayer, the invalid_right and invalid_bottom parameters were added.
+ auto out_dims = transposeconv_output_dimensions(
+ input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
+ weights->dimension(idx_h), info, invalid_right, invalid_bottom);
+
+ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+ if (bias != nullptr)
+ {
+ if (is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
+ "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
+ "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
+ "Output's depth is invalid.");
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+ pad_bottom);
+ TensorInfo scale_out_info(input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(scale_out_shape)
+ .set_data_layout(data_layout));
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+ conv_info, weights_info));
+
+ return Status{};
+}
+
+void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
+ ICLTensor *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ const DataLayout data_layout = input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ _original_weights = weights;
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(weights, &_weights_flipped);
+
+ // NOTE Compared to the existing CLDeconvolutionLayer, the invalid_right and
+ // invalid_bottom parameters were added.
+ auto out_dims = transposeconv_output_dimensions(
+ input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+ weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
+ invalid_bottom);
+
+ const TensorShape output_shape =
+ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(
+ *output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate(
+ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+ info, invalid_right, invalid_bottom));
+
+ _is_prepared = weights_info.retain_internal_weights();
+
+ _memory_group.manage(&_scaled_output);
+
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
+ // to match output shape
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ // configure scale function
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::FLOOR);
+ _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info);
+
+ // setup the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
+ _scaled_output.allocator()->allocate();
+}
+
+void CLTransposeConvLayer::run()
+{
+ prepare();
+
+ _memory_group.acquire();
+
+ _scale_f.run();
+ _conv_f.run();
+
+ _memory_group.release();
+}
+
+void CLTransposeConvLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ _weights_flipped.map(true);
+ _original_weights->map(CLScheduler::get().queue(), true);
+ CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
+ _weights_flipped.unmap();
+ _original_weights->unmap(CLScheduler::get().queue());
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
+ _conv_f.prepare();
+
+ if (!_weights_flipped.is_used())
+ {
+ _weights_flipped.allocator()->free();
+ }
+
+ _is_prepared = true;
+ }
+}
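A usage sketch of the upsample-plus-convolution pipeline above (shapes are illustrative; output is left empty so that auto_init_if_empty() in configure() fills it from the computed transposed-convolution shape, and the WeightsInfo argument is assumed to default as in validate()):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"

void example_transpose_conv()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  CLTensor input, weights, bias, output;
  // 8x8 feature map with 2 channels; square 3x3 kernel as required by validate().
  input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32));
  weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32));
  bias.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));
  // output is intentionally left uninitialized here.

  CLTransposeConvLayer deconv(nullptr); // no memory manager for this sketch
  const PadStrideInfo info(/*stride_x=*/2, /*stride_y=*/2, /*pad_x=*/0, /*pad_y=*/0);
  deconv.configure(&input, &weights, &bias, &output, info,
                   /*invalid_right=*/0, /*invalid_bottom=*/0);

  input.allocator()->allocate();
  weights.allocator()->allocate();
  bias.allocator()->allocate();
  output.allocator()->allocate();

  deconv.run(); // prepare() flips the weights on the first run
  CLScheduler::get().sync();
}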
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
new file mode 100644
index 000000000..0ce3e6700
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT
+ : _upsample(),
+ _output(nullptr)
+{
+}
+
+Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info);
+}
+
+void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output,
+ const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ _output = output;
+ _upsample.configure(input, _output, inner_border, info);
+}
+
+void CLTransposeConvLayerUpsample::run()
+{
+ _output->map(CLScheduler::get().queue(), true);
+ if (is_data_type_quantized_asymmetric(_output->info()->data_type()))
+ {
+ const uint8_t quantized_zero = _output->info()->quantization_info().offset;
+ std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
+ }
+ else
+ {
+ memset(_output->buffer(), 0, _output->info()->total_size());
+ }
+ _output->unmap(CLScheduler::get().queue());
+
+ CLScheduler::get().enqueue(_upsample, false);
+}
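The zero-fill in run() above is quantization-aware: for QASYMM8 the real value is scale * (q - offset), so filling the buffer with the zero-point offset inserts true zeros between the upsampled samples, whereas a plain memset(0) would not. A small standalone check of that rule (values are illustrative):

#include <cassert>
#include <cstdint>

// QASYMM8 dequantization rule: real = scale * (q - offset).
static float dequantize(uint8_t q, float scale, int32_t offset)
{
  return scale * (static_cast<int32_t>(q) - offset);
}

int main()
{
  const float scale = 0.5f;
  const int32_t offset = 3; // zero-point
  // Writing `offset` into the upsampled gaps represents a real-valued 0.0:
  assert(dequantize(3, scale, offset) == 0.0f);
  // whereas a raw zero byte would represent scale * (0 - 3) = -1.5:
  assert(dequantize(0, scale, offset) == -1.5f);
  return 0;
}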