/*
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

/*! @mainpage clDNN Documentation
* @section intro Introduction
* Compute Library for Deep Neural Networks (clDNN) is middleware
*  for accelerating DNN inference on Intel® HD and Iris™ Pro Graphics.
*  The project provides CNN primitive implementations on Intel GPUs, with C and C++ interfaces.
*
* The clDNN library implements the following set of primitives:
*  - Convolution
*  - Fully connected (inner product)
*  - Pooling
*   * average
*   * maximum
*  - Normalization
*   * across channel
*   * within channel
*   * batch
*  - Activation
*   * logistic
*   * tanh
*   * rectified linear unit (ReLU)
*   * softplus (softReLU)
*   * abs
*   * square
*   * sqrt
*   * linear
*  - Softmax
*  - Crop
*  - Deconvolution
*  - Depth concatenation
*  - Eltwise
*  - ROI pooling
*  - Simpler NMS
*  - Prior box
*  - Detection output
*
*  With this primitive set, users can build and execute the most common image recognition, semantic segmentation, and object detection network topologies, such as:
*   - AlexNet
*   - GoogLeNet (v1-v3)
*   - ResNet
*   - VGG
*   - Faster R-CNN
*  and others.
*  
*
* @section model Programming Model
*  Intel® clDNN is a graph-oriented library. To execute a CNN, you build a graph (topology), compile it, and run it to obtain results.
*  
*  <B> Terminology: </B>
*  - Primitive - a basic DNN operation, e.g. convolution, pooling, softmax.
*  - Data - a special primitive type representing primitive parameters (weights and biases), inputs, and outputs.
*  - Engine - the type of accelerator that executes the network. Currently the OpenCL (ocl) engine is the only one available.
*  - Topology - a container of primitives, data, and the relations between them. A topology represents the graph.
*  - Program - an optional step between Topology and Network: a compiled Topology without memory allocation.
*  - Network - a compiled Topology with memory allocated, ready to be executed. During compilation, build parameters trigger special optimizations such as fusing and data reordering.
*
*  <B> Execution Steps: </B>
*
* \image html workflow.jpg
* -# Create Engine
* -# Declare or define primitive parameters (weights and biases) if needed.
* -# Create primitives. Each primitive must be given a name; a primitive refers to its inputs by the names of the primitives whose outputs feed it, and a name may be used before the referenced primitive is defined.
* -# Create topology
* -# Add primitives to topology
* -# Build Network from topology
* -# Set input data
* -# Execute Network
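*
*  A minimal sketch of these steps for a trivial topology is shown below. It is illustrative
*  only: the primitive names ("input", "prob") and the input shape are assumptions, and the
*  types used (engine, layout, memory, topology, network) come from the other headers of this C++ API.
* @code{.cpp}
* cldnn::engine engine;                                        // 1. create engine (ocl by default)
* cldnn::layout in_layout(cldnn::data_types::f32, cldnn::format::bfyx, cldnn::tensor(1, 3, 224, 224));
* auto input_mem = cldnn::memory::allocate(engine, in_layout); // 2. allocate input buffer
*
* cldnn::topology topology;                                    // 4. create topology
* topology.add(cldnn::input_layout("input", in_layout));       // 3+5. create primitives and add them,
* topology.add(cldnn::softmax("prob", "input"));               //      referring to inputs by name
*
* cldnn::network network(engine, topology);                    // 6. build network from topology
* network.set_input_data("input", input_mem);                  // 7. set input data
* auto outputs = network.execute();                            // 8. execute
* auto prob_mem = outputs.at("prob").get_memory();             // read back the result buffer
* @endcode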
*
*
* @section graph_compilation Graph compilation
*
* If the user chooses the build option optimize_data when the program is created - explicitly, or implicitly via network creation - clDNN performs graph optimizations in the following stages (a build-option sketch follows the list):
* * <B> Stage 0: Graph initiation:</B>
*  * build nodes from primitives
*  * node replacement:
*   * replace each split node with a series of crop nodes. The name of each crop primitive is the concatenation of the split and port names.
*   * replace an upsampling node with a deconvolution node if the upsampling mode is bilinear.
*  * set outputs - mark nodes that are defined by the user as outputs (which blocks fusing, etc.) or that have no users (leaves).
*  * calculate processing order - run a DFS on the graph to establish the processing order.
* * <B> Stage 1: Priorboxes:</B>
*  * priorbox is a primitive that is executed during network compilation; the node is then removed from network execution.
* * <B> Stage 2: Graph analysis:</B>
*  * mark constants
*  * mark data flow
* * <B> Stage 3: Trimming:</B>
*  * apply a backward BFS from each output to find unnecessary nodes/branches, then remove them.
* * <B> Stage 4: Inputs and biases:</B>
*  * reorder input - the format of the convolution's input/output is selected.
*  * reorder biases for convolution, fully connected, and deconvolution nodes.
* * <B> Stage 5: Redundant reorders:</B>
*  * previous stages can introduce additional reorders due to per-primitive format changes. This stage removes redundant reorders and fuses series of reorders into one.
* * <B> Stage 6: Constant propagation:</B>
*  * prepare padding - goes through all primitives and checks whether any user requires padding; if so, sets the output padding.
*  * prepare depthwise separable opt - if the split parameter is greater than 16 and the number of IFMs is <= 8*split in a convolution or deconvolution, this stage changes execution from multiple kernels into one.
*  * constant propagation - replace constant nodes that are not outputs with data nodes. A constant primitive is one that does not depend on any non-constant primitive and does not have to be executed, e.g. priorbox or data.
* * <B> Stage 7: Fusing:</B>
*  * buffer fusing
*   * concat - if the concatenation is the only user of its dependencies, remove the concat node and set proper output paddings on each dependency.
*   * crop - if a crop has only one dependency and its users do not require padding, remove the crop and set proper output padding on its dependency.
*   * reorder - if the primitive before a reorder supports differing input and output types, the reorder can be fused with that previous node.
*  * primitive fusing - currently this stage only fuses an activation node with its previous node, and only if the previous node supports activation fusing.
* * <B> Stage 8: Compile graph:</B>
*  * at this stage the graph uses the kernel selector to choose the best kernel implementation for each node.
* * <B> Stage 9: Reorder weights:</B>
*  * at this stage weights are converted into the format required by the selected kernel implementation.
* * <B> Stage 10 & 11: Redundant reorders and constant propagation:</B>
*  * check again that graph compilation did not introduce any redundant reorders or unpropagated constants.
* * <B> Stage 12: Compile program:</B>
*  * at this stage the engine compiles the cl_kernels.
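*
*  For reference, a sketch of how the optimize_data option mentioned above can be
*  passed when a network is built (build_options and build_option come from this
*  C++ API; the engine and topology are assumed to exist already):
* @code{.cpp}
* cldnn::build_options options;
* options.set_option(cldnn::build_option::optimize_data(true)); // enables the stages above
* cldnn::network network(engine, topology, options);            // program is created implicitly
* @endcode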
*
* @section example C++ API Example: MNIST network
* @include example_cldnn.cpp
*/

///////////////////////////////////////////////////////////////////////////////////////////////////
#pragma once

#include <cstdint>
#include <functional>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
#include <stdexcept>

#include "../C/cldnn.h"

namespace cldnn {
    // There is no portable half-precision floating point support.
    // Instead we use a wrapped integral type with the same size and alignment restrictions.
    class half_impl
    {
    public:
        half_impl() = default;
        template <typename T, typename = typename std::enable_if<!std::is_floating_point<T>::value>::type>
        explicit half_impl(T data) : _data(data) {}

        operator uint16_t() const { return _data; }
        operator float() const
        {
           cldnn_status status = CLDNN_SUCCESS;
           auto value = cldnn_half_to_float(_data, &status);
           if (status != CLDNN_SUCCESS)
               throw std::runtime_error("Conversion from half failed");
           return value;
        }
        explicit half_impl(float value)
        {
            cldnn_status status = CLDNN_SUCCESS;
            _data = cldnn_float_to_half(value, &status);
            if (status != CLDNN_SUCCESS)
                throw std::runtime_error("Conversion to half failed");
        }

    private:
        uint16_t _data;
    };
}
// Use the complete half implementation if it has already been included (half.hpp defines HALF_HALF_HPP).
#if defined HALF_HALF_HPP
typedef half half_t;
#else
typedef cldnn::half_impl half_t;
#endif
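
// A minimal usage sketch for half_t (assuming the fallback half_impl above; the
// conversions call into the C API and may throw std::runtime_error on failure):
//
//     half_t h(1.0f);       // float -> half via cldnn_float_to_half
//     float f = h;          // half -> float via cldnn_half_to_float
//     uint16_t bits = h;    // raw IEEE 754 binary16 bits (0x3C00 for 1.0f)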

namespace cldnn {
/// @addtogroup cpp_api C++ API
/// @{

/// @defgroup cpp_error Error Handling
/// @{

using status_t = ::cldnn_status;

/// @brief clDNN specific exception type.
class error : public std::runtime_error
{
public:
    explicit error(const std::string& _Message, status_t status = CLDNN_ERROR)
        : runtime_error(_Message)
        , _status(status)
    {
    }

    explicit error(const char* _Message, status_t status = CLDNN_ERROR)
        : runtime_error(_Message)
        , _status(status)
    {
    }

    /// @brief Returns clDNN status code.
    const status_t& status() const { return _status; }
private:
    status_t _status;
};

#define CLDNN_THROW(msg, status) throw cldnn::error(msg, status)

template<class T>
T check_status(std::string err_msg, std::function<T(status_t*)> func)
{
    status_t status = CLDNN_SUCCESS;
    auto result = func(&status);
    if (status != CLDNN_SUCCESS)
        CLDNN_THROW(err_msg.append(": ").append(cldnn_get_last_error_message()), status);
    return result;
}

template<>
inline void check_status<void>(std::string err_msg, std::function<void(status_t*)> func)
{
    status_t status = CLDNN_SUCCESS;
    func(&status);
    if (status != CLDNN_SUCCESS)
        CLDNN_THROW(err_msg.append(": ").append(cldnn_get_last_error_message()), status);
}
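
// check_status wraps a C API call and converts a failing status code into a
// cldnn::error exception; get_version() below is a typical in-library usage.
// Callers can handle failures like this (a sketch):
//
//     try {
//         cldnn::version_t ver = cldnn::get_version();
//     } catch (const cldnn::error& err) {
//         // err.status() returns the original cldnn_status code
//     }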

/// @}

/// @defgroup cpp_version Version Information
/// @{

using version_t = ::cldnn_version;

/// @brief Get information about version of clDNN.
inline version_t get_version()
{
    return check_status<version_t>("get_version: fetching version information failed",
                                   [](status_t* status)
                                   {
                                       return ::cldnn_get_version(status);
                                   });
}

/// @}

/// @cond CPP_HELPERS

/// @defgroup cpp_helpers Helpers
/// @{

#define CLDNN_API_CLASS(the_class) static_assert(std::is_standard_layout<the_class>::value, #the_class " has to be 'standard layout' class");


/// Rounds @p size up to the nearest multiple of @p align.
template<typename T>
typename std::enable_if<std::is_integral<T>::value, T>::type align_to(T size, size_t align) {
    return static_cast<T>((size % align == 0) ? size : size - size % align + align);
}

/// Returns the padding required to round @p size up to the nearest multiple of @p align.
template<typename T>
typename std::enable_if<std::is_integral<T>::value, T>::type pad_to(T size, size_t align) {
    return static_cast<T>((size % align == 0) ? 0 : align - size % align);
}

/// Checks whether @p size is a multiple of @p align.
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type is_aligned_to(T size, size_t align)
{
    return !(size % align);
}
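
// Worked values for the helpers above (illustrative only, not part of the API):
//
//     align_to(13, 8)      == 16    // 13 rounded up to the next multiple of 8
//     pad_to(13, 8)        == 3     // padding needed to reach that multiple
//     is_aligned_to(16, 8) == true  // 16 is already a multiple of 8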

/// Computes ceil(@p val / @p divider) on unsigned integral numbers.
///
/// Computes division of unsigned integral numbers and rounds result up to full number (ceiling).
/// The function works for unsigned integrals only. Signed integrals are converted to corresponding
/// unsigned ones.
///
/// @tparam T1   Type of @p val. Type must be integral (SFINAE).
/// @tparam T2   Type of @p divider. Type must be integral (SFINAE).
///
/// @param val       Divided value. If value is signed, it will be converted to corresponding unsigned type.
/// @param divider   Divider value. If value is signed, it will be converted to corresponding unsigned type.
///
/// @return   Result of ceil(@p val / @p divider). The type of result is determined as if in normal integral
///           division, except each operand is converted to unsigned type if necessary.
template <typename T1, typename T2>
constexpr auto ceil_div(T1 val, T2 divider)
    -> typename std::enable_if<std::is_integral<T1>::value && std::is_integral<T2>::value,
                               decltype(std::declval<typename std::make_unsigned<T1>::type>() / std::declval<typename std::make_unsigned<T2>::type>())>::type
{
    typedef typename std::make_unsigned<T1>::type UT1;
    typedef typename std::make_unsigned<T2>::type UT2;
    typedef decltype(std::declval<UT1>() / std::declval<UT2>()) RetT;

    return static_cast<RetT>((static_cast<UT1>(val) + static_cast<UT2>(divider) - 1U) / static_cast<UT2>(divider));
}
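
// A couple of compile-time sanity checks illustrating ceil_div (these asserts are
// illustrative additions, not part of the original API):
static_assert(ceil_div(7u, 2u) == 4u, "7/2 rounded up is 4");
static_assert(ceil_div(8u, 2u) == 4u, "exact division is unchanged");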

/// Rounds @p val to the nearest multiple of @p rounding that is greater than or equal to @p val.
///
/// The function works for unsigned integrals only. Signed integrals are converted to corresponding
/// unsigned ones.
///
/// @tparam T1       Type of @p val. Type must be integral (SFINAE).
/// @tparam T2       Type of @p rounding. Type must be integral (SFINAE).
///
/// @param val        Value to round up. If value is signed, it will be converted to corresponding unsigned type.
/// @param rounding   Rounding value. If value is signed, it will be converted to corresponding unsigned type.
///
/// @return   @p val rounded up to the nearest multiple of @p rounding. The type of result is determined as if in normal integral
///           division, except each operand is converted to unsigned type if necessary.
template <typename T1, typename T2>
constexpr auto round_up_to(T1 val, T2 rounding)
    -> typename std::enable_if<std::is_integral<T1>::value && std::is_integral<T2>::value,
                               decltype(std::declval<typename std::make_unsigned<T1>::type>() / std::declval<typename std::make_unsigned<T2>::type>())>::type
{
    typedef typename std::make_unsigned<T1>::type UT1;
    typedef typename std::make_unsigned<T2>::type UT2;
    typedef decltype(std::declval<UT1>() / std::declval<UT2>()) RetT;

    return static_cast<RetT>(ceil_div(val, rounding) * static_cast<UT2>(rounding));
}
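
// Compile-time sanity checks for round_up_to (illustrative additions):
static_assert(round_up_to(13u, 8u) == 16u, "13 rounds up to 16");
static_assert(round_up_to(16u, 8u) == 16u, "multiples are unchanged");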

///
/// \brief Converts C API float array to std::vector<float>
///
inline std::vector<float> float_arr_to_vector(const cldnn_float_arr& arr)
{
    std::vector<float> result(arr.size);
    for (size_t i = 0; i < arr.size; i++)
    {
        result[i] = arr.data[i];
    }
    return result;
}

///
/// \brief Converts C API uint16_t array to std::vector<uint16_t>
///
inline std::vector<uint16_t> uint16_t_arr_to_vector(const cldnn_uint16_t_arr& arr)
{
    std::vector<uint16_t> result(arr.size);
    for (size_t i = 0; i < arr.size; i++)
    {
        result[i] = arr.data[i];
    }
    return result;
}


///
/// \brief Converts std::vector<float> to C API float_array
///
inline cldnn_float_arr float_vector_to_arr(const std::vector<float>& stor)
{
    return { stor.data(), stor.size() };
}

///
/// \brief Converts std::vector<uint16_t> to C API uint16_t array
///
inline cldnn_uint16_t_arr uint16_t_vector_to_arr(const std::vector<uint16_t>& stor)
{
    return { stor.data(), stor.size() };
}

///
/// \brief Converts std::vector<tensor> to C API tensor_array
///
inline cldnn_tensor_arr tensor_vector_to_arr(const std::vector<cldnn_tensor>& stor)
{
    return cldnn_tensor_arr{ stor.data(), stor.size() };
}
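
// A minimal usage sketch for the converters above. Note that the *_vector_to_arr
// helpers return non-owning views into the vector's storage, so the vector must
// outlive the returned array (the names below are illustrative):
//
//     std::vector<float> weights = { 0.1f, 0.2f, 0.3f };
//     cldnn_float_arr arr = float_vector_to_arr(weights); // points at weights.data()
//     std::vector<float> copy = float_arr_to_vector(arr); // deep copy back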

/// @}

/// @endcond

/// @}
}