summaryrefslogtreecommitdiff
path: root/runtimes/nn/depend/external/gemmlowp/profiling/profiler.h
blob: a18c036c81d56ba69d7b8f29c00dc31df7eaebd3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// profiler.h: a simple sampling profiler that's always just one #include away!
//
// Overview
// ========
//
// This profiler only samples a pseudo-stack, not the actual call stack.
// The code to be profiled needs to be instrumented with
// pseudo-stack "labels", see ScopedProfilingLabel.
// Using pseudo-stacks allows this profiler to be very simple, low-overhead,
// portable, and independent of compilation details such as function inlining
// and frame pointers. The granularity of instrumentation can be freely chosen,
// and it is possible to get some annotate-like detail, i.e. detail within one
// function without splitting it into multiple functions.
//
// This profiler should remain small and simple; its key feature is to fit in
// a single header file so that there should never be a reason to refrain
// from profiling. More complex and feature-rich alternatives are
// readily available. This one offers a strict superset of its
// functionality: https://github.com/bgirard/GeckoProfiler, including
// intertwining pseudostacks with real call stacks, more annotation options,
// and advanced visualization.
//
// Usage
// =====
//
// 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is
//    not enabled, profiling instrumentation from instrumentation.h
//    (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
//    is still defined but does nothing. On the other hand,
//    when profiling is not enabled, it is an error to #include the
//    present file.
//
// 1. Each thread can opt in to profiling by calling
//    RegisterCurrentThreadForProfiling() defined in instrumentation.h.
//    This can be done at any time, before or during profiling.
//    No sample will be collected from a thread until
//    it has called RegisterCurrentThreadForProfiling().
//
// 2. Instrument your code to be profiled with ScopedProfilingLabel,
//    which is a RAII helper defined in instrumentation.h. The identifier
//    names (some_label, etc) do not matter; what will show up
//    in the profile is the string passed to the constructor, which
//    must be a literal string. See the full example below.
//
//    Note: the overhead of ScopedProfilingLabel is zero when not
//    enabling profiling (when not defining GEMMLOWP_PROFILING).
//
// 3. Use the profiler.h interface to control profiling. There are two
//    functions: StartProfiling() and FinishProfiling(). They must be
//    called on the same thread. FinishProfiling() prints the profile
//    on stdout.
//
// Full example
// ============
/*
    #define GEMMLOWP_PROFILING
    #include "profiling/instrumentation.h"
    using namespace gemmlowp;

    const int iters = 100000000;
    volatile int i;

    void Bar() {
      ScopedProfilingLabel label("Bar");
      for (i = 0; i < iters; i++) {}
    }

    void Foo() {
      ScopedProfilingLabel label("Foo");
      for (i = 0; i < iters; i++) {}
      Bar();
    }

    void Init() {
      RegisterCurrentThreadForProfiling();
    }

    #include "profiling/profiler.h"

    int main() {
      Init();
      StartProfiling();
      Foo();
      FinishProfiling();
    }
*
* Output:
*
    gemmlowp profile (1 threads, 304 samples)
    100.00% Foo
        51.32% other
        48.68% Bar
    0.00% other (outside of any label)
*/
//
// Interpreting results
// ====================
//
//  Each node shows the absolute percentage, among all the samples,
//  of the number of samples that recorded the given pseudo-stack.
//  The percentages are *NOT* relative to the parent node. In addition
//  to your own labels, you will also see 'other' nodes that collect
//  the remainder of samples under the parent node that didn't fall into
//  any of the labelled child nodes. Example:
//
//  20% Foo
//      12% Bar
//      6% Xyz
//      2% other
//
//  This means that 20% of all labels were under Foo, of which 12%/20%==60%
//  were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not
//  under either Bar or Xyz.
//
//  Typically, one wants to keep adding ScopedProfilingLabel's until
//  the 'other' nodes show low percentages.
//
// Interpreting results with multiple threads
// ==========================================
//
// At each sample, each thread registered for profiling gets sampled once.
// So if there is one "main thread" spending its time in MainFunc() and
// 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the
// samples will be in WorkerFunc, so the profile will look like this:
//
// 80% WorkerFunc
// 20% MainFunc

#ifndef GEMMLOWP_PROFILING_PROFILER_H_
#define GEMMLOWP_PROFILING_PROFILER_H_

#ifndef GEMMLOWP_PROFILING
#error Profiling is not enabled!
#endif

#include <vector>

#include "instrumentation.h"

namespace gemmlowp {

// A tree view of a profile.
class ProfileTreeView {
  struct Node {
    std::vector<Node*> children;
    const char* label;
    std::size_t weight;
    Node() : label(nullptr), weight(0) {}
    ~Node() {
      for (auto child : children) {
        delete child;
      }
    }
  };

  static bool CompareNodes(Node* n1, Node* n2) {
    return n1->weight > n2->weight;
  }

  Node root_;

  void PrintNode(const Node* node, int level) const {
    if (level) {
      for (int i = 1; i < level; i++) {
        printf("    ");
      }
      printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label);
    }
    for (auto child : node->children) {
      PrintNode(child, level + 1);
    }
  }

  static void AddStackToNode(const ProfilingStack& stack, Node* node,
                             std::size_t level) {
    node->weight++;
    if (stack.size == level) {
      return;
    }
    Node* child_to_add_to = nullptr;
    for (auto child : node->children) {
      if (child->label == stack.labels[level]) {
        child_to_add_to = child;
        break;
      }
    }
    if (!child_to_add_to) {
      child_to_add_to = new Node;
      child_to_add_to->label = stack.labels[level];
      node->children.push_back(child_to_add_to);
    }
    AddStackToNode(stack, child_to_add_to, level + 1);
    return;
  }

  void AddStack(const ProfilingStack& stack) {
    AddStackToNode(stack, &root_, 0);
  }

  void AddOtherChildrenToNode(Node* node) {
    std::size_t top_level_children_weight = 0;
    for (auto c : node->children) {
      AddOtherChildrenToNode(c);
      top_level_children_weight += c->weight;
    }
    if (top_level_children_weight) {
      Node* other_child = new Node;
      other_child->label =
          node == &root_ ? "other (outside of any label)" : "other";
      other_child->weight = node->weight - top_level_children_weight;
      node->children.push_back(other_child);
    }
  }

  void AddOtherNodes() { AddOtherChildrenToNode(&root_); }

  void SortNode(Node* node) {
    std::sort(node->children.begin(), node->children.end(), CompareNodes);
    for (auto child : node->children) {
      SortNode(child);
    }
  }

  void Sort() { SortNode(&root_); }

 public:
  explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) {
    for (auto stack : stacks) {
      AddStack(stack);
    }
    AddOtherNodes();
    Sort();
  }

  void Print() const {
    printf("\n");
    printf("gemmlowp profile (%d threads, %d samples)\n",
           static_cast<int>(ThreadsUnderProfiling().size()),
           static_cast<int>(root_.weight));
    PrintNode(&root_, 0);
    printf("\n");
  }
};

// This function is the only place that determines our sampling frequency.
inline void WaitOneProfilerTick() {
  static const int millisecond = 1000000;

#if defined __arm__ || defined __aarch64__
  // Reduced sampling frequency on mobile devices helps limit time and memory
  // overhead there.
  static const int interval = 10 * millisecond;
#else
  static const int interval = 1 * millisecond;
#endif

  timespec ts;
  ts.tv_sec = 0;
  ts.tv_nsec = interval;
  nanosleep(&ts, nullptr);
}

// This is how we track whether we've already started profiling,
// to guard against misuse of the API.
inline bool& IsProfiling() {
  static bool b;
  return b;
}

// This is how we tell the profiler thread to finish.
inline bool& ProfilerThreadShouldFinish() {
  static bool b;
  return b;
}

// The profiler thread. See ProfilerThreadFunc.
inline pthread_t& ProfilerThread() {
  static pthread_t t;
  return t;
}

// Records a stack from a running thread.
// The tricky part is that we're not interrupting the thread.
// This is OK because we're looking at a pseudo-stack of labels,
// not at the real thread stack, and if the pseudo-stack changes
// while we're recording it, we are OK with getting either the
// old or the new stack. Note that ProfilingStack::Pop
// only decrements the size, and doesn't null the popped label,
// so if we're concurrently recording it, it shouldn't change
// under our feet until another label is pushed, at which point
// we are OK with getting either this new label or the old one.
// In the end, the key atomicity property that we are relying on
// here is that pointers are changed atomically, and the labels
// are pointers (to literal strings).
inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) {
  assert(!dst->size);
  while (dst->size < thread->stack.size) {
    dst->labels[dst->size] = thread->stack.labels[dst->size];
    dst->size++;
    MemoryBarrier();  // thread->stack can change at any time
  }
}

// The profiler thread's entry point.
// Note that a separate thread is to be started each time we call
// StartProfiling(), and finishes when we call FinishProfiling().
// So here we only need to handle the recording and reporting of
// a single profile.
inline void* ProfilerThreadFunc(void*) {
  assert(ProfilerThread() == pthread_self());

  // Since we only handle one profile per profiler thread, the
  // profile data (the array of recorded stacks) can be a local variable here.
  std::vector<ProfilingStack> stacks;

  while (!ProfilerThreadShouldFinish()) {
    WaitOneProfilerTick();
    {
      AutoGlobalLock<ProfilerLockId> lock;
      for (auto t : ThreadsUnderProfiling()) {
        ProfilingStack s;
        RecordStack(t, &s);
        stacks.push_back(s);
      }
    }
  }

  // Profiling is finished and we now report the results.
  ProfileTreeView(stacks).Print();

  return nullptr;
}

// Starts recording samples.
inline void StartProfiling() {
  AutoGlobalLock<ProfilerLockId> lock;
  ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!");
  IsProfiling() = true;
  ProfilerThreadShouldFinish() = false;
  pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr);
}

// Stops recording samples, and prints a profile tree-view on stdout.
inline void FinishProfiling() {
  {
    AutoGlobalLock<ProfilerLockId> lock;
    ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!");
    // The ProfilerThreadShouldFinish() mechanism here is really naive and bad,
    // as the scary comments below should make clear.
    // Should we use a condition variable?
    ProfilerThreadShouldFinish() = true;
  }  // must release the lock here to avoid deadlock with profiler thread.
  pthread_join(ProfilerThread(), nullptr);
  IsProfiling() = false;  // yikes, this should be guarded by the lock!
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_PROFILING_PROFILER_H_