summaryrefslogtreecommitdiff
path: root/lib/jpegli/upsample.cc
blob: 5559aa78a65e6b342a3a640c5097bfdfc255602f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "lib/jpegli/upsample.h"

#include <string.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/jpegli/upsample.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>

HWY_BEFORE_NAMESPACE();
namespace jpegli {
namespace HWY_NAMESPACE {

// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::MulAdd;
using hwy::HWY_NAMESPACE::Vec;

#if HWY_CAP_GE512
using hwy::HWY_NAMESPACE::Half;
using hwy::HWY_NAMESPACE::Vec;
template <size_t i, class DF, class V>
HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
  using HF = Half<DF>;
  using HHF = Half<HF>;
  auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
  return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
}

template <class DF, class V>
HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
  using HF = Half<DF>;
  return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
}

#endif

// Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
// aligned.
template <class DF, class V, typename T>
void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
#if HWY_TARGET == HWY_SCALAR
  Store(v0, df, mem);
  Store(v1, df, mem + 1);
#elif !HWY_CAP_GE256
  Store(InterleaveLower(df, v0, v1), df, mem);
  Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
#else
  if (!HWY_CAP_GE512 || Lanes(df) == 8) {
    auto t0 = InterleaveLower(df, v0, v1);
    auto t1 = InterleaveUpper(df, v0, v1);
    Store(ConcatLowerLower(df, t1, t0), df, mem);
    Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
  } else {
#if HWY_CAP_GE512
    auto t0 = InterleaveLower(df, v0, v1);
    auto t1 = InterleaveUpper(df, v0, v1);
    Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
                  Quarter<1>(df, t0), Quarter<1>(df, t1)),
          df, mem);
    Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
                  Quarter<3>(df, t0), Quarter<3>(df, t1)),
          df, mem + Lanes(df));
#endif
  }
#endif
}

void Upsample2Horizontal(float* JXL_RESTRICT row,
                         float* JXL_RESTRICT scratch_space, size_t len_out) {
  HWY_FULL(float) df;
  auto threefour = Set(df, 0.75f);
  auto onefour = Set(df, 0.25f);
  const size_t len_in = (len_out + 1) >> 1;
  memcpy(scratch_space, row, len_in * sizeof(row[0]));
  scratch_space[-1] = scratch_space[0];
  scratch_space[len_in] = scratch_space[len_in - 1];
  for (size_t x = 0; x < len_in; x += Lanes(df)) {
    auto current = Mul(Load(df, scratch_space + x), threefour);
    auto prev = LoadU(df, scratch_space + x - 1);
    auto next = LoadU(df, scratch_space + x + 1);
    auto left = MulAdd(onefour, prev, current);
    auto right = MulAdd(onefour, next, current);
    StoreInterleaved(df, left, right, row + x * 2);
  }
}

void Upsample2Vertical(const float* JXL_RESTRICT row_top,
                       const float* JXL_RESTRICT row_mid,
                       const float* JXL_RESTRICT row_bot,
                       float* JXL_RESTRICT row_out0,
                       float* JXL_RESTRICT row_out1, size_t len) {
  HWY_FULL(float) df;
  auto threefour = Set(df, 0.75f);
  auto onefour = Set(df, 0.25f);
  for (size_t x = 0; x < len; x += Lanes(df)) {
    auto it = Load(df, row_top + x);
    auto im = Load(df, row_mid + x);
    auto ib = Load(df, row_bot + x);
    auto im_scaled = Mul(im, threefour);
    Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
    Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
  }
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace jpegli
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace jpegli {

HWY_EXPORT(Upsample2Horizontal);
HWY_EXPORT(Upsample2Vertical);

void Upsample2Horizontal(float* JXL_RESTRICT row,
                         float* JXL_RESTRICT scratch_space, size_t len_out) {
  return HWY_DYNAMIC_DISPATCH(Upsample2Horizontal)(row, scratch_space, len_out);
}

void Upsample2Vertical(const float* JXL_RESTRICT row_top,
                       const float* JXL_RESTRICT row_mid,
                       const float* JXL_RESTRICT row_bot,
                       float* JXL_RESTRICT row_out0,
                       float* JXL_RESTRICT row_out1, size_t len) {
  return HWY_DYNAMIC_DISPATCH(Upsample2Vertical)(row_top, row_mid, row_bot,
                                                 row_out0, row_out1, len);
}
}  // namespace jpegli
#endif  // HWY_ONCE