summaryrefslogtreecommitdiff
path: root/lib/jxl/dct-inl.h
diff options
context:
space:
mode:
Diffstat (limited to 'lib/jxl/dct-inl.h')
-rw-r--r--lib/jxl/dct-inl.h77
1 files changed, 41 insertions, 36 deletions
diff --git a/lib/jxl/dct-inl.h b/lib/jxl/dct-inl.h
index 5326060..cb6c54b 100644
--- a/lib/jxl/dct-inl.h
+++ b/lib/jxl/dct-inl.h
@@ -154,12 +154,12 @@ struct DCT1DImpl;
template <size_t SZ>
struct DCT1DImpl<1, SZ> {
- JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
+ JXL_INLINE void operator()(float* JXL_RESTRICT mem, float*) {}
};
template <size_t SZ>
struct DCT1DImpl<2, SZ> {
- JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
+ JXL_INLINE void operator()(float* JXL_RESTRICT mem, float*) {
auto in1 = Load(FV<SZ>(), mem);
auto in2 = Load(FV<SZ>(), mem + SZ);
Store(Add(in1, in2), FV<SZ>(), mem);
@@ -169,14 +169,12 @@ struct DCT1DImpl<2, SZ> {
template <size_t N, size_t SZ>
struct DCT1DImpl {
- void operator()(float* JXL_RESTRICT mem) {
- // This is relatively small (4kB with 64-DCT and AVX-512)
- HWY_ALIGN float tmp[N * SZ];
+ void operator()(float* JXL_RESTRICT mem, float* JXL_RESTRICT tmp) {
CoeffBundle<N / 2, SZ>::AddReverse(mem, mem + N / 2 * SZ, tmp);
- DCT1DImpl<N / 2, SZ>()(tmp);
+ DCT1DImpl<N / 2, SZ>()(tmp, tmp + N * SZ);
CoeffBundle<N / 2, SZ>::SubReverse(mem, mem + N / 2 * SZ, tmp + N / 2 * SZ);
CoeffBundle<N, SZ>::Multiply(tmp);
- DCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ);
+ DCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, tmp + N * SZ);
CoeffBundle<N / 2, SZ>::B(tmp + N / 2 * SZ);
CoeffBundle<N, SZ>::InverseEvenOdd(tmp, mem);
}
@@ -188,7 +186,7 @@ struct IDCT1DImpl;
template <size_t SZ>
struct IDCT1DImpl<1, SZ> {
JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
- size_t to_stride) {
+ size_t to_stride, float* JXL_RESTRICT) {
StoreU(LoadU(FV<SZ>(), from), FV<SZ>(), to);
}
};
@@ -196,7 +194,7 @@ struct IDCT1DImpl<1, SZ> {
template <size_t SZ>
struct IDCT1DImpl<2, SZ> {
JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
- size_t to_stride) {
+ size_t to_stride, float* JXL_RESTRICT) {
JXL_DASSERT(from_stride >= SZ);
JXL_DASSERT(to_stride >= SZ);
auto in1 = LoadU(FV<SZ>(), from);
@@ -209,74 +207,79 @@ struct IDCT1DImpl<2, SZ> {
template <size_t N, size_t SZ>
struct IDCT1DImpl {
void operator()(const float* from, size_t from_stride, float* to,
- size_t to_stride) {
+ size_t to_stride, float* JXL_RESTRICT tmp) {
JXL_DASSERT(from_stride >= SZ);
JXL_DASSERT(to_stride >= SZ);
- // This is relatively small (4kB with 64-DCT and AVX-512)
- HWY_ALIGN float tmp[N * SZ];
CoeffBundle<N, SZ>::ForwardEvenOdd(from, from_stride, tmp);
- IDCT1DImpl<N / 2, SZ>()(tmp, SZ, tmp, SZ);
+ IDCT1DImpl<N / 2, SZ>()(tmp, SZ, tmp, SZ, tmp + N * SZ);
CoeffBundle<N / 2, SZ>::BTranspose(tmp + N / 2 * SZ);
- IDCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ);
+ IDCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ,
+ tmp + N * SZ);
CoeffBundle<N, SZ>::MultiplyAndAdd(tmp, to, to_stride);
}
};
template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
-void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
+void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp,
+ float* JXL_RESTRICT tmp) {
size_t M = M_or_0 != 0 ? M_or_0 : Mp;
constexpr size_t SZ = MaxLanes(FV<M_or_0>());
- HWY_ALIGN float tmp[N * SZ];
for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {
// TODO(veluca): consider removing the temporary memory here (as is done in
// IDCT), if it turns out that some compilers don't optimize away the loads
// and this is performance-critical.
CoeffBundle<N, SZ>::LoadFromBlock(from, i, tmp);
- DCT1DImpl<N, SZ>()(tmp);
+ DCT1DImpl<N, SZ>()(tmp, tmp + N * SZ);
CoeffBundle<N, SZ>::StoreToBlockAndScale(tmp, to, i);
}
}
template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
-void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
+void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp,
+ float* JXL_RESTRICT tmp) {
size_t M = M_or_0 != 0 ? M_or_0 : Mp;
constexpr size_t SZ = MaxLanes(FV<M_or_0>());
for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {
IDCT1DImpl<N, SZ>()(from.Address(0, i), from.Stride(), to.Address(0, i),
- to.Stride());
+ to.Stride(), tmp);
}
}
template <size_t N, size_t M, typename = void>
struct DCT1D {
template <typename FromBlock, typename ToBlock>
- void operator()(const FromBlock& from, const ToBlock& to) {
- return DCT1DWrapper<N, M>(from, to, M);
+ void operator()(const FromBlock& from, const ToBlock& to,
+ float* JXL_RESTRICT tmp) {
+ return DCT1DWrapper<N, M>(from, to, M, tmp);
}
};
template <size_t N, size_t M>
struct DCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
template <typename FromBlock, typename ToBlock>
- void operator()(const FromBlock& from, const ToBlock& to) {
- return NoInlineWrapper(DCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M);
+ void operator()(const FromBlock& from, const ToBlock& to,
+ float* JXL_RESTRICT tmp) {
+ return NoInlineWrapper(DCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M,
+ tmp);
}
};
template <size_t N, size_t M, typename = void>
struct IDCT1D {
template <typename FromBlock, typename ToBlock>
- void operator()(const FromBlock& from, const ToBlock& to) {
- return IDCT1DWrapper<N, M>(from, to, M);
+ void operator()(const FromBlock& from, const ToBlock& to,
+ float* JXL_RESTRICT tmp) {
+ return IDCT1DWrapper<N, M>(from, to, M, tmp);
}
};
template <size_t N, size_t M>
struct IDCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
template <typename FromBlock, typename ToBlock>
- void operator()(const FromBlock& from, const ToBlock& to) {
- return NoInlineWrapper(IDCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to,
- M);
+ void operator()(const FromBlock& from, const ToBlock& to,
+ float* JXL_RESTRICT tmp) {
+ return NoInlineWrapper(IDCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M,
+ tmp);
}
};
@@ -290,15 +293,16 @@ struct ComputeScaledDCT {
HWY_MAYBE_UNUSED void operator()(const From& from, float* to,
float* JXL_RESTRICT scratch_space) {
float* JXL_RESTRICT block = scratch_space;
+ float* JXL_RESTRICT tmp = scratch_space + ROWS * COLS;
if (ROWS < COLS) {
- DCT1D<ROWS, COLS>()(from, DCTTo(block, COLS));
+ DCT1D<ROWS, COLS>()(from, DCTTo(block, COLS), tmp);
Transpose<ROWS, COLS>::Run(DCTFrom(block, COLS), DCTTo(to, ROWS));
- DCT1D<COLS, ROWS>()(DCTFrom(to, ROWS), DCTTo(block, ROWS));
+ DCT1D<COLS, ROWS>()(DCTFrom(to, ROWS), DCTTo(block, ROWS), tmp);
Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(to, COLS));
} else {
- DCT1D<ROWS, COLS>()(from, DCTTo(to, COLS));
+ DCT1D<ROWS, COLS>()(from, DCTTo(to, COLS), tmp);
Transpose<ROWS, COLS>::Run(DCTFrom(to, COLS), DCTTo(block, ROWS));
- DCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(to, ROWS));
+ DCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(to, ROWS), tmp);
}
}
};
@@ -312,16 +316,17 @@ struct ComputeScaledIDCT {
HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to,
float* JXL_RESTRICT scratch_space) {
float* JXL_RESTRICT block = scratch_space;
+ float* JXL_RESTRICT tmp = scratch_space + ROWS * COLS;
// Reverse the steps done in ComputeScaledDCT.
if (ROWS < COLS) {
Transpose<ROWS, COLS>::Run(DCTFrom(from, COLS), DCTTo(block, ROWS));
- IDCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(from, ROWS));
+ IDCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(from, ROWS), tmp);
Transpose<COLS, ROWS>::Run(DCTFrom(from, ROWS), DCTTo(block, COLS));
- IDCT1D<ROWS, COLS>()(DCTFrom(block, COLS), to);
+ IDCT1D<ROWS, COLS>()(DCTFrom(block, COLS), to, tmp);
} else {
- IDCT1D<COLS, ROWS>()(DCTFrom(from, ROWS), DCTTo(block, ROWS));
+ IDCT1D<COLS, ROWS>()(DCTFrom(from, ROWS), DCTTo(block, ROWS), tmp);
Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(from, COLS));
- IDCT1D<ROWS, COLS>()(DCTFrom(from, COLS), to);
+ IDCT1D<ROWS, COLS>()(DCTFrom(from, COLS), to, tmp);
}
}
};