Diffstat (limited to 'lib/jxl/dct-inl.h')
-rw-r--r-- | lib/jxl/dct-inl.h | 77
1 file changed, 41 insertions, 36 deletions
diff --git a/lib/jxl/dct-inl.h b/lib/jxl/dct-inl.h
index 5326060..cb6c54b 100644
--- a/lib/jxl/dct-inl.h
+++ b/lib/jxl/dct-inl.h
@@ -154,12 +154,12 @@ struct DCT1DImpl;
 
 template <size_t SZ>
 struct DCT1DImpl<1, SZ> {
-  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem, float*) {}
 };
 
 template <size_t SZ>
 struct DCT1DImpl<2, SZ> {
-  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem, float*) {
     auto in1 = Load(FV<SZ>(), mem);
     auto in2 = Load(FV<SZ>(), mem + SZ);
     Store(Add(in1, in2), FV<SZ>(), mem);
@@ -169,14 +169,12 @@ struct DCT1DImpl<2, SZ> {
 
 template <size_t N, size_t SZ>
 struct DCT1DImpl {
-  void operator()(float* JXL_RESTRICT mem) {
-    // This is relatively small (4kB with 64-DCT and AVX-512)
-    HWY_ALIGN float tmp[N * SZ];
+  void operator()(float* JXL_RESTRICT mem, float* JXL_RESTRICT tmp) {
     CoeffBundle<N / 2, SZ>::AddReverse(mem, mem + N / 2 * SZ, tmp);
-    DCT1DImpl<N / 2, SZ>()(tmp);
+    DCT1DImpl<N / 2, SZ>()(tmp, tmp + N * SZ);
     CoeffBundle<N / 2, SZ>::SubReverse(mem, mem + N / 2 * SZ, tmp + N / 2 * SZ);
     CoeffBundle<N, SZ>::Multiply(tmp);
-    DCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ);
+    DCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, tmp + N * SZ);
     CoeffBundle<N / 2, SZ>::B(tmp + N / 2 * SZ);
     CoeffBundle<N, SZ>::InverseEvenOdd(tmp, mem);
   }
@@ -188,7 +186,7 @@ struct IDCT1DImpl;
 template <size_t SZ>
 struct IDCT1DImpl<1, SZ> {
   JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
-                             size_t to_stride) {
+                             size_t to_stride, float* JXL_RESTRICT) {
     StoreU(LoadU(FV<SZ>(), from), FV<SZ>(), to);
   }
 };
@@ -196,7 +194,7 @@ struct IDCT1DImpl<1, SZ> {
 template <size_t SZ>
 struct IDCT1DImpl<2, SZ> {
   JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
-                             size_t to_stride) {
+                             size_t to_stride, float* JXL_RESTRICT) {
     JXL_DASSERT(from_stride >= SZ);
     JXL_DASSERT(to_stride >= SZ);
     auto in1 = LoadU(FV<SZ>(), from);
@@ -209,74 +207,79 @@ struct IDCT1DImpl<2, SZ> {
 template <size_t N, size_t SZ>
 struct IDCT1DImpl {
   void operator()(const float* from, size_t from_stride, float* to,
-                  size_t to_stride) {
+                  size_t to_stride, float* JXL_RESTRICT tmp) {
     JXL_DASSERT(from_stride >= SZ);
     JXL_DASSERT(to_stride >= SZ);
-    // This is relatively small (4kB with 64-DCT and AVX-512)
-    HWY_ALIGN float tmp[N * SZ];
     CoeffBundle<N, SZ>::ForwardEvenOdd(from, from_stride, tmp);
-    IDCT1DImpl<N / 2, SZ>()(tmp, SZ, tmp, SZ);
+    IDCT1DImpl<N / 2, SZ>()(tmp, SZ, tmp, SZ, tmp + N * SZ);
     CoeffBundle<N / 2, SZ>::BTranspose(tmp + N / 2 * SZ);
-    IDCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ);
+    IDCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ,
+                            tmp + N * SZ);
     CoeffBundle<N, SZ>::MultiplyAndAdd(tmp, to, to_stride);
   }
 };
 
 template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
-void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
+void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp,
+                  float* JXL_RESTRICT tmp) {
   size_t M = M_or_0 != 0 ? M_or_0 : Mp;
   constexpr size_t SZ = MaxLanes(FV<M_or_0>());
-  HWY_ALIGN float tmp[N * SZ];
   for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {
     // TODO(veluca): consider removing the temporary memory here (as is done in
     // IDCT), if it turns out that some compilers don't optimize away the loads
     // and this is performance-critical.
     CoeffBundle<N, SZ>::LoadFromBlock(from, i, tmp);
-    DCT1DImpl<N, SZ>()(tmp);
+    DCT1DImpl<N, SZ>()(tmp, tmp + N * SZ);
     CoeffBundle<N, SZ>::StoreToBlockAndScale(tmp, to, i);
   }
 }
 
 template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
-void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
+void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp,
+                   float* JXL_RESTRICT tmp) {
   size_t M = M_or_0 != 0 ? M_or_0 : Mp;
   constexpr size_t SZ = MaxLanes(FV<M_or_0>());
   for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {
     IDCT1DImpl<N, SZ>()(from.Address(0, i), from.Stride(), to.Address(0, i),
-                        to.Stride());
+                        to.Stride(), tmp);
   }
 }
 
 template <size_t N, size_t M, typename = void>
 struct DCT1D {
   template <typename FromBlock, typename ToBlock>
-  void operator()(const FromBlock& from, const ToBlock& to) {
-    return DCT1DWrapper<N, M>(from, to, M);
+  void operator()(const FromBlock& from, const ToBlock& to,
+                  float* JXL_RESTRICT tmp) {
+    return DCT1DWrapper<N, M>(from, to, M, tmp);
   }
 };
 
 template <size_t N, size_t M>
 struct DCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
   template <typename FromBlock, typename ToBlock>
-  void operator()(const FromBlock& from, const ToBlock& to) {
-    return NoInlineWrapper(DCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M);
+  void operator()(const FromBlock& from, const ToBlock& to,
+                  float* JXL_RESTRICT tmp) {
+    return NoInlineWrapper(DCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M,
+                           tmp);
   }
 };
 
 template <size_t N, size_t M, typename = void>
 struct IDCT1D {
   template <typename FromBlock, typename ToBlock>
-  void operator()(const FromBlock& from, const ToBlock& to) {
-    return IDCT1DWrapper<N, M>(from, to, M);
+  void operator()(const FromBlock& from, const ToBlock& to,
+                  float* JXL_RESTRICT tmp) {
+    return IDCT1DWrapper<N, M>(from, to, M, tmp);
   }
 };
 
 template <size_t N, size_t M>
 struct IDCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
   template <typename FromBlock, typename ToBlock>
-  void operator()(const FromBlock& from, const ToBlock& to) {
-    return NoInlineWrapper(IDCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to,
-                           M);
+  void operator()(const FromBlock& from, const ToBlock& to,
+                  float* JXL_RESTRICT tmp) {
+    return NoInlineWrapper(IDCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M,
+                           tmp);
   }
 };
 
@@ -290,15 +293,16 @@ struct ComputeScaledDCT {
   HWY_MAYBE_UNUSED void operator()(const From& from, float* to,
                                    float* JXL_RESTRICT scratch_space) {
     float* JXL_RESTRICT block = scratch_space;
+    float* JXL_RESTRICT tmp = scratch_space + ROWS * COLS;
     if (ROWS < COLS) {
-      DCT1D<ROWS, COLS>()(from, DCTTo(block, COLS));
+      DCT1D<ROWS, COLS>()(from, DCTTo(block, COLS), tmp);
       Transpose<ROWS, COLS>::Run(DCTFrom(block, COLS), DCTTo(to, ROWS));
-      DCT1D<COLS, ROWS>()(DCTFrom(to, ROWS), DCTTo(block, ROWS));
+      DCT1D<COLS, ROWS>()(DCTFrom(to, ROWS), DCTTo(block, ROWS), tmp);
       Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(to, COLS));
     } else {
-      DCT1D<ROWS, COLS>()(from, DCTTo(to, COLS));
+      DCT1D<ROWS, COLS>()(from, DCTTo(to, COLS), tmp);
       Transpose<ROWS, COLS>::Run(DCTFrom(to, COLS), DCTTo(block, ROWS));
-      DCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(to, ROWS));
+      DCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(to, ROWS), tmp);
     }
   }
 };
@@ -312,16 +316,17 @@ struct ComputeScaledIDCT {
   HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to,
                                    float* JXL_RESTRICT scratch_space) {
     float* JXL_RESTRICT block = scratch_space;
+    float* JXL_RESTRICT tmp = scratch_space + ROWS * COLS;
     // Reverse the steps done in ComputeScaledDCT.
     if (ROWS < COLS) {
       Transpose<ROWS, COLS>::Run(DCTFrom(from, COLS), DCTTo(block, ROWS));
-      IDCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(from, ROWS));
+      IDCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(from, ROWS), tmp);
       Transpose<COLS, ROWS>::Run(DCTFrom(from, ROWS), DCTTo(block, COLS));
-      IDCT1D<ROWS, COLS>()(DCTFrom(block, COLS), to);
+      IDCT1D<ROWS, COLS>()(DCTFrom(block, COLS), to, tmp);
     } else {
-      IDCT1D<COLS, ROWS>()(DCTFrom(from, ROWS), DCTTo(block, ROWS));
+      IDCT1D<COLS, ROWS>()(DCTFrom(from, ROWS), DCTTo(block, ROWS), tmp);
       Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(from, COLS));
-      IDCT1D<ROWS, COLS>()(DCTFrom(from, COLS), to);
+      IDCT1D<ROWS, COLS>()(DCTFrom(from, COLS), to, tmp);
    }
  }
};
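
Note (editorial, not part of the patch): the change removes the stack-allocated HWY_ALIGN float tmp[N * SZ] buffers from DCT1DImpl, IDCT1DImpl and DCT1DWrapper and instead threads a caller-provided tmp pointer through every recursion level; ComputeScaledDCT and ComputeScaledIDCT now carve that area out of scratch_space just past the ROWS * COLS block region, so callers must supply a correspondingly larger scratch buffer. Below is a minimal sketch of what a caller might look like after this change, assuming a hypothetical 8x8 forward DCT inside SIMD code that includes dct-inl.h; the scratch size of 3 * ROWS * COLS floats is an illustrative estimate, not a value stated in this hunk.

// Hedged sketch: kRows, kCols, the pixel/coefficient buffers and the
// 3 * kRows * kCols scratch size are assumptions chosen for illustration;
// only ComputeScaledDCT, DCTFrom and HWY_ALIGN come from the header itself.
constexpr size_t kRows = 8;
constexpr size_t kCols = 8;
HWY_ALIGN float pixels[kRows * kCols] = {};  // input samples, row-major
HWY_ALIGN float coeffs[kRows * kCols];       // DCT coefficients (output)
// kRows * kCols floats for the transpose buffer ("block") plus room for the
// per-batch recursion temporaries ("tmp"); 3x the block area is a generous
// upper-bound estimate.
HWY_ALIGN float scratch[3 * kRows * kCols];
ComputeScaledDCT<kRows, kCols>()(DCTFrom(pixels, kCols), coeffs, scratch);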