diff options
author | jiyong.min <jiyong.min@samsung.com> | 2022-12-02 12:23:48 +0900 |
---|---|---|
committer | jiyong.min <jiyong.min@samsung.com> | 2022-12-02 12:25:00 +0900 |
commit | 7fa2aaed0a5c855460b77fb1fedcc01591ff6470 (patch) | |
tree | d06014f551165519387ed1d3a2645ad24b90c3f1 /lib/jxl/convolve-inl.h | |
parent | b8aee4491f6498d0cc17d18c80f6b3505e2a62ef (diff) | |
download | libjxl-7fa2aaed0a5c855460b77fb1fedcc01591ff6470.tar.gz libjxl-7fa2aaed0a5c855460b77fb1fedcc01591ff6470.tar.bz2 libjxl-7fa2aaed0a5c855460b77fb1fedcc01591ff6470.zip |
Imported Upstream version 0.7.0upstream/0.7.0
Change-Id: Ice1364b8cdf314cd19cc443865a3c02b7b0c1d5c
Diffstat (limited to 'lib/jxl/convolve-inl.h')
-rw-r--r-- | lib/jxl/convolve-inl.h | 184 |
1 files changed, 181 insertions, 3 deletions
diff --git a/lib/jxl/convolve-inl.h b/lib/jxl/convolve-inl.h index 255bb9d..054c9c6 100644 --- a/lib/jxl/convolve-inl.h +++ b/lib/jxl/convolve-inl.h @@ -12,7 +12,10 @@ #include <hwy/highway.h> +#include "lib/jxl/base/profiler.h" #include "lib/jxl/base/status.h" +#include "lib/jxl/image_ops.h" + HWY_BEFORE_NAMESPACE(); namespace jxl { namespace HWY_NAMESPACE { @@ -23,6 +26,7 @@ using hwy::HWY_NAMESPACE::Broadcast; #if HWY_TARGET != HWY_SCALAR using hwy::HWY_NAMESPACE::CombineShiftRightBytes; #endif +using hwy::HWY_NAMESPACE::TableLookupLanes; using hwy::HWY_NAMESPACE::Vec; // Synthesizes left/right neighbors from a vector of center pixels. @@ -44,7 +48,7 @@ class Neighbors { return c; // Same (the first mirrored value is the last valid one) #else // 128 bit // c = LKJI -#if HWY_ARCH_X86 +#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86) return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(2, 1, 0, 0))}; // KJII #else const D d; @@ -72,7 +76,7 @@ class Neighbors { return Zero(d); #else // 128 bit // c = LKJI -#if HWY_ARCH_X86 +#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86) return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(1, 0, 0, 1))}; // JIIJ #else const D d; @@ -98,7 +102,7 @@ class Neighbors { return Zero(d); #else // 128 bit // c = LKJI -#if HWY_ARCH_X86 +#if HWY_TARGET <= (1 << HWY_HIGHEST_TARGET_BIT_X86) return V{_mm_shuffle_ps(c.raw, c.raw, _MM_SHUFFLE(0, 0, 1, 2))}; // IIJK #else const D d; @@ -110,6 +114,180 @@ class Neighbors { } }; +#if HWY_TARGET != HWY_SCALAR + +// Returns indices for SetTableIndices such that TableLookupLanes on the +// rightmost unaligned vector (rightmost sample in its most-significant lane) +// returns the mirrored values, with the mirror outside the last valid sample. +static inline const int32_t* MirrorLanes(const size_t mod) { + const HWY_CAPPED(float, 16) d; + constexpr size_t kN = MaxLanes(d); + + // For mod = `image width mod 16` 0..15: + // last full vec mirrored (mem order) loadedVec mirrorVec idxVec + // 0123456789abcdef| fedcba9876543210 fed..210 012..def 012..def + // 0123456789abcdef|0 0fedcba98765432 0fe..321 234..f00 123..eff + // 0123456789abcdef|01 10fedcba987654 10f..432 456..110 234..ffe + // 0123456789abcdef|012 210fedcba9876 210..543 67..2210 34..ffed + // 0123456789abcdef|0123 3210fedcba98 321..654 8..33210 4..ffedc + // 0123456789abcdef|01234 43210fedcba + // 0123456789abcdef|012345 543210fedc + // 0123456789abcdef|0123456 6543210fe + // 0123456789abcdef|01234567 76543210 + // 0123456789abcdef|012345678 8765432 + // 0123456789abcdef|0123456789 987654 + // 0123456789abcdef|0123456789A A9876 + // 0123456789abcdef|0123456789AB BA98 + // 0123456789abcdef|0123456789ABC CBA + // 0123456789abcdef|0123456789ABCD DC + // 0123456789abcdef|0123456789ABCDE E EDC..10f EED..210 ffe..321 +#if HWY_CAP_GE512 + HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, // + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; +#elif HWY_CAP_GE256 + HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = { + 1, 2, 3, 4, 5, 6, 7, 7, // + 6, 5, 4, 3, 2, 1, 0}; +#else // 128-bit + HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {1, 2, 3, 3, // + 2, 1, 0}; +#endif + return idx_lanes + kN - 1 - mod; +} + +#endif // HWY_TARGET != HWY_SCALAR + +// Single entry point for convolution. +// "Strategy" (Direct*/Separable*) decides kernel size and how to evaluate it. +template <class Strategy> +class ConvolveT { + static constexpr int64_t kRadius = Strategy::kRadius; + using Simd = HWY_CAPPED(float, 16); + + public: + static size_t MinWidth() { +#if HWY_TARGET == HWY_SCALAR + // First/Last use mirrored loads of up to +/- kRadius. + return 2 * kRadius; +#else + return Lanes(Simd()) + kRadius; +#endif + } + + // "Image" is ImageF or Image3F. + template <class Image, class Weights> + static void Run(const Image& in, const Rect& rect, const Weights& weights, + ThreadPool* pool, Image* out) { + PROFILER_ZONE("ConvolveT::Run"); + JXL_CHECK(SameSize(rect, *out)); + JXL_CHECK(rect.xsize() >= MinWidth()); + + static_assert(int64_t(kRadius) <= 3, + "Must handle [0, kRadius) and >= kRadius"); + switch (rect.xsize() % Lanes(Simd())) { + case 0: + return RunRows<0>(in, rect, weights, pool, out); + case 1: + return RunRows<1>(in, rect, weights, pool, out); + case 2: + return RunRows<2>(in, rect, weights, pool, out); + default: + return RunRows<3>(in, rect, weights, pool, out); + } + } + + private: + template <size_t kSizeModN, class WrapRow, class Weights> + static JXL_INLINE void RunRow(const float* JXL_RESTRICT in, + const size_t xsize, const int64_t stride, + const WrapRow& wrap_row, const Weights& weights, + float* JXL_RESTRICT out) { + Strategy::template ConvolveRow<kSizeModN>(in, xsize, stride, wrap_row, + weights, out); + } + + template <size_t kSizeModN, class Weights> + static JXL_INLINE void RunBorderRows(const ImageF& in, const Rect& rect, + const int64_t ybegin, const int64_t yend, + const Weights& weights, ImageF* out) { + const int64_t stride = in.PixelsPerRow(); + const WrapRowMirror wrap_row(in, rect.ysize()); + for (int64_t y = ybegin; y < yend; ++y) { + RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride, wrap_row, + weights, out->Row(y)); + } + } + + // Image3F. + template <size_t kSizeModN, class Weights> + static JXL_INLINE void RunBorderRows(const Image3F& in, const Rect& rect, + const int64_t ybegin, const int64_t yend, + const Weights& weights, Image3F* out) { + const int64_t stride = in.PixelsPerRow(); + for (int64_t y = ybegin; y < yend; ++y) { + for (size_t c = 0; c < 3; ++c) { + const WrapRowMirror wrap_row(in.Plane(c), rect.ysize()); + RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(), stride, + wrap_row, weights, out->PlaneRow(c, y)); + } + } + } + + template <size_t kSizeModN, class Weights> + static JXL_INLINE void RunInteriorRows(const ImageF& in, const Rect& rect, + const int64_t ybegin, + const int64_t yend, + const Weights& weights, + ThreadPool* pool, ImageF* out) { + const int64_t stride = in.PixelsPerRow(); + JXL_CHECK(RunOnPool( + pool, ybegin, yend, ThreadPool::NoInit, + [&](const uint32_t y, size_t /*thread*/) HWY_ATTR { + RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride, + WrapRowUnchanged(), weights, out->Row(y)); + }, + "Convolve")); + } + + // Image3F. + template <size_t kSizeModN, class Weights> + static JXL_INLINE void RunInteriorRows(const Image3F& in, const Rect& rect, + const int64_t ybegin, + const int64_t yend, + const Weights& weights, + ThreadPool* pool, Image3F* out) { + const int64_t stride = in.PixelsPerRow(); + JXL_CHECK(RunOnPool( + pool, ybegin, yend, ThreadPool::NoInit, + [&](const uint32_t y, size_t /*thread*/) HWY_ATTR { + for (size_t c = 0; c < 3; ++c) { + RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(), + stride, WrapRowUnchanged(), weights, + out->PlaneRow(c, y)); + } + }, + "Convolve3")); + } + + template <size_t kSizeModN, class Image, class Weights> + static JXL_INLINE void RunRows(const Image& in, const Rect& rect, + const Weights& weights, ThreadPool* pool, + Image* out) { + const int64_t ysize = rect.ysize(); + RunBorderRows<kSizeModN>(in, rect, 0, std::min(int64_t(kRadius), ysize), + weights, out); + if (ysize > 2 * int64_t(kRadius)) { + RunInteriorRows<kSizeModN>(in, rect, int64_t(kRadius), + ysize - int64_t(kRadius), weights, pool, out); + } + if (ysize > int64_t(kRadius)) { + RunBorderRows<kSizeModN>(in, rect, ysize - int64_t(kRadius), ysize, + weights, out); + } + } +}; + } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE |