1 files changed, 258 insertions, 243 deletions
diff --git a/lib/jxl/enc_frame.cc b/lib/jxl/enc_frame.cc
index f57175b..cfd97e9 100644
--- a/lib/jxl/enc_frame.cc
+++ b/lib/jxl/enc_frame.cc
@@ -19,31 +19,30 @@
 #include "lib/jxl/ac_context.h"
 #include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/ans_params.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/override.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
 #include "lib/jxl/coeff_order.h"
 #include "lib/jxl/coeff_order_fwd.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/common.h"  // kMaxNumPasses
 #include "lib/jxl/compressed_dc.h"
 #include "lib/jxl/dct_util.h"
 #include "lib/jxl/enc_adaptive_quantization.h"
 #include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/enc_cache.h"
 #include "lib/jxl/enc_chroma_from_luma.h"
 #include "lib/jxl/enc_coeff_order.h"
 #include "lib/jxl/enc_context_map.h"
 #include "lib/jxl/enc_entropy_coder.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_gaborish.h"
 #include "lib/jxl/enc_group.h"
 #include "lib/jxl/enc_modular.h"
 #include "lib/jxl/enc_noise.h"
@@ -55,12 +54,13 @@
 #include "lib/jxl/enc_toc.h"
 #include "lib/jxl/enc_xyb.h"
 #include "lib/jxl/fields.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/frame_header.h"
-#include "lib/jxl/gaborish.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/loop_filter.h"
+#include "lib/jxl/modular/options.h"
 #include "lib/jxl/quant_weights.h"
 #include "lib/jxl/quantizer.h"
 #include "lib/jxl/splines.h"
@@ -69,140 +69,21 @@
 namespace jxl {
 namespace {
 
-void ClusterGroups(PassesEncoderState* enc_state) {
-  if (enc_state->shared.frame_header.passes.num_passes > 1) {
-    // TODO(veluca): implement this for progressive modes.
-    return;
-  }
-  // This only considers pass 0 for now.
-  std::vector<uint8_t> context_map;
-  EntropyEncodingData codes;
-  auto& ac = enc_state->passes[0].ac_tokens;
-  size_t limit = std::ceil(std::sqrt(ac.size()));
-  if (limit == 1) return;
-  size_t num_contexts = enc_state->shared.block_ctx_map.NumACContexts();
-  std::vector<float> costs(ac.size());
-  HistogramParams params;
-  params.uint_method = HistogramParams::HybridUintMethod::kNone;
-  params.lz77_method = HistogramParams::LZ77Method::kNone;
-  params.ans_histogram_strategy =
-      HistogramParams::ANSHistogramStrategy::kApproximate;
-  size_t max = 0;
-  auto token_cost = [&](std::vector<std::vector<Token>>& tokens, size_t num_ctx,
-                        bool estimate = true) {
-    // TODO(veluca): not estimating is very expensive.
-    BitWriter writer;
-    size_t c = BuildAndEncodeHistograms(
-        params, num_ctx, tokens, &codes, &context_map,
-        estimate ? nullptr : &writer, 0, /*aux_out=*/0);
-    if (estimate) return c;
-    for (size_t i = 0; i < tokens.size(); i++) {
-      WriteTokens(tokens[i], codes, context_map, &writer, 0, nullptr);
-    }
-    return writer.BitsWritten();
-  };
-  for (size_t i = 0; i < ac.size(); i++) {
-    std::vector<std::vector<Token>> tokens{ac[i]};
-    costs[i] =
-        token_cost(tokens, enc_state->shared.block_ctx_map.NumACContexts());
-    if (costs[i] > costs[max]) {
-      max = i;
-    }
-  }
-  auto dist = [&](int i, int j) {
-    std::vector<std::vector<Token>> tokens{ac[i], ac[j]};
-    return token_cost(tokens, num_contexts) - costs[i] - costs[j];
-  };
-  std::vector<size_t> out{max};
-  std::vector<float> dists(ac.size());
-  size_t farthest = 0;
-  for (size_t i = 0; i < ac.size(); i++) {
-    if (i == max) continue;
-    dists[i] = dist(max, i);
-    if (dists[i] > dists[farthest]) {
-      farthest = i;
-    }
-  }
-
-  while (dists[farthest] > 0 && out.size() < limit) {
-    out.push_back(farthest);
-    dists[farthest] = 0;
-    enc_state->histogram_idx[farthest] = out.size() - 1;
-    for (size_t i = 0; i < ac.size(); i++) {
-      float d = dist(out.back(), i);
-      if (d < dists[i]) {
-        dists[i] = d;
-        enc_state->histogram_idx[i] = out.size() - 1;
-      }
-      if (dists[i] > dists[farthest]) {
-        farthest = i;
-      }
-    }
-  }
-
-  std::vector<size_t> remap(out.size());
-  std::iota(remap.begin(), remap.end(), 0);
-  for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) {
-    enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]];
-  }
-  auto remap_cost = [&](std::vector<size_t> remap) {
-    std::vector<size_t> re_remap(remap.size(), remap.size());
-    size_t r = 0;
-    for (size_t i = 0; i < remap.size(); i++) {
-      if (re_remap[remap[i]] == remap.size()) {
-        re_remap[remap[i]] = r++;
-      }
-      remap[i] = re_remap[remap[i]];
-    }
-    auto tokens = ac;
-    size_t max_hist = 0;
-    for (size_t i = 0; i < tokens.size(); i++) {
-      for (size_t j = 0; j < tokens[i].size(); j++) {
-        size_t hist = remap[enc_state->histogram_idx[i]];
-        tokens[i][j].context += hist * num_contexts;
-        max_hist = std::max(hist + 1, max_hist);
-      }
-    }
-    return token_cost(tokens, max_hist * num_contexts, /*estimate=*/false);
-  };
-
-  for (size_t src = 0; src < out.size(); src++) {
-    float cost = remap_cost(remap);
-    size_t best = src;
-    for (size_t j = src + 1; j < out.size(); j++) {
-      if (remap[src] == remap[j]) continue;
-      auto remap_c = remap;
-      std::replace(remap_c.begin(), remap_c.end(), remap[src], remap[j]);
-      float c = remap_cost(remap_c);
-      if (c < cost) {
-        best = j;
-        cost = c;
-      }
-    }
-    if (src != best) {
-      std::replace(remap.begin(), remap.end(), remap[src], remap[best]);
-    }
-  }
-  std::vector<size_t> re_remap(remap.size(), remap.size());
-  size_t r = 0;
-  for (size_t i = 0; i < remap.size(); i++) {
-    if (re_remap[remap[i]] == remap.size()) {
-      re_remap[remap[i]] = r++;
-    }
-    remap[i] = re_remap[remap[i]];
-  }
+PassDefinition progressive_passes_dc_vlf_lf_full_ac[] = {
+    {/*num_coefficients=*/2, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/4},
+    {/*num_coefficients=*/3, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/2},
+    {/*num_coefficients=*/8, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/0},
+};  // used for cparams.progressive_mode (qprogressive_mode takes priority)
 
-  enc_state->shared.num_histograms =
-      *std::max_element(remap.begin(), remap.end()) + 1;
-  for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) {
-    enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]];
-  }
-  for (size_t i = 0; i < ac.size(); i++) {
-    for (size_t j = 0; j < ac[i].size(); j++) {
-      ac[i][j].context += enc_state->histogram_idx[i] * num_contexts;
-    }
-  }
-}
+PassDefinition progressive_passes_dc_quant_ac_full_ac[] = {
+    {/*num_coefficients=*/8, /*shift=*/1,
+     /*suitable_for_downsampling_of_at_least=*/2},
+    {/*num_coefficients=*/8, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/0},
+};  // used when cparams.qprogressive_mode is set
 
 uint64_t FrameFlagsFromParams(const CompressParams& cparams) {
   uint64_t flags = 0;
@@ -281,7 +162,17 @@ Status MakeFrameHeader(const CompressParams& cparams,
 
   if (cparams.modular_mode) {
     frame_header->encoding = FrameEncoding::kModular;
-    frame_header->group_size_shift = cparams.modular_group_size_shift;
+    if (cparams.modular_group_size_shift == -1) {
+      frame_header->group_size_shift = 1;
+      // no point using groups when only one group is full and the others are
+      // less than half full: multithreading will not really help much, while
+      // compression does suffer
+      if (ib.xsize() <= 400 && ib.ysize() <= 400) {
+        frame_header->group_size_shift = 2;
+      }
+    } else {
+      frame_header->group_size_shift = cparams.modular_group_size_shift;
+    }
   }
 
   frame_header->chroma_subsampling = ib.chroma_subsampling;
@@ -299,6 +190,13 @@ Status MakeFrameHeader(const CompressParams& cparams,
           "recompressing JPEGs");
     }
   }
+  if (frame_header->color_transform != ColorTransform::kYCbCr &&
+      (frame_header->chroma_subsampling.MaxHShift() != 0 ||
+       frame_header->chroma_subsampling.MaxVShift() != 0)) {
+    return JXL_FAILURE(
+        "Chroma subsampling is not supported when color transform is not "
+        "YCbCr");
+  }
 
   frame_header->flags = FrameFlagsFromParams(cparams);
   // Non-photon noise is not supported in the Modular encoder for now.
@@ -477,6 +375,78 @@ void SimplifyInvisible(Image3F* image, const ImageF& alpha, bool lossless) {
   }
 }
 
+// Worst-case neighbor-difference statistics of an opsin image, used to decide
+// how much to raise the frame header's x_qm_scale and b_qm_scale. Fill it with
+// Calc(), then query the HowMuchIs*Pixelized() levels.
+struct PixelStatsForChromacityAdjustment {
+  float dx = 0;            // max neighbor difference in plane 0
+  float db = 0;            // max neighbor difference of (plane 2 - plane 1)
+  float exposed_blue = 0;  // max gradient-weighted excess of b over 1.2 * y
+  // Largest |difference| between a sample and its left or upper neighbor.
+  // Row 0 and column 0 act only as neighbors, hence both loops start at 1.
+  float CalcPlane(const ImageF* JXL_RESTRICT plane) const {
+    float max_dx = 0;
+    float max_dy = 0;
+    for (size_t y = 1; y < plane->ysize(); ++y) {
+      for (size_t x = 1; x < plane->xsize(); ++x) {
+        const float here = plane->Row(y)[x];
+        const float up = plane->Row(y - 1)[x];
+        const float left = plane->Row(y)[x - 1];
+        max_dx = std::max(max_dx, std::abs(here - left));
+        max_dy = std::max(max_dy, std::abs(here - up));
+      }
+    }
+    return std::max(max_dx, max_dy);
+  }
+  // Sets db and exposed_blue. Loop bounds come from plane_y only.
+  // NOTE(review): plane_b is presumably the same size as plane_y -- confirm.
+  void CalcExposedBlue(const ImageF* JXL_RESTRICT plane_y,
+                       const ImageF* JXL_RESTRICT plane_b) {
+    float max_exposed = 0;
+    float max_dx = 0;
+    float max_dy = 0;
+    for (size_t y = 1; y < plane_y->ysize(); ++y) {
+      for (size_t x = 1; x < plane_y->xsize(); ++x) {
+        const float y_here = plane_y->Row(y)[x];
+        const float b_here = plane_b->Row(y)[x];
+        const float b_up = plane_b->Row(y - 1)[x];
+        const float b_left = plane_b->Row(y)[x - 1];
+        const float by_here = b_here - y_here;
+        const float by_up = b_up - plane_y->Row(y - 1)[x];
+        const float by_left = b_left - plane_y->Row(y)[x - 1];
+        max_dx = std::max(max_dx, std::abs(by_here - by_left));
+        max_dy = std::max(max_dy, std::abs(by_here - by_up));
+        // 1.2 is a double literal, so this difference is evaluated in double.
+        float exposed = b_here - y_here * 1.2;
+        if (exposed >= 0) {
+          exposed *= fabs(b_here - b_left) + fabs(b_here - b_up);
+          max_exposed = std::max(max_exposed, exposed);
+        }
+      }
+    }
+    exposed_blue = max_exposed;
+    db = std::max(max_dx, max_dy);
+  }
+  // Computes dx from plane 0 and db / exposed_blue from planes 1 and 2.
+  void Calc(const Image3F* JXL_RESTRICT opsin) {
+    dx = CalcPlane(&opsin->Plane(0));
+    CalcExposedBlue(&opsin->Plane(1), &opsin->Plane(2));
+  }
+  // Returns 2 if dx >= 0.03, 1 if dx >= 0.017, otherwise 0.
+  int HowMuchIsXChannelPixelized() const {
+    if (dx >= 0.03) return 2;
+    return dx >= 0.017 ? 1 : 0;
+  }
+  // Returns 0 unless db > 0.28; above that the base level is 0, 1 (> 0.33) or
+  // 2 (> 0.38), plus one more when exposed_blue >= 0.13.
+  int HowMuchIsBChannelPixelized() const {
+    const int bonus = exposed_blue >= 0.13 ? 1 : 0;
+    if (db > 0.38) return 2 + bonus;
+    if (db > 0.33) return 1 + bonus;
+    return db > 0.28 ? bonus : 0;
+  }
+};
+
 }  // namespace
 
 class LossyFrameEncoder {
@@ -498,16 +468,18 @@ class LossyFrameEncoder {
                              const JxlCmsInterface& cms, ThreadPool* pool,
                              ModularFrameEncoder* modular_frame_encoder,
                              FrameHeader* frame_header) {
-    PROFILER_ZONE("ComputeEncodingData uninstrumented");
     JXL_ASSERT((opsin->xsize() % kBlockDim) == 0 &&
                (opsin->ysize() % kBlockDim) == 0);
     PassesSharedState& shared = enc_state_->shared;
 
     if (!enc_state_->cparams.max_error_mode) {
-      float x_qm_scale_steps[2] = {1.25f, 9.0f};
+      // Compute chromacity adjustments using two approaches.
+      // 1) Distance based approach for chromacity adjustment:
+      float x_qm_scale_steps[4] = {1.25f, 7.0f, 15.0f, 24.0f};
       shared.frame_header.x_qm_scale = 2;
       for (float x_qm_scale_step : x_qm_scale_steps) {
-        if (enc_state_->cparams.butteraugli_distance > x_qm_scale_step) {
+        if (enc_state_->cparams.original_butteraugli_distance >
+            x_qm_scale_step) {
           shared.frame_header.x_qm_scale++;
         }
       }
@@ -516,6 +488,20 @@ class LossyFrameEncoder {
         // faithful to original even with extreme (5-10x) zooming.
         shared.frame_header.x_qm_scale++;
       }
+      // 2) Pixel-based approach for chromacity adjustment:
+      // look at the individual pixels and make a guess how difficult
+      // the image would be based on the worst case pixel.
+      PixelStatsForChromacityAdjustment pixel_stats;
+      if (enc_state_->cparams.speed_tier <= SpeedTier::kWombat) {
+        pixel_stats.Calc(opsin);
+      }
+      // For X take the most severe adjustment.
+      shared.frame_header.x_qm_scale =
+          std::max<int>(shared.frame_header.x_qm_scale,
+                        2 + pixel_stats.HowMuchIsXChannelPixelized());
+      // B only adjusted by pixel-based approach.
+      shared.frame_header.b_qm_scale =
+          2 + pixel_stats.HowMuchIsBChannelPixelized();
     }
 
     JXL_RETURN_IF_ERROR(enc_state_->heuristics->LossyFrameHeuristics(
@@ -571,7 +557,6 @@ class LossyFrameEncoder {
   Status ComputeJPEGTranscodingData(const jpeg::JPEGData& jpeg_data,
                                     ModularFrameEncoder* modular_frame_encoder,
                                     FrameHeader* frame_header) {
-    PROFILER_ZONE("ComputeJPEGTranscodingData uninstrumented");
     PassesSharedState& shared = enc_state_->shared;
 
     frame_header->x_qm_scale = 2;
@@ -589,9 +574,16 @@ class LossyFrameEncoder {
     shared.ac_strategy.FillDCT8();
     FillImage(uint8_t(0), &shared.epf_sharpness);
 
+    enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses());
+    for (PassesEncoderState::PassData& pass : enc_state_->passes) {
+      pass.ac_tokens.resize(shared.frame_dim.num_groups);
+    }
+
     enc_state_->coeffs.clear();
-    enc_state_->coeffs.emplace_back(make_unique<ACImageT<int32_t>>(
-        kGroupDim * kGroupDim, frame_dim.num_groups));
+    while (enc_state_->coeffs.size() < enc_state_->passes.size()) {
+      enc_state_->coeffs.emplace_back(make_unique<ACImageT<int32_t>>(
+          kGroupDim * kGroupDim, frame_dim.num_groups));
+    }
 
     // convert JPEG quantization table to a Quantizer object
     float dcquantization[3];
@@ -735,9 +727,12 @@ class LossyFrameEncoder {
                                       "FindCorrelation"));
       }
     }
+
     if (!frame_header->chroma_subsampling.Is444()) {
       ZeroFillImage(&dc);
-      enc_state_->coeffs[0]->ZeroFill();
+      for (auto& coeff : enc_state_->coeffs) {
+        coeff->ZeroFill();
+      }
     }
     // JPEG DC is from -1024 to 1023.
     std::vector<size_t> dc_counts[3] = {};
@@ -747,7 +742,9 @@ class LossyFrameEncoder {
     size_t total_dc[3] = {};
     for (size_t c : {1, 0, 2}) {
       if (jpeg_data.components.size() == 1 && c != 1) {
-        enc_state_->coeffs[0]->ZeroFillPlane(c);
+        for (auto& coeff : enc_state_->coeffs) {
+          coeff->ZeroFillPlane(c);
+        }
         ZeroFillImage(&dc.Plane(c));
         // Ensure no division by 0.
         dc_counts[c][1024] = 1;
@@ -761,9 +758,11 @@ class LossyFrameEncoder {
            group_index++) {
         const size_t gx = group_index % frame_dim.xsize_groups;
         const size_t gy = group_index / frame_dim.xsize_groups;
-        size_t offset = 0;
-        int32_t* JXL_RESTRICT ac =
-            enc_state_->coeffs[0]->PlaneRow(c, group_index, 0).ptr32;
+        int32_t* coeffs[kMaxNumPasses];
+        for (size_t i = 0; i < enc_state_->coeffs.size(); i++) {
+          coeffs[i] = enc_state_->coeffs[i]->PlaneRow(c, group_index, 0).ptr32;
+        }
+        int32_t block[64];
         for (size_t by = gy * kGroupDimInBlocks;
              by < ysize_blocks && by < (gy + 1) * kGroupDimInBlocks; ++by) {
           if ((by >> vshift) << vshift != by) continue;
@@ -790,7 +789,7 @@ class LossyFrameEncoder {
                 !frame_header->chroma_subsampling.Is444()) {
               for (size_t y = 0; y < 8; y++) {
                 for (size_t x = 0; x < 8; x++) {
-                  ac[offset + y * 8 + x] = inputjpeg[base + x * 8 + y];
+                  block[y * 8 + x] = inputjpeg[base + x * 8 + y];
                 }
               }
             } else {
@@ -810,11 +809,16 @@ class LossyFrameEncoder {
                                     (1 << (kCFLFixedPointPrecision - 1))) >>
                                    kCFLFixedPointPrecision;
                   int QCR = QChroma - cfl_factor;
-                  ac[offset + y * 8 + x] = QCR;
+                  block[y * 8 + x] = QCR;
                 }
               }
             }
-            offset += 64;
+            enc_state_->progressive_splitter.SplitACCoefficients(
+                block, AcStrategy::FromRawStrategy(AcStrategy::Type::DCT), bx,
+                by, coeffs);
+            for (size_t i = 0; i < enc_state_->coeffs.size(); i++) {
+              coeffs[i] += kDCTBlockSize;
+            }
           }
         }
       }
@@ -875,14 +879,6 @@ class LossyFrameEncoder {
     // Must happen before WriteFrameHeader!
     shared.frame_header.UpdateFlag(true, FrameHeader::kSkipAdaptiveDCSmoothing);
 
-    enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses());
-    for (PassesEncoderState::PassData& pass : enc_state_->passes) {
-      pass.ac_tokens.resize(shared.frame_dim.num_groups);
-    }
-
-    JXL_CHECK(enc_state_->passes.size() ==
-              1);  // skipping coeff splitting so need to have only one pass
-
     ComputeAllCoeffOrders(frame_dim);
     shared.num_histograms = 1;
 
@@ -924,8 +920,9 @@ class LossyFrameEncoder {
   Status EncodeGlobalDCInfo(const FrameHeader& frame_header,
                             BitWriter* writer) const {
     // Encode quantizer DC and global scale.
+    QuantizerParams params = enc_state_->shared.quantizer.GetParams();
     JXL_RETURN_IF_ERROR(
-        enc_state_->shared.quantizer.Encode(writer, kLayerQuant, aux_out_));
+        WriteQuantizerParams(params, writer, kLayerQuant, aux_out_));
     EncodeBlockCtxMap(enc_state_->shared.block_ctx_map, writer, aux_out_);
     ColorCorrelationMapEncodeDC(&enc_state_->shared.cmap, writer, kLayerDC,
                                 aux_out_);
@@ -937,15 +934,12 @@ class LossyFrameEncoder {
     JXL_RETURN_IF_ERROR(DequantMatricesEncode(&enc_state_->shared.matrices,
                                               writer, kLayerQuant, aux_out_,
                                               modular_frame_encoder));
-    if (enc_state_->cparams.speed_tier <= SpeedTier::kTortoise) {
-      if (!doing_jpeg_recompression) ClusterGroups(enc_state_);
-    }
     size_t num_histo_bits =
         CeilLog2Nonzero(enc_state_->shared.frame_dim.num_groups);
     if (num_histo_bits != 0) {
       BitWriter::Allotment allotment(writer, num_histo_bits);
       writer->Write(num_histo_bits, enc_state_->shared.num_histograms - 1);
-      ReclaimAndCharge(writer, &allotment, kLayerAC, aux_out_);
+      allotment.ReclaimAndCharge(writer, kLayerAC, aux_out_);
     }
 
     for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses();
@@ -956,7 +950,7 @@ class LossyFrameEncoder {
           kOrderEnc, enc_state_->used_orders[i], &order_bits));
       BitWriter::Allotment allotment(writer, order_bits);
       JXL_CHECK(U32Coder::Write(kOrderEnc, enc_state_->used_orders[i], writer));
-      ReclaimAndCharge(writer, &allotment, kLayerOrder, aux_out_);
+      allotment.ReclaimAndCharge(writer, kLayerOrder, aux_out_);
       EncodeCoeffOrders(
           enc_state_->used_orders[i],
           &enc_state_->shared
@@ -995,7 +989,6 @@ class LossyFrameEncoder {
 
  private:
   void ComputeAllCoeffOrders(const FrameDimensions& frame_dim) {
-    PROFILER_FUNC;
     // No coefficient reordering in Falcon or faster.
     auto used_orders_info = ComputeUsedOrders(
         enc_state_->cparams.speed_tier, enc_state_->shared.ac_strategy,
@@ -1077,56 +1070,90 @@ Status EncodeFrame(const CompressParams& cparams_orig,
                    const JxlCmsInterface& cms, ThreadPool* pool,
                    BitWriter* writer, AuxOut* aux_out) {
   CompressParams cparams = cparams_orig;
-  if (cparams_orig.target_bitrate > 0.0f &&
-      frame_info.frame_type == FrameType::kRegularFrame) {
-    cparams.target_bitrate = 0.0f;
-    const float target_bitrate = cparams_orig.target_bitrate;
-    float bitrate = 0.0f;
-    float prev_bitrate = 0.0f;
-    float rescale = 1.0f;
-    size_t prev_bits = 0;
-    float error = 0.0f;
-    float best_error = 100.0f;
-    float best_rescale = 1.0f;
-    for (size_t i = 0; i < 10; ++i) {
-      std::unique_ptr<PassesEncoderState> state =
-          jxl::make_unique<PassesEncoderState>();
-      BitWriter bw;
-      JXL_CHECK(EncodeFrame(cparams, frame_info, metadata, ib, state.get(), cms,
-                            pool, &bw, nullptr));
-      bitrate = bw.BitsWritten() * 1.0 / (ib.xsize() * ib.ysize());
-      error = target_bitrate / bitrate - 1.0f;
-      if (std::abs(error) < std::abs(best_error)) {
-        best_error = error;
-        best_rescale = cparams.quant_ac_rescale;
-      }
-      if (bw.BitsWritten() == prev_bits || std::abs(error) < 0.0005f) {
-        break;
-      }
-      float lambda = 1.0f;
-      if (i > 0) {
-        lambda = (((bitrate / prev_bitrate) - 1.0f) / (rescale - 1.0f));
-      }
-      rescale = (1.0f + ((target_bitrate / bitrate) - 1.0f) / lambda);
-      if (rescale < 0.0f) {
-        break;
+  if (cparams.speed_tier == SpeedTier::kGlacier && !cparams.IsLossless()) {
+    cparams.speed_tier = SpeedTier::kTortoise;
+  }
+  if (cparams.speed_tier == SpeedTier::kGlacier) {
+    std::vector<CompressParams> all_params;
+    std::vector<size_t> size;
+
+    CompressParams cparams_attempt = cparams_orig;
+    cparams_attempt.speed_tier = SpeedTier::kTortoise;
+    cparams_attempt.options.max_properties = 4;
+
+    for (float x : {0.0f, 80.f}) {
+      cparams_attempt.channel_colors_percent = x;
+      for (float y : {0.0f, 95.0f}) {
+        cparams_attempt.channel_colors_pre_transform_percent = y;
+        // 70000 ensures that the number of palette colors is representable in
+        // modular headers.
+        for (int K : {0, 1 << 10, 70000}) {
+          cparams_attempt.palette_colors = K;
+          for (int tree_mode : {-1, (int)ModularOptions::TreeMode::kNoWP,
+                                (int)ModularOptions::TreeMode::kDefault}) {
+            if (tree_mode == -1) {
+              // LZ77 only
+              cparams_attempt.options.nb_repeats = 0;
+            } else {
+              cparams_attempt.options.nb_repeats = 1;
+              cparams_attempt.options.wp_tree_mode =
+                  static_cast<ModularOptions::TreeMode>(tree_mode);
+            }
+            for (Predictor pred : {Predictor::Zero, Predictor::Variable}) {
+              cparams_attempt.options.predictor = pred;
+              for (int g : {0, -1, 3}) {
+                cparams_attempt.modular_group_size_shift = g;
+                for (Override patches : {Override::kDefault, Override::kOff}) {
+                  cparams_attempt.patches = patches;
+                  all_params.push_back(cparams_attempt);
+                }
+              }
+            }
+          }
+        }
       }
-      cparams.quant_ac_rescale *= rescale;
-      prev_bitrate = bitrate;
-      prev_bits = bw.BitsWritten();
     }
-    if (aux_out) {
-      aux_out->max_quant_rescale = best_rescale;
-      aux_out->min_quant_rescale = best_rescale;
-      aux_out->min_bitrate_error = best_error;
-      aux_out->max_bitrate_error = best_error;
+
+    size.resize(all_params.size());
+
+    std::atomic<int> num_errors{0};
+
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, all_params.size(), ThreadPool::NoInit,
+        [&](size_t task, size_t) {
+          BitWriter w;
+          PassesEncoderState state;
+          if (!EncodeFrame(all_params[task], frame_info, metadata, ib, &state,
+                           cms, nullptr, &w, aux_out)) {
+            num_errors.fetch_add(1, std::memory_order_relaxed);
+            return;
+          }
+          size[task] = w.BitsWritten();
+        },
+        "Compress kGlacier"));
+    JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0);
+
+    size_t best_idx = 0;
+    for (size_t i = 1; i < all_params.size(); i++) {
+      if (size[best_idx] > size[i]) {
+        best_idx = i;
+      }
     }
-    cparams.quant_ac_rescale = best_rescale;
+    cparams = all_params[best_idx];
   }
+
   ib.VerifyMetadata();
 
   passes_enc_state->special_frames.clear();
 
+  if (cparams.qprogressive_mode) {
+    passes_enc_state->progressive_splitter.SetProgressiveMode(
+        ProgressiveMode{progressive_passes_dc_quant_ac_full_ac});
+  } else if (cparams.progressive_mode) {
+    passes_enc_state->progressive_splitter.SetProgressiveMode(
+        ProgressiveMode{progressive_passes_dc_vlf_lf_full_ac});
+  }
+
   JXL_RETURN_IF_ERROR(ParamsPostInit(&cparams));
 
   if (cparams.progressive_dc < 0) {
@@ -1135,13 +1162,6 @@ Status EncodeFrame(const CompressParams& cparams_orig,
                          cparams.progressive_dc);
     }
     cparams.progressive_dc = 0;
-    // Enable progressive_dc for lower qualities, except for fast speeds where
-    // the modular encoder uses fixed tree.
-    if (cparams.speed_tier <= SpeedTier::kCheetah &&
-        cparams.butteraugli_distance >=
-            kMinButteraugliDistanceForProgressiveDc) {
-      cparams.progressive_dc = 1;
-    }
   }
   if (cparams.ec_resampling < cparams.resampling) {
     cparams.ec_resampling = cparams.resampling;
@@ -1210,7 +1230,7 @@ Status EncodeFrame(const CompressParams& cparams_orig,
   metadata_linear->color_encoding = c_linear;
   ImageBundle linear_storage(metadata_linear.get());
 
-  std::vector<AuxOut> aux_outs;
+  std::vector<std::unique_ptr<AuxOut>> aux_outs;
   // LossyFrameEncoder stores a reference to a std::function<Status(size_t)>
   // so we need to keep the std::function<Status(size_t)> being referenced
   // alive while lossy_frame_encoder is used. We could make resize_aux_outs a
@@ -1218,18 +1238,15 @@ Status EncodeFrame(const CompressParams& cparams_orig,
   // simpler.
   const std::function<Status(size_t)> resize_aux_outs =
       [&aux_outs, aux_out](const size_t num_threads) -> Status {
-    if (aux_out != nullptr) {
-      size_t old_size = aux_outs.size();
-      for (size_t i = num_threads; i < old_size; i++) {
-        aux_out->Assimilate(aux_outs[i]);
-      }
+    if (aux_out == nullptr) {
       aux_outs.resize(num_threads);
-      // Each thread needs these INPUTS. Don't copy the entire AuxOut
-      // because it may contain stats which would be Assimilated multiple
-      // times below.
-      for (size_t i = old_size; i < aux_outs.size(); i++) {
-        aux_outs[i].dump_image = aux_out->dump_image;
-        aux_outs[i].debug_prefix = aux_out->debug_prefix;
+    } else {
+      while (aux_outs.size() > num_threads) {
+        aux_out->Assimilate(*aux_outs.back());
+        aux_outs.pop_back();
+      }
+      while (num_threads > aux_outs.size()) {
+        aux_outs.emplace_back(jxl::make_unique<AuxOut>());
       }
     }
     return true;
@@ -1288,10 +1305,6 @@ Status EncodeFrame(const CompressParams& cparams_orig,
                           ib.alpha(), lossless);
       }
     }
-    if (aux_out != nullptr) {
-      JXL_RETURN_IF_ERROR(
-          aux_out->InspectImage3F("enc_frame:OpsinDynamicsImage", opsin));
-    }
     if (frame_header->encoding == FrameEncoding::kVarDCT) {
       PadImageToBlockMultipleInPlace(&opsin);
       JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeEncodingData(
@@ -1309,9 +1322,11 @@ Status EncodeFrame(const CompressParams& cparams_orig,
   }
   if (cparams.ec_resampling != 1 && !cparams.already_downsampled) {
     extra_channels = &extra_channels_storage;
-    for (size_t i = 0; i < ib.extra_channels().size(); i++) {
-      extra_channels_storage.emplace_back(CopyImage(ib.extra_channels()[i]));
-      DownsampleImage(&extra_channels_storage.back(), cparams.ec_resampling);
+    for (const ImageF& ec : ib.extra_channels()) {
+      ImageF d_ec(ec.xsize(), ec.ysize());
+      CopyImageTo(ec, &d_ec);
+      DownsampleImage(&d_ec, cparams.ec_resampling);
+      extra_channels_storage.emplace_back(std::move(d_ec));
     }
   }
   // needs to happen *AFTER* VarDCT-ComputeEncodingData.
@@ -1388,13 +1403,13 @@ Status EncodeFrame(const CompressParams& cparams_orig,
 
   const auto process_dc_group = [&](const uint32_t group_index,
                                     const size_t thread) {
-    AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr;
+    AuxOut* my_aux_out = aux_outs[thread].get();
     BitWriter* output = get_output(group_index + 1);
     if (frame_header->encoding == FrameEncoding::kVarDCT &&
         !(frame_header->flags & FrameHeader::kUseDcFrame)) {
       BitWriter::Allotment allotment(output, 2);
       output->Write(2, modular_frame_encoder->extra_dc_precision[group_index]);
-      ReclaimAndCharge(output, &allotment, kLayerDC, my_aux_out);
+      allotment.ReclaimAndCharge(output, kLayerDC, my_aux_out);
       JXL_CHECK(modular_frame_encoder->EncodeStream(
           output, my_aux_out, kLayerDC,
           ModularStreamId::VarDCTDC(group_index)));
@@ -1410,7 +1425,7 @@ Status EncodeFrame(const CompressParams& cparams_orig,
         BitWriter::Allotment allotment(output, nb_bits);
         output->Write(nb_bits,
                       modular_frame_encoder->ac_metadata_size[group_index] - 1);
-        ReclaimAndCharge(output, &allotment, kLayerControlFields, my_aux_out);
+        allotment.ReclaimAndCharge(output, kLayerControlFields, my_aux_out);
       }
       JXL_CHECK(modular_frame_encoder->EncodeStream(
           output, my_aux_out, kLayerControlFields,
@@ -1429,7 +1444,7 @@ Status EncodeFrame(const CompressParams& cparams_orig,
   std::atomic<int> num_errors{0};
   const auto process_group = [&](const uint32_t group_index,
                                  const size_t thread) {
-    AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr;
+    AuxOut* my_aux_out = aux_outs[thread].get();
 
     for (size_t i = 0; i < num_passes; i++) {
       if (frame_header->encoding == FrameEncoding::kVarDCT) {
@@ -1458,7 +1473,7 @@ Status EncodeFrame(const CompressParams& cparams_orig,
   for (BitWriter& bw : group_codes) {
     BitWriter::Allotment allotment(&bw, 8);
     bw.ZeroPadToByte();  // end of group.
-    ReclaimAndCharge(&bw, &allotment, kLayerAC, aux_out);
+    allotment.ReclaimAndCharge(&bw, kLayerAC, aux_out);
   }
 
   std::vector<coeff_order_t>* permutation_ptr = nullptr;