1 files changed, 2135 insertions, 0 deletions
diff --git a/src/pdfio1.c b/src/pdfio1.c
new file mode 100644
index 0000000..ac3f553
--- /dev/null
+++ b/src/pdfio1.c
@@ -0,0 +1,2135 @@
+/*====================================================================*
+ -  Copyright (C) 2001 Leptonica.  All rights reserved.
+ -
+ -  Redistribution and use in source and binary forms, with or without
+ -  modification, are permitted provided that the following conditions
+ -  are met:
+ -  1. Redistributions of source code must retain the above copyright
+ -     notice, this list of conditions and the following disclaimer.
+ -  2. Redistributions in binary form must reproduce the above
+ -     copyright notice, this list of conditions and the following
+ -     disclaimer in the documentation and/or other materials
+ -     provided with the distribution.
+ -
+ -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
+ -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *====================================================================*/
+
+/*
+ *  pdfio1.c
+ *
+ *    Higher-level operations for generating pdf.
+ *
+ *    |=============================================================|
+ *    |                         Important note                      |
+ *    |=============================================================|
+ *    | Some of these functions require libtiff, libjpeg, and libz  |
+ *    | If you do not have these libraries, you must set            |
+ *    |      #define  USE_PDFIO     0                               |
+ *    | in environ.h.  This will link pdfiostub.c                   |
+ *    |=============================================================|
+ *
+ *     Set 1. These functions convert a set of image files
+ *     to a multi-page pdf file, with one image on each page.
+ *     All images are rendered at the same (input) resolution.
+ *     The images can be specified as being in a directory, or they
+ *     can be in an sarray.  The output pdf can be either a file
+ *     or an array of bytes in memory.
+ *
+ *     Set 2. These functions are a special case of set 1, where
+ *     no scaling or change in quality is required.  For jpeg and
+ *     jp2k images, the bytes in each jpeg file can be directly
+ *     incorporated into the output pdf, and the wrapping up of
+ *     multiple image files is very fast.  For non-interlaced png,
+ *     the data bytes including the predictors can also be written
+ *     directly into the flate pdf data.  For other image formats,
+ *     transcoding is required, where the image data is first
+ *     decompressed and then the G4 or Flate (gzip) encodings are generated.
+ *
+ *     Set 3. These functions convert a set of images in memory
+ *     to a multi-page pdf, with one image on each page.  The pdf
+ *     output can be either a file or an array of bytes in memory.
+ *
+ *     Set 4. These functions implement a pdf output "device driver"
+ *     for wrapping (encoding) any number of images on a single page
+ *     in pdf.  The input can be either an image file or a Pix;
+ *     the pdf output can be either a file or an array of bytes in memory.
+ *
+ *     Set 5. These "segmented" functions take a set of image
+ *     files, along with optional segmentation information, and
+ *     generate a multi-page pdf file, where each page consists
+ *     in general of a mixed raster pdf of image and non-image regions.
+ *     The segmentation information for each page can be input as
+ *     either a mask over the image parts, or as a Boxa of those
+ *     regions.
+ *
+ *     Set 6. These "segmented" functions convert an image and
+ *     an optional Boxa of image regions into a mixed raster pdf file
+ *     for the page.  The input image can be either a file or a Pix.
+ *
+ *     Set 7. These functions take a set of single-page pdf files
+ *     and concatenate them into a multi-page pdf.
+ *     The input can be a set of single page pdf files, or of
+ *     pdf 'strings' in memory.  The output can be either a file or
+ *     an array of bytes in memory.
+ *
+ *     The images in the pdf file can be rendered using a pdf viewer,
+ *     such as gv, evince, xpdf or acroread.
+ *
+ *     Reference on the pdf file format:
+ *         http://www.adobe.com/devnet/pdf/pdf_reference_archive.html
+ *
+ *     1. Convert specified image files to pdf (one image file per page)
+ *          l_int32             convertFilesToPdf()
+ *          l_int32             saConvertFilesToPdf()
+ *          l_int32             saConvertFilesToPdfData()
+ *          l_int32             selectDefaultPdfEncoding()
+ *
+ *     2. Convert specified image files to pdf without scaling
+ *          l_int32             convertUnscaledFilesToPdf()
+ *          l_int32             saConvertUnscaledFilesToPdf()
+ *          l_int32             saConvertUnscaledFilesToPdfData()
+ *          l_int32             convertUnscaledToPdfData()
+ *
+ *     3. Convert multiple images to pdf (one image per page)
+ *          l_int32             pixaConvertToPdf()
+ *          l_int32             pixaConvertToPdfData()
+ *
+ *     4. Single page, multi-image converters
+ *          l_int32             convertToPdf()
+ *          l_int32             convertImageDataToPdf()
+ *          l_int32             convertToPdfData()
+ *          l_int32             convertImageDataToPdfData()
+ *          l_int32             pixConvertToPdf()
+ *          l_int32             pixWriteStreamPdf()
+ *          l_int32             pixWriteMemPdf()
+ *
+ *     5. Segmented multi-page, multi-image converter
+ *          l_int32             convertSegmentedFilesToPdf()
+ *          BOXAA              *convertNumberedMasksToBoxaa()
+ *
+ *     6. Segmented single page, multi-image converters
+ *          l_int32             convertToPdfSegmented()
+ *          l_int32             pixConvertToPdfSegmented()
+ *          l_int32             convertToPdfDataSegmented()
+ *          l_int32             pixConvertToPdfDataSegmented()
+ *
+ *     7. Multipage concatenation
+ *          l_int32             concatenatePdf()
+ *          l_int32             saConcatenatePdf()
+ *          l_int32             ptraConcatenatePdf()
+ *          l_int32             concatenatePdfToData()
+ *          l_int32             saConcatenatePdfToData()
+ *
+ *     The top-level multi-image functions can be visualized as follows:
+ *          Output pdf data to file:
+ *             convertToPdf()  and  convertImageDataToPdf()
+ *                     --> pixConvertToPdf()
+ *                           --> pixConvertToPdfData()
+ *
+ *          Output pdf data to array in memory:
+ *             convertToPdfData()  and  convertImageDataToPdfData()
+ *                     --> pixConvertToPdfData()
+ *
+ *     The top-level segmented image functions can be visualized as follows:
+ *          Output pdf data to file:
+ *             convertToPdfSegmented()
+ *                     --> pixConvertToPdfSegmented()
+ *                           --> pixConvertToPdfDataSegmented()
+ *
+ *          Output pdf data to array in memory:
+ *             convertToPdfDataSegmented()
+ *                     --> pixConvertToPdfDataSegmented()
+ *
+ *     For multi-page concatenation, there are three different types of input
+ *        (1) directory and optional filename filter
+ *        (2) sarray of filenames
+ *        (3) ptra of byte arrays of pdf data
+ *     and two types of output for the concatenated pdf data
+ *        (1) filename
+ *        (2) data array and size
+ *     High-level interfaces are given for each of the six combinations.
+ *
+ *     Note: When wrapping small images into pdf, it is useful to give
+ *     them a relatively low resolution value, to avoid rounding errors
+ *     when rendering the images.  For example, if you want an image
+ *     of width w pixels to be 5 inches wide on a screen, choose a
+ *     resolution w/5.
+ *
+ *     The very fast functions in section (2) require neither transcoding
+ *     nor parsing of the compressed jpeg file.  With three types of image
+ *     compression, the compressed strings can be incorporated into
+ *     the pdf data without decompression and re-encoding: jpeg, jp2k
+ *     and png.  The DCTDecode and JPXDecode filters can handle the
+ *     entire jpeg and jp2k encoded string as a byte array in the pdf file.
+ *     The FlateDecode filter can handle the png compressed image data,
+ *     including predictors that occur as the first byte in each
+ *     raster line, but it is necessary to store only the png IDAT chunk
+ *     data in the pdf array.  The alternative for wrapping png images
+ *     is to uncompress into a raster (a pix) and then gzip the raster data.
+ *     This typically results in a larger pdf file, because it doesn't
+ *     use the two-dimensional png predictor.  Colormaps, which are found
+ *     in png PLTE chunks, must always be pulled out and included separately
+ *     in the pdf.  For CCITT-G4 compression, you can not simply
+ *     include a tiff G4 file -- you must either parse it and extract the
+ *     G4 compressed data within it, or uncompress to a raster and
+ *     G4 compress again.
+ */
+
+#include <string.h>
+#include <math.h>
+#include "allheaders.h"
+
+/* --------------------------------------------*/
+#if  USE_PDFIO   /* defined in environ.h */
+ /* --------------------------------------------*/
+
+    /* Typical scan resolution, in ppi (pixels per inch) */
+static const l_int32  DEFAULT_INPUT_RES = 300;
+
+
+/*---------------------------------------------------------------------*
+ *    Convert specified image files to pdf (one image file per page)   *
+ *---------------------------------------------------------------------*/
+/*!
+ *  convertFilesToPdf()
+ *
+ *      Input:  dirname (directory containing the image files)
+ *              substr (<optional> filename filter; NULL to accept all files)
+ *              res (input resolution assumed for every image)
+ *              scalefactor (scaling applied to each image; > 0.0)
+ *              type (L_JPEG_ENCODE, L_G4_ENCODE or L_FLATE_ENCODE;
+ *                    use 0 to choose the encoding per image)
+ *              quality (JPEG only; 0 selects the default (75))
+ *              title (<optional> pdf title; if NULL, it is taken from
+ *                     the first image filename)
+ *              fileout (pdf file that receives all the images)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) With a non-NULL @substr, only files whose names contain
+ *          that substring are used; with NULL, every file in the
+ *          directory is used.
+ *      (2) After the optional filtering, the pathnames are sorted
+ *          lexically in increasing order before concatenation.
+ *      (3) Each image is scaled by @scalefactor before it is encoded.
+ *          A value <= 0.0 is replaced by 1.0.
+ *      (4) A nonzero @type forces that encoding on every image.
+ *          With 0, the encoding is chosen for each image from its
+ *          depth and whether or not it has a colormap.
+ *      (5) The work is done by saConvertFilesToPdf().
+ */
+l_int32
+convertFilesToPdf(const char  *dirname,
+                  const char  *substr,
+                  l_int32      res,
+                  l_float32    scalefactor,
+                  l_int32      type,
+                  l_int32      quality,
+                  const char  *title,
+                  const char  *fileout)
+{
+l_int32  status;
+SARRAY  *sapaths;
+
+    PROCNAME("convertFilesToPdf");
+
+    if (!dirname)
+        return ERROR_INT("dirname not defined", procName, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    sapaths = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (!sapaths)
+        return ERROR_INT("sa not made", procName, 1);
+    status = saConvertFilesToPdf(sapaths, res, scalefactor, type, quality,
+                                 title, fileout);
+    sarrayDestroy(&sapaths);
+    return status;
+}
+
+
+/*!
+ *  saConvertFilesToPdf()
+ *
+ *      Input:  sarray (of pathnames for images)
+ *              res (input resolution of all images)
+ *              scalefactor (scaling factor applied to each image; > 0.0)
+ *              type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE; 0 = default)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              title (<optional> pdf title; NULL: use first image filename)
+ *              fileout (pdf file of all images)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) See convertFilesToPdf().
+ *      (2) @fileout is validated up front, before any encoding is done.
+ */
+l_int32
+saConvertFilesToPdf(SARRAY      *sa,
+                    l_int32      res,
+                    l_float32    scalefactor,
+                    l_int32      type,
+                    l_int32      quality,
+                    const char  *title,
+                    const char  *fileout)
+{
+l_uint8  *data = NULL;
+l_int32   ret;
+size_t    nbytes = 0;
+
+    PROCNAME("saConvertFilesToPdf");
+
+    if (!sa)
+        return ERROR_INT("sa not defined", procName, 1);
+    if (!fileout)  /* reject now, rather than after encoding every page */
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    ret = saConvertFilesToPdfData(sa, res, scalefactor, type, quality,
+                                  title, &data, &nbytes);
+    if (ret) {
+        if (data) LEPT_FREE(data);
+        return ERROR_INT("pdf data not made", procName, 1);
+    }
+    ret = l_binaryWrite(fileout, "w", data, nbytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", procName);
+    return ret;
+}
+
+
+/*!
+ *  saConvertFilesToPdfData()
+ *
+ *      Input:  sarray (of pathnames for images)
+ *              res (input resolution of all images)
+ *              scalefactor (scaling factor applied to each image; > 0.0)
+ *              type (L_JPEG_ENCODE, L_G4_ENCODE, L_FLATE_ENCODE; 0 = default)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              title (<optional> pdf title; NULL: use first image filename)
+ *              &data (<return> output pdf data (of all images))
+ *              &nbytes (<return> size of output pdf data)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) See convertFilesToPdf().
+ *      (2) Files that cannot be read or encoded are skipped, with an
+ *          error message; it fails if no page at all can be made.
+ *      (3) The caller owns *pdata and frees it with LEPT_FREE().
+ */
+l_int32
+saConvertFilesToPdfData(SARRAY      *sa,
+                        l_int32      res,
+                        l_float32    scalefactor,
+                        l_int32      type,
+                        l_int32      quality,
+                        const char  *title,
+                        l_uint8    **pdata,
+                        size_t      *pnbytes)
+{
+char        *fname;
+const char  *pdftitle;
+l_uint8     *imdata;
+l_int32      i, n, ret, pagetype, npages, scaledres;
+size_t       imbytes;
+L_BYTEA     *ba;
+PIX         *pixs, *pix;
+L_PTRA      *pa_data;
+
+    PROCNAME("saConvertFilesToPdfData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!sa)
+        return ERROR_INT("sa not defined", procName, 1);
+    if (scalefactor <= 0.0) scalefactor = 1.0;
+    if (type < 0 || type > L_FLATE_ENCODE) {
+        L_WARNING("invalid compression type; using per-page default\n",
+                  procName);
+        type = 0;
+    }
+
+        /* Generate all the encoded pdf strings */
+    n = sarrayGetCount(sa);
+    pa_data = ptraCreate(n);
+    pdftitle = NULL;
+    scaledres = (l_int32)(res * scalefactor);  /* same for every page */
+    for (i = 0; i < n; i++) {
+        if (i && (i % 10 == 0)) fprintf(stderr, ".. %d ", i);
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+        if ((pixs = pixRead(fname)) == NULL) {
+            L_ERROR("image not readable from file %s\n", procName, fname);
+            continue;
+        }
+        if (!pdftitle)  /* title defaults to the first readable filename */
+            pdftitle = (title) ? title : fname;
+        pix = (scalefactor != 1.0) ? pixScale(pixs, scalefactor, scalefactor)
+                                   : pixClone(pixs);
+        pixDestroy(&pixs);  /* only pix is needed from here on */
+        if (type != 0) {
+            pagetype = type;
+        } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
+            L_ERROR("encoding type selection failed for file %s\n",
+                    procName, fname);
+            pixDestroy(&pix);  /* don't leak the image on this skip path */
+            continue;
+        }
+        ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
+                                  0, 0, scaledres, pdftitle, NULL, 0);
+        pixDestroy(&pix);
+        if (ret) {
+            L_ERROR("pdf encoding failed for %s\n", procName, fname);
+            continue;
+        }
+        ba = l_byteaInitFromMem(imdata, imbytes);
+        if (imdata) LEPT_FREE(imdata);
+        ptraAdd(pa_data, ba);
+    }
+    ptraGetActualCount(pa_data, &npages);
+    if (npages == 0) {
+        L_ERROR("no pdf files made\n", procName);
+        ptraDestroy(&pa_data, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Concatenate them */
+    fprintf(stderr, "\nconcatenating ... ");
+    ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
+    fprintf(stderr, "done\n");
+
+    ptraGetActualCount(pa_data, &npages);  /* recalculate in case it changes */
+    for (i = 0; i < npages; i++) {
+        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
+        l_byteaDestroy(&ba);
+    }
+    ptraDestroy(&pa_data, FALSE, FALSE);
+    return ret;
+}
+
+
+/*!
+ *  selectDefaultPdfEncoding()
+ *
+ *      Input:  pix
+ *              &type (<return> L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ *      Return: 0 if OK, 1 on error (e.g., for an unsupported depth)
+ *
+ *  Notes:
+ *      (1) This attempts to choose an encoding for the pix that results
+ *          in the smallest file, assuming that if jpeg encoded, it will
+ *          use quality = 75.  The decision is approximate, in that
+ *          (a) all colormapped images will be losslessly encoded with
+ *          gzip (flate), and (b) an image with fewer than about 20 colors
+ *          is likely to be smaller if flate encoded than as jpeg (dct).
+ *          For example, an image made by pixScaleToGray3() will have 10
+ *          colors, and flate gives about twice the compression of jpeg.
+ */
+l_int32
+selectDefaultPdfEncoding(PIX      *pix,
+                         l_int32  *ptype)
+{
+l_int32   w, h, d, factor, ncolors = 0;
+PIXCMAP  *cmap;
+
+    PROCNAME("selectDefaultPdfEncoding");
+
+    if (!pix)
+        return ERROR_INT("pix not defined", procName, 1);
+    if (!ptype)
+        return ERROR_INT("&type not defined", procName, 1);
+    *ptype = L_FLATE_ENCODE;  /* default universal encoding */
+    pixGetDimensions(pix, &w, &h, &d);
+    cmap = pixGetColormap(pix);
+    if (d == 8 && !cmap) {
+            /* factor scales with sqrt(area); presumably a subsampling
+             * factor.  Use floating point: w * h can overflow l_int32. */
+        factor = L_MAX(1, (l_int32)sqrt((l_float64)w * h / 20000.));
+        pixNumColors(pix, factor, &ncolors);
+        if (ncolors >= 20)  /* otherwise keep the flate default */
+            *ptype = L_JPEG_ENCODE;
+    } else if (d == 1) {
+        *ptype = L_G4_ENCODE;
+    } else if (cmap || d == 2 || d == 4) {
+        *ptype = L_FLATE_ENCODE;
+    } else if (d == 32) {  /* 8 bpp is fully handled by the cases above */
+        *ptype = L_JPEG_ENCODE;
+    } else {
+        return ERROR_INT("type selection failure", procName, 1);
+    }
+
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ *          Convert specified image files to pdf without scaling       *
+ *---------------------------------------------------------------------*/
+/*!
+ *  convertUnscaledFilesToPdf()
+ *
+ *      Input:  dirname (directory containing the image files)
+ *              substr (<optional> filename filter; NULL to accept all files)
+ *              title (<optional> pdf title; if NULL, it is taken from
+ *                     the first image filename)
+ *              fileout (pdf file that receives all the images)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) With a non-NULL @substr, only files whose names contain
+ *          that substring are used; with NULL, every file in the
+ *          directory is used.
+ *      (2) After the optional filtering, the pathnames are sorted
+ *          lexically in increasing order before concatenation.
+ *      (3) This is very fast for jpeg and jp2k, where the compressed
+ *          data is wrapped up and concatenated.  For png and tiffg4,
+ *          the images must be read and recompressed.
+ */
+l_int32
+convertUnscaledFilesToPdf(const char  *dirname,
+                          const char  *substr,
+                          const char  *title,
+                          const char  *fileout)
+{
+l_int32  status;
+SARRAY  *sapaths;
+
+    PROCNAME("convertUnscaledFilesToPdf");
+
+    if (!dirname)
+        return ERROR_INT("dirname not defined", procName, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    sapaths = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (!sapaths)
+        return ERROR_INT("sa not made", procName, 1);
+    status = saConvertUnscaledFilesToPdf(sapaths, title, fileout);
+    sarrayDestroy(&sapaths);
+    return status;
+}
+
+
+/*!
+ *  saConvertUnscaledFilesToPdf()
+ *
+ *      Input:  sarray (of pathnames for images)
+ *              title (<optional> pdf title; NULL: use first image filename)
+ *              fileout (pdf file of all images)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) See convertUnscaledFilesToPdf(); @fileout is checked up front.
+ */
+l_int32
+saConvertUnscaledFilesToPdf(SARRAY      *sa,
+                            const char  *title,
+                            const char  *fileout)
+{
+l_uint8  *data = NULL;
+l_int32   ret;
+size_t    nbytes = 0;
+
+    PROCNAME("saConvertUnscaledFilesToPdf");
+
+    if (!sa)
+        return ERROR_INT("sa not defined", procName, 1);
+    if (!fileout)  /* reject now, rather than after wrapping every file */
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    ret = saConvertUnscaledFilesToPdfData(sa, title, &data, &nbytes);
+    if (ret) {
+        if (data) LEPT_FREE(data);
+        return ERROR_INT("pdf data not made", procName, 1);
+    }
+    ret = l_binaryWrite(fileout, "w", data, nbytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", procName);
+    return ret;
+}
+
+
+/*!
+ *  saConvertUnscaledFilesToPdfData()
+ *
+ *      Input:  sarray (pathnames of the image files)
+ *              title (<optional> pdf title; if NULL, it is taken from
+ *                     the first image filename)
+ *              &data (<return> pdf data holding all the images)
+ *              &nbytes (<return> number of bytes in the pdf data)
+ *      Return: 0 if OK, 1 on error
+ */
+l_int32
+saConvertUnscaledFilesToPdfData(SARRAY      *sa,
+                                const char  *title,
+                                l_uint8    **pdata,
+                                size_t      *pnbytes)
+{
+char     *path;
+l_uint8  *pagedata;
+l_int32   index, nfiles, status, npages;
+size_t    pagesize;
+L_BYTEA  *bapage;
+L_PTRA   *papages;
+
+    PROCNAME("saConvertUnscaledFilesToPdfData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!sa)
+        return ERROR_INT("sa not defined", procName, 1);
+
+        /* Wrap each file as a single-page pdf held in a byte array */
+    nfiles = sarrayGetCount(sa);
+    papages = ptraCreate(nfiles);
+    for (index = 0; index < nfiles; index++) {
+        if (index > 0 && index % 10 == 0)
+            fprintf(stderr, ".. %d ", index);
+        path = sarrayGetString(sa, index, L_NOCOPY);
+
+            /* Files that cannot be wrapped are simply skipped */
+        if (convertUnscaledToPdfData(path, title, &pagedata, &pagesize) != 0)
+            continue;
+        bapage = l_byteaInitFromMem(pagedata, pagesize);
+        if (pagedata) LEPT_FREE(pagedata);
+        ptraAdd(papages, bapage);
+    }
+    ptraGetActualCount(papages, &npages);
+    if (npages == 0) {
+        L_ERROR("no pdf files made\n", procName);
+        ptraDestroy(&papages, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Join the single-page pdfs into one multipage pdf */
+    fprintf(stderr, "\nconcatenating ... ");
+    status = ptraConcatenatePdfToData(papages, NULL, pdata, pnbytes);
+    fprintf(stderr, "done\n");
+
+        /* Release the per-page byte arrays.  The count is fetched
+         * again in case it changed during concatenation. */
+    ptraGetActualCount(papages, &npages);
+    for (index = 0; index < npages; index++) {
+        bapage = (L_BYTEA *)ptraRemove(papages, index, L_NO_COMPACTION);
+        l_byteaDestroy(&bapage);
+    }
+    ptraDestroy(&papages, FALSE, FALSE);
+    return status;
+}
+
+
+/*!
+ *  convertUnscaledToPdfData()
+ *
+ *      Input:  fname (of image file)
+ *              title (<optional> pdf title; if NULL, the tail of @fname)
+ *              &data (<return> output pdf data for image)
+ *              &nbytes (<return> size of output pdf data)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) Returns 1 for files of unknown, ps or pdf format (skipped).
+ */
+l_int32
+convertUnscaledToPdfData(const char  *fname,
+                         const char  *title,
+                         l_uint8    **pdata,
+                         size_t      *pnbytes)
+{
+const char   *pdftitle = NULL;
+char         *tail = NULL;
+l_int32       format = IFF_UNKNOWN, ret;
+L_COMP_DATA  *cid = NULL;
+
+    PROCNAME("convertUnscaledToPdfData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!fname)
+        return ERROR_INT("fname not defined", procName, 1);
+
+    findFileFormat(fname, &format);
+    if (format == IFF_UNKNOWN) {
+        L_WARNING("file %s format is unknown; skip\n", procName, fname);
+        return 1;
+    }
+    if (format == IFF_PS || format == IFF_LPDF) {
+        L_WARNING("file %s format is %d; skip\n", procName, fname, format);
+        return 1;
+    }
+
+        /* Generate the image data required for pdf generation, always
+         * in binary (not ascii85) coding; jpeg files are never transcoded.  */
+    l_generateCIDataForPdf(fname, NULL, 0, &cid);
+    if (!cid) {
+        L_ERROR("file %s format is %d; unreadable\n", procName, fname, format);
+        return 1;
+    }
+
+        /* If @title == NULL, use the tail of @fname. */
+    if (!title)
+        splitPathAtDirectory(fname, NULL, &tail);
+    pdftitle = (title) ? title : tail;
+
+        /* Generate the pdf string for this page (image); this destroys
+         * the cid.  A failure is passed back to the caller. */
+    ret = cidConvertToPdfData(cid, pdftitle, pdata, pnbytes);
+    LEPT_FREE(tail);
+    return ret;
+}
+
+
+/*---------------------------------------------------------------------*
+ *          Convert multiple images to pdf (one image per page)        *
+ *---------------------------------------------------------------------*/
+/*!
+ *  pixaConvertToPdf()
+ *
+ *      Input:  pixa (containing images all at the same resolution)
+ *              res (override the resolution of each input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              scalefactor (scaling factor applied to each image; > 0.0)
+ *              type (encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ *                    L_FLATE_ENCODE, or 0 for default)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              title (<optional> pdf title)
+ *              fileout (pdf file of all images)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) With @type == 0: G4 for 1 bpp; JPEG for 32 bpp, or for 8 bpp
+ *          without colormap and with many colors; FLATE otherwise.
+ *      (2) The scalefactor must be > 0.0; otherwise it is set to 1.0.
+ *      (3) Specifying one of the three encoding types for @type forces
+ *          all images to be compressed with that type.  Use 0 to have
+ *          the type determined for each image based on depth and whether
+ *          or not it has a colormap.
+ */
+l_int32
+pixaConvertToPdf(PIXA        *pixa,
+                 l_int32      res,
+                 l_float32    scalefactor,
+                 l_int32      type,
+                 l_int32      quality,
+                 const char  *title,
+                 const char  *fileout)
+{
+l_uint8  *data = NULL;
+l_int32   ret;
+size_t    nbytes = 0;
+
+    PROCNAME("pixaConvertToPdf");
+
+    if (!pixa)
+        return ERROR_INT("pixa not defined", procName, 1);
+    if (!fileout)  /* reject now, rather than after encoding every page */
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    ret = pixaConvertToPdfData(pixa, res, scalefactor, type, quality,
+                               title, &data, &nbytes);
+    if (ret) {
+        if (data) LEPT_FREE(data);
+        return ERROR_INT("conversion to pdf failed", procName, 1);
+    }
+    ret = l_binaryWrite(fileout, "w", data, nbytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", procName);
+    return ret;
+}
+
+
+/*!
+ *  pixaConvertToPdfData()
+ *
+ *      Input:  pixa (images, all at the same resolution)
+ *              res (input resolution of every image)
+ *              scalefactor (scaling applied to each image; > 0.0)
+ *              type (L_JPEG_ENCODE, L_G4_ENCODE or L_FLATE_ENCODE;
+ *                    use 0 to choose the encoding per image)
+ *              quality (JPEG only; 0 selects the default (75))
+ *              title (<optional> pdf title)
+ *              &data (<return> pdf data holding all the images)
+ *              &nbytes (<return> number of bytes in the pdf data)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) See pixaConvertToPdf().
+ */
+l_int32
+pixaConvertToPdfData(PIXA        *pixa,
+                     l_int32      res,
+                     l_float32    scalefactor,
+                     l_int32      type,
+                     l_int32      quality,
+                     const char  *title,
+                     l_uint8    **pdata,
+                     size_t      *pnbytes)
+{
+l_uint8  *pagedata;
+l_int32   index, count, status, pageres, encoding;
+size_t    pagesize;
+L_BYTEA  *bapage;
+PIX      *pix1, *pix2;
+L_PTRA   *papages;
+
+    PROCNAME("pixaConvertToPdfData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!pixa)
+        return ERROR_INT("pixa not defined", procName, 1);
+    if (scalefactor <= 0.0) scalefactor = 1.0;
+    if (type < 0 || type > L_FLATE_ENCODE) {
+        L_WARNING("invalid compression type; using per-page default\n",
+                  procName);
+        type = 0;
+    }
+
+        /* Encode each pix as a single-page pdf held in a byte array */
+    count = pixaGetCount(pixa);
+    papages = ptraCreate(count);
+    for (index = 0; index < count; index++) {
+        if ((pix1 = pixaGetPix(pixa, index, L_CLONE)) == NULL) {
+            L_ERROR("pix[%d] not retrieved\n", procName, index);
+            continue;
+        }
+        pix2 = (scalefactor != 1.0) ? pixScale(pix1, scalefactor, scalefactor)
+                                    : pixClone(pix1);
+        pixDestroy(&pix1);
+        pageres = (l_int32)(res * scalefactor);
+        encoding = type;  /* a nonzero @type is used as is */
+        if (type == 0 && selectDefaultPdfEncoding(pix2, &encoding) != 0) {
+            L_ERROR("encoding type selection failed for pix[%d]\n",
+                        procName, index);
+            pixDestroy(&pix2);
+            continue;
+        }
+        status = pixConvertToPdfData(pix2, encoding, quality,
+                                     &pagedata, &pagesize, 0, 0, pageres,
+                                     title, NULL, 0);
+        pixDestroy(&pix2);
+        if (status) {
+            L_ERROR("pdf encoding failed for pix[%d]\n", procName, index);
+            continue;
+        }
+        bapage = l_byteaInitFromMem(pagedata, pagesize);
+        if (pagedata) LEPT_FREE(pagedata);
+        ptraAdd(papages, bapage);
+    }
+    ptraGetActualCount(papages, &count);
+    if (count == 0) {
+        L_ERROR("no pdf files made\n", procName);
+        ptraDestroy(&papages, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Join the single-page pdfs into one multipage pdf */
+    status = ptraConcatenatePdfToData(papages, NULL, pdata, pnbytes);
+
+        /* Release the per-page byte arrays.  The count is fetched
+         * again in case it changed during concatenation. */
+    ptraGetActualCount(papages, &count);
+    for (index = 0; index < count; index++) {
+        bapage = (L_BYTEA *)ptraRemove(papages, index, L_NO_COMPACTION);
+        l_byteaDestroy(&bapage);
+    }
+    ptraDestroy(&papages, FALSE, FALSE);
+    return status;
+}
+
+
+/*---------------------------------------------------------------------*
+ *                Single page, multi-image converters                  *
+ *---------------------------------------------------------------------*/
+/*!
+ *  convertToPdf()
+ *
+ *      Input:  filein (input image file -- any format)
+ *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              fileout (output pdf file; only required on last image on page)
+ *              x, y (location of lower-left corner of image, in pixels,
+ *                    relative to the PostScript origin (0,0) at
+ *                    the lower-left corner of the page)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title; if null, taken from filein)
+ *              &lpd (ptr to lpd, which is created on the first invocation
+ *                    and returned until last image is processed, at which
+ *                    time it is destroyed)
+ *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                       L_LAST_IMAGE)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) To wrap only one image in pdf, input @plpd = NULL, and
+ *          the value of @position will be ignored:
+ *            convertToPdf(..., x, y, res, title, NULL, 0);
+ *      (2) To wrap multiple images on a single pdf page, this is called
+ *          once for each successive image.  Do it this way:
+ *            L_PDF_DATA   *lpd;
+ *            convertToPdf(..., x, y, res, title, &lpd, L_FIRST_IMAGE);
+ *            convertToPdf(..., x, y, res, title, &lpd, L_NEXT_IMAGE);
+ *            ...
+ *            convertToPdf(..., x, y, res, title, &lpd, L_LAST_IMAGE);
+ *          The result is written to the value of @fileout given in the
+ *          last call; @fileout is not used by the earlier calls.
+ *          On the last call: the pdf data bytes are computed and written
+ *          to @fileout, lpd is destroyed internally, and the returned
+ *          value of lpd is null.  So the client has nothing to clean up.
+ *      (3) (a) Set @res == 0 to respect the resolution embedded in the
+ *              image file.  If no resolution is embedded, it will be set
+ *              to the default value.
+ *          (b) Set @res to some other value to override the file resolution.
+ *      (4) (a) If the input @res and the resolution of the output device
+ *              are equal, the image will be "displayed" at the same size
+ *              as the original.
+ *          (b) If the input @res is 72, the output device will render
+ *              the image at 1 pt/pixel.
+ *          (c) Some possible choices for the default input pix resolution are:
+ *                 72 ppi     Render pix on any output device at one pt/pixel
+ *                 96 ppi     Windows default for generated display images
+ *                300 ppi     Typical default for scanned images.
+ *              We choose 300, which is sensible for rendering page images.
+ *              However,  images come from a variety of sources, and
+ *              some are explicitly created for viewing on a display.
+ */
+l_int32
+convertToPdf(const char   *filein,
+             l_int32       type,
+             l_int32       quality,
+             const char   *fileout,
+             l_int32       x,
+             l_int32       y,
+             l_int32       res,
+             const char   *title,
+             L_PDF_DATA  **plpd,
+             l_int32       position)
+{
+l_uint8  *pdfdata;
+l_int32   writenow, failed;
+size_t    pdfbytes;
+
+    PROCNAME("convertToPdf");
+
+        /* The pdf is written only for a standalone image (no lpd) or
+         * for the last image placed on the page. */
+    writenow = (plpd == NULL || position == L_LAST_IMAGE);
+    if (!filein)
+        return ERROR_INT("filein not defined", procName, 1);
+    if (writenow && !fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+
+    if (convertToPdfData(filein, type, quality, &pdfdata, &pdfbytes, x, y,
+                         res, title, plpd, position) != 0)
+        return ERROR_INT("pdf data not made", procName, 1);
+    if (!writenow)
+        return 0;
+
+    failed = l_binaryWrite(fileout, "w", pdfdata, pdfbytes);
+    LEPT_FREE(pdfdata);
+    if (failed)
+        return ERROR_INT("pdf data not written to file", procName, 1);
+    return 0;
+}
+
+
+/*!
+ *  convertImageDataToPdf()
+ *
+ *      Input:  imdata (array of formatted image data; e.g., png, jpeg)
+ *              size (size of image data)
+ *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              fileout (output pdf file; only required on last image on page)
+ *              x, y (location of lower-left corner of image, in pixels,
+ *                    relative to the PostScript origin (0,0) at
+ *                    the lower-left corner of the page)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title)
+ *              &lpd (ptr to lpd, which is created on the first invocation
+ *                    and returned until last image is processed, at which
+ *                    time it is destroyed)
+ *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                       L_LAST_IMAGE)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If @res == 0 and the input resolution field is 0,
+ *          this will use DEFAULT_INPUT_RES.
+ *      (2) See comments in convertToPdf().
+ */
+l_int32
+convertImageDataToPdf(l_uint8      *imdata,
+                      size_t        size,
+                      l_int32       type,
+                      l_int32       quality,
+                      const char   *fileout,
+                      l_int32       x,
+                      l_int32       y,
+                      l_int32       res,
+                      const char   *title,
+                      L_PDF_DATA  **plpd,
+                      l_int32       position)
+{
+l_int32  status;
+PIX     *pixdecoded;
+
+    PROCNAME("convertImageDataToPdf");
+
+    if (!imdata)
+        return ERROR_INT("image data not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+        /* fileout is required when no lpd is used, or on the last image */
+    if (!fileout && (!plpd || position == L_LAST_IMAGE))
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    pixdecoded = pixReadMem(imdata, size);
+    if (!pixdecoded)
+        return ERROR_INT("pix not read", procName, 1);
+    status = pixConvertToPdf(pixdecoded, type, quality, fileout, x, y, res,
+                             title, plpd, position);
+    pixDestroy(&pixdecoded);
+    return status;
+}
+
+
+/*!
+ *  convertToPdfData()
+ *
+ *      Input:  filein (input image file -- any format)
+ *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              &data (<return> pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *              x, y (location of lower-left corner of image, in pixels,
+ *                    relative to the PostScript origin (0,0) at
+ *                    the lower-left corner of the page)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title; if null, use filein)
+ *              &lpd (ptr to lpd, which is created on the first invocation
+ *                    and returned until last image is processed, at which
+ *                    time it is destroyed)
+ *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                       L_LAST_IMAGE)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If @res == 0 and the input resolution field is 0,
+ *          this will use DEFAULT_INPUT_RES.
+ *      (2) See comments in convertToPdf().
+ */
+l_int32
+convertToPdfData(const char   *filein,
+                 l_int32       type,
+                 l_int32       quality,
+                 l_uint8     **pdata,
+                 size_t       *pnbytes,
+                 l_int32       x,
+                 l_int32       y,
+                 l_int32       res,
+                 const char   *title,
+                 L_PDF_DATA  **plpd,
+                 l_int32       position)
+{
+l_int32  ret;
+PIX     *pix;
+
+    PROCNAME("convertToPdfData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!filein)
+        return ERROR_INT("filein not defined", procName, 1);
+    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
+        type != L_FLATE_ENCODE)
+        return ERROR_INT("invalid conversion type", procName, 1);
+    if ((pix = pixRead(filein)) == NULL)
+        return ERROR_INT("pix not made", procName, 1);
+        /* Title defaults to filein; the encoder status goes to the caller */
+    ret = pixConvertToPdfData(pix, type, quality, pdata, pnbytes, x, y, res,
+                              (title) ? title : filein, plpd, position);
+    pixDestroy(&pix);
+    return ret;
+}
+
+
+/*!
+ *  convertImageDataToPdfData()
+ *
+ *      Input:  imdata (array of formatted image data; e.g., png, jpeg)
+ *              size (size of image data)
+ *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              &data (<return> pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *              x, y (location of lower-left corner of image, in pixels,
+ *                    relative to the PostScript origin (0,0) at
+ *                    the lower-left corner of the page)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title)
+ *              &lpd (ptr to lpd, which is created on the first invocation
+ *                    and returned until last image is processed, at which
+ *                    time it is destroyed)
+ *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                       L_LAST_IMAGE)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If @res == 0 and the input resolution field is 0,
+ *          this will use DEFAULT_INPUT_RES.
+ *      (2) See comments in convertToPdf().
+ */
+l_int32
+convertImageDataToPdfData(l_uint8      *imdata,
+                          size_t        size,
+                          l_int32       type,
+                          l_int32       quality,
+                          l_uint8     **pdata,
+                          size_t       *pnbytes,
+                          l_int32       x,
+                          l_int32       y,
+                          l_int32       res,
+                          const char   *title,
+                          L_PDF_DATA  **plpd,
+                          l_int32       position)
+{
+l_int32  status;
+PIX     *pixdecoded;
+
+    PROCNAME("convertImageDataToPdfData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!imdata)
+        return ERROR_INT("image data not defined", procName, 1);
+        /* On the first image of a sequence, clear the caller's lpd handle */
+    if (plpd && position == L_FIRST_IMAGE)
+        *plpd = NULL;
+
+    pixdecoded = pixReadMem(imdata, size);
+    if (!pixdecoded)
+        return ERROR_INT("pix not read", procName, 1);
+    status = pixConvertToPdfData(pixdecoded, type, quality, pdata, pnbytes,
+                                 x, y, res, title, plpd, position);
+    pixDestroy(&pixdecoded);
+    return status;
+}
+
+
+/*!
+ *  pixConvertToPdf()
+ *
+ *      Input:  pix
+ *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              fileout (output pdf file; only required on last image on page)
+ *              x, y (location of lower-left corner of image, in pixels,
+ *                    relative to the PostScript origin (0,0) at
+ *                    the lower-left corner of the page)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title)
+ *              &lpd (ptr to lpd, which is created on the first invocation
+ *                    and returned until last image is processed)
+ *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                       L_LAST_IMAGE)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If @res == 0 and the input resolution field is 0,
+ *          this will use DEFAULT_INPUT_RES.
+ *      (2) This only writes data to fileout if it is the last
+ *          image to be written on the page.
+ *      (3) See comments in convertToPdf().
+ */
+l_int32
+pixConvertToPdf(PIX          *pix,
+                l_int32       type,
+                l_int32       quality,
+                const char   *fileout,
+                l_int32       x,
+                l_int32       y,
+                l_int32       res,
+                const char   *title,
+                L_PDF_DATA  **plpd,
+                l_int32       position)
+{
+l_uint8  *pdfdata;
+l_int32   writenow, failed;
+size_t    pdfbytes;
+
+    PROCNAME("pixConvertToPdf");
+
+    if (!pix)
+        return ERROR_INT("pix not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+
+        /* Write only for a standalone image (no lpd) or the last image */
+    writenow = (plpd == NULL || position == L_LAST_IMAGE);
+    if (writenow && !fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    if (pixConvertToPdfData(pix, type, quality, &pdfdata, &pdfbytes,
+                            x, y, res, title, plpd, position) != 0)
+        return ERROR_INT("pdf data not made", procName, 1);
+    if (!writenow)
+        return 0;
+    failed = l_binaryWrite(fileout, "w", pdfdata, pdfbytes);
+    LEPT_FREE(pdfdata);
+    if (failed)
+        return ERROR_INT("pdf data not written to file", procName, 1);
+    return 0;
+}
+
+
+/*!
+ *  pixWriteStreamPdf()
+ *
+ *      Input:  fp (stream opened for writing)
+ *              pix (all depths, cmap OK)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title; taken from the first image
+ *                     placed on a page; e.g., an input image filename)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This is the simplest interface for writing a single image
+ *          with pdf encoding to a stream.  It uses G4 encoding for 1 bpp,
+ *          JPEG encoding for 8 bpp (no cmap) and 32 bpp, and FLATE
+ *          encoding for everything else.
+ */
+l_int32
+pixWriteStreamPdf(FILE        *fp,
+                  PIX         *pix,
+                  l_int32      res,
+                  const char  *title)
+{
+l_uint8  *pdfdata;
+size_t    pdfbytes, nwritten;
+
+    PROCNAME("pixWriteStreamPdf");
+
+    if (!fp)
+        return ERROR_INT("stream not opened", procName, 1);
+    if (!pix)
+        return ERROR_INT("pix not defined", procName, 1);
+
+        /* Build the whole pdf in memory, then emit it with a single write */
+    if (pixWriteMemPdf(&pdfdata, &pdfbytes, pix, res, title))
+        return ERROR_INT("pdf data not made", procName, 1);
+    nwritten = fwrite(pdfdata, 1, pdfbytes, fp);
+    LEPT_FREE(pdfdata);
+    if (nwritten == pdfbytes)
+        return 0;
+    return ERROR_INT("failure writing pdf data to stream", procName, 1);
+}
+
+
+/*!
+ *  pixWriteMemPdf()
+ *
+ *      Input:  &data (<return> pdf as byte array)
+ *              &nbytes (<return> number of bytes in pdf array)
+ *              pix (all depths, cmap OK)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title; taken from the first image
+ *                     placed on a page; e.g., an input image filename)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This is the simplest interface for writing a single image
+ *          with pdf encoding to memory.  It uses G4 encoding for 1 bpp,
+ *          JPEG encoding for 8 bpp (no cmap) and 32 bpp, and FLATE
+ *          encoding for everything else.
+ */
+l_int32
+pixWriteMemPdf(l_uint8    **pdata,
+               size_t      *pnbytes,
+               PIX         *pix,
+               l_int32      res,
+               const char  *title)
+{
+l_int32   depth, enctype;
+PIXCMAP  *colormap;
+
+    PROCNAME("pixWriteMemPdf");
+
+    if (pdata) *pdata = NULL;
+    if (pnbytes) *pnbytes = 0;
+    if (!pdata || !pnbytes)
+        return ERROR_INT("&data or &nbytes not defined", procName, 1);
+    if (!pix)
+        return ERROR_INT("pix not defined", procName, 1);
+
+        /* G4 for 1 bpp; flate for colormapped or 2/4/16 bpp; otherwise jpeg */
+    depth = pixGetDepth(pix);
+    colormap = pixGetColormap(pix);
+    if (depth == 1)
+        enctype = L_G4_ENCODE;
+    else if (!colormap && depth != 2 && depth != 4 && depth != 16)
+        enctype = L_JPEG_ENCODE;
+    else
+        enctype = L_FLATE_ENCODE;
+
+    if (pixConvertToPdfData(pix, enctype, 75, pdata, pnbytes,
+                            0, 0, res, title, NULL, 0) != 0)
+        return ERROR_INT("pdf data not made", procName, 1);
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ *            Segmented multi-page, multi-image converter              *
+ *---------------------------------------------------------------------*/
+/*!
+ *  convertSegmentedFilesToPdf()
+ *
+ *      Input:  directory name (containing images)
+ *              substr (<optional> substring filter on filenames; can be NULL)
+ *              res (input resolution of all images)
+ *              type (compression type for non-image regions; the
+ *                    image regions are always compressed with L_JPEG_ENCODE)
+ *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
+ *              boxaa (<optional> of image regions)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              scalefactor (scaling factor applied to each image region)
+ *              title (<optional> pdf title; if null, taken from the first
+ *                     image filename)
+ *              fileout (pdf file of all images)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If @substr is not NULL, only image filenames that contain
+ *          the substring can be used.  If @substr == NULL, all files
+ *          in the directory are used.
+ *      (2) The files in the directory, after optional filtering by
+ *          the substring, are lexically sorted in increasing order
+ *          before concatenation.
+ *      (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
+ *          colormap and many colors, or 32 bpp; FLATE for anything else.
+ *      (4) The boxaa, if it exists, contains one boxa of "image regions"
+ *          for each image file.  The boxa must be aligned with the
+ *          sorted set of images.
+ *      (5) The scalefactor is applied to each image region.  It is
+ *          typically < 1.0, to save bytes in the final pdf, because
+ *          the resolution is often not critical in non-text regions.
+ *      (6) If the non-image regions have pixel depth > 1 and the encoding
+ *          type is G4, they are automatically scaled up by 2x and
+ *          thresholded.  Otherwise, no scaling is performed on them.
+ *      (7) Note that this function can be used to generate multipage
+ *          G4 compressed pdf from any input, by using @boxaa == NULL
+ *          and @type == L_G4_ENCODE.
+ */
+l_int32
+convertSegmentedFilesToPdf(const char  *dirname,
+                           const char  *substr,
+                           l_int32      res,
+                           l_int32      type,
+                           l_int32      thresh,
+                           BOXAA       *baa,
+                           l_int32      quality,
+                           l_float32    scalefactor,
+                           const char  *title,
+                           const char  *fileout)
+{
+char     *fname;
+l_uint8  *imdata, *data = NULL;  /* NULL unless the concatenation sets it */
+l_int32   i, npages, nboxa, nboxes, ret;
+size_t    imbytes, databytes = 0;
+BOXA     *boxa;
+L_BYTEA  *ba;
+L_PTRA   *pa_data;
+SARRAY   *sa;
+
+    PROCNAME("convertSegmentedFilesToPdf");
+
+    if (!dirname)
+        return ERROR_INT("dirname not defined", procName, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    if ((sa = getNumberedPathnamesInDirectory(dirname, substr, 0, 0, 10000))
+            == NULL)
+        return ERROR_INT("sa not made", procName, 1);
+
+    npages = sarrayGetCount(sa);
+        /* If the caller's boxaa is shorter than the page list, extend it
+         * in place with empty boxa so it can be indexed by page number. */
+    if (baa) {
+        nboxa = boxaaGetCount(baa);
+        if (nboxa < npages) {
+            boxa = boxaCreate(1);
+            boxaaExtendWithInit(baa, npages, boxa);
+            boxaDestroy(&boxa);
+        }
+    }
+
+        /* Encode each named page to pdf bytes and save them in the ptra */
+    pa_data = ptraCreate(npages);
+    for (i = 0; i < npages; i++) {
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+        if (!strcmp(fname, "")) continue;  /* empty name: nothing to encode */
+        boxa = NULL;
+        if (baa) {
+            boxa = boxaaGetBoxa(baa, i, L_CLONE);
+            nboxes = boxaGetCount(boxa);
+            if (nboxes == 0)  /* no regions on this page: pass no boxa */
+                boxaDestroy(&boxa);
+        }
+        ret = convertToPdfDataSegmented(fname, res, type, thresh, boxa,
+                                        quality, scalefactor, title,
+                                        &imdata, &imbytes);
+        boxaDestroy(&boxa);  /* safe; in case nboxes > 0 */
+        if (ret) {
+            L_ERROR("pdf encoding failed for %s\n", procName, fname);
+            continue;
+        }
+        ba = l_byteaInitFromMem(imdata, imbytes);
+        if (imdata) LEPT_FREE(imdata);
+        ptraAdd(pa_data, ba);
+    }
+    sarrayDestroy(&sa);
+
+    ptraGetActualCount(pa_data, &npages);
+    if (npages == 0) {
+        L_ERROR("no pdf files made\n", procName);
+        ptraDestroy(&pa_data, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Concatenate the saved pages into one pdf */
+    ret = ptraConcatenatePdfToData(pa_data, NULL, &data, &databytes);
+
+        /* Release the per-page byte arrays and their container */
+    ptraGetActualCount(pa_data, &npages);  /* recalculate in case it changes */
+    for (i = 0; i < npages; i++) {
+        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
+        l_byteaDestroy(&ba);
+    }
+    ptraDestroy(&pa_data, FALSE, FALSE);
+
+    if (ret) {
+        if (data) LEPT_FREE(data);
+        return ERROR_INT("pdf data not made", procName, 1);
+    }
+
+    ret = l_binaryWrite(fileout, "w", data, databytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", procName);
+    return ret;
+}
+
+
+/*!
+ *  convertNumberedMasksToBoxaa()
+ *
+ *      Input:  directory name (containing mask images)
+ *              substr (<optional> substring filter on filenames; can be NULL)
+ *              numpre (number of characters in name before number)
+ *              numpost (number of characters in name after number, up
+ *                       to a dot before an extension; the extension and
+ *                       its dot separator are not counted)
+ *      Return: boxaa of mask regions, or null on error
+ *
+ *  Notes:
+ *      (1) This is conveniently used to generate the input boxaa
+ *          for convertSegmentedFilesToPdf().  It guarantees that the
+ *          boxa will be aligned with the page images, even if some
+ *          of the boxa are empty.
+ */
+BOXAA *
+convertNumberedMasksToBoxaa(const char  *dirname,
+                            const char  *substr,
+                            l_int32      numpre,
+                            l_int32      numpost)
+{
+char    *maskname;
+l_int32  ipage, npages;
+BOXA    *boxa;
+BOXAA   *baa;
+PIX     *pixmask;
+SARRAY  *sapaths;
+
+    PROCNAME("convertNumberedMasksToBoxaa");
+
+    if (!dirname)
+        return (BOXAA *)ERROR_PTR("dirname not defined", procName, NULL);
+
+    sapaths = getNumberedPathnamesInDirectory(dirname, substr, numpre,
+                                              numpost, 10000);
+    if (!sapaths)
+        return (BOXAA *)ERROR_PTR("sa not made", procName, NULL);
+
+        /* Every page slot starts out holding an empty placeholder boxa */
+    npages = sarrayGetCount(sapaths);
+    baa = boxaaCreate(npages);
+    boxa = boxaCreate(1);
+    boxaaInitFull(baa, boxa);
+    boxaDestroy(&boxa);
+    for (ipage = 0; ipage < npages; ipage++) {
+        maskname = sarrayGetString(sapaths, ipage, L_NOCOPY);
+        if (maskname[0] == '\0') continue;  /* empty name: keep placeholder */
+        if ((pixmask = pixRead(maskname)) == NULL) {
+            L_WARNING("invalid image on page %d\n", procName, ipage);
+            continue;
+        }
+        boxa = pixConnComp(pixmask, NULL, 8);  /* boxes found in the mask */
+        boxaaReplaceBoxa(baa, ipage, boxa);
+        pixDestroy(&pixmask);
+    }
+    sarrayDestroy(&sapaths);
+    return baa;
+}
+
+
+/*---------------------------------------------------------------------*
+ *            Segmented single page, multi-image converters            *
+ *---------------------------------------------------------------------*/
+/*!
+ *  convertToPdfSegmented()
+ *
+ *      Input:  filein (input image file -- any format)
+ *              res (input image resolution; typ. 300 ppi; use 0 for default)
+ *              type (compression type for non-image regions; the
+ *                    image regions are always compressed with L_JPEG_ENCODE)
+ *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
+ *              boxa (<optional> of image regions; can be null)
+ *              quality (used for jpeg image regions; 0 for default)
+ *              scalefactor (used for jpeg regions; must be <= 1.0)
+ *              title (<optional> pdf title; typically taken from the
+ *                     input file for the pix)
+ *              fileout (output pdf file)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If there are no image regions, set @boxa == NULL;
+ *          @quality and @scalefactor are ignored.
+ *      (2) Typically, @scalefactor is < 1.0, because the image regions
+ *          can be rendered at a lower resolution (for better compression)
+ *          than the text regions.  If @scalefactor == 0, we use 1.0.
+ *          If the input image is 1 bpp and scalefactor < 1.0, we
+ *          use scaleToGray() to downsample the image regions to gray
+ *          before compressing them.
+ *      (3) If the compression type for non-image regions is L_G4_ENCODE
+ *          and bpp > 1, the image is upscaled 2x and thresholded
+ *          to 1 bpp.  That is the only situation where @thresh is used.
+ *      (4) The parameter @quality is only used for image regions.
+ *          If @type == L_JPEG_ENCODE, default jpeg quality (75) is
+ *          used for the non-image regions.
+ *      (5) Processing matrix for non-image regions.
+ *
+ *          Input           G4              JPEG                FLATE
+ *          ----------|---------------------------------------------------
+ *          1 bpp     |  1x, 1 bpp       1x flate, 1 bpp     1x, 1 bpp
+ *                    |
+ *          cmap      |  2x, 1 bpp       1x flate, cmap      1x, cmap
+ *                    |
+ *          2,4 bpp   |  2x, 1 bpp       1x flate            1x, 2,4 bpp
+ *          no cmap   |                  2,4 bpp
+ *                    |
+ *          8,32 bpp  |  2x, 1 bpp       1x (jpeg)           1x, 8,32 bpp
+ *          no cmap   |                  8,32 bpp
+ *
+ *          Summary:
+ *          (a) if G4 is requested, G4 is used, with 2x upscaling
+ *              for all cases except 1 bpp.
+ *          (b) if JPEG is requested, use flate encoding for all cases
+ *              except 8 bpp without cmap and 32 bpp (rgb).
+ *          (c) if FLATE is requested, use flate with no transformation
+ *              of the raster data.
+ *      (6) Calling options/sequence for these functions:
+ *              file  -->  file      (convertToPdfSegmented)
+ *                  pix  -->  file      (pixConvertToPdfSegmented)
+ *                      pix  -->  data      (pixConvertToPdfDataSegmented)
+ *              file  -->  data      (convertToPdfDataSegmented)
+ *                      pix  -->  data      (pixConvertToPdfDataSegmented)
+ */
+l_int32
+convertToPdfSegmented(const char  *filein,
+                      l_int32      res,
+                      l_int32      type,
+                      l_int32      thresh,
+                      BOXA        *boxa,
+                      l_int32      quality,
+                      l_float32    scalefactor,
+                      const char  *title,
+                      const char  *fileout)
+{
+l_int32  status;
+PIX     *pixpage;
+
+    PROCNAME("convertToPdfSegmented");
+
+    if (!filein)
+        return ERROR_INT("filein not defined", procName, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+    if (boxa && scalefactor > 1.0) {  /* image regions are never upscaled */
+        L_WARNING("setting scalefactor to 1.0\n", procName);
+        scalefactor = 1.0;
+    }
+
+    if ((pixpage = pixRead(filein)) == NULL)
+        return ERROR_INT("pixs not made", procName, 1);
+        /* Without an explicit title, the input filename is used instead */
+    status = pixConvertToPdfSegmented(pixpage, res, type, thresh, boxa,
+                                      quality, scalefactor,
+                                      (title) ? title : filein, fileout);
+    pixDestroy(&pixpage);
+    return status;
+}
+
+
+/*!
+ *  pixConvertToPdfSegmented()
+ *
+ *      Input:  pixs (any depth, cmap OK)
+ *              res (input image resolution; typ. 300 ppi; use 0 for default)
+ *              type (compression type for non-image regions; the
+ *                    image regions are always compressed with L_JPEG_ENCODE)
+ *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
+ *              boxa (<optional> of image regions; can be null)
+ *              quality (used for jpeg image regions; 0 for default)
+ *              scalefactor (used for jpeg regions; must be <= 1.0)
+ *              title (<optional> pdf title; typically taken from the
+ *                     input file for the pix)
+ *              fileout (output pdf file)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) See convertToPdfSegmented() for details.
+ *      (2) The pdf is first generated in memory by
+ *          pixConvertToPdfDataSegmented() and then written to @fileout.
+ *      (3) If @boxa is given and @scalefactor > 1.0, a warning is
+ *          issued and 1.0 is used instead.
+ */
+l_int32
+pixConvertToPdfSegmented(PIX         *pixs,
+                         l_int32      res,
+                         l_int32      type,
+                         l_int32      thresh,
+                         BOXA        *boxa,
+                         l_int32      quality,
+                         l_float32    scalefactor,
+                         const char  *title,
+                         const char  *fileout)
+{
+l_uint8  *pdfdata;
+l_int32   status;
+size_t    pdfsize;
+
+    PROCNAME("pixConvertToPdfSegmented");
+
+    if (pixs == NULL)
+        return ERROR_INT("pixs not defined", procName, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+    if (boxa != NULL && scalefactor > 1.0) {
+        L_WARNING("setting scalefactor to 1.0\n", procName);
+        scalefactor = 1.0;
+    }
+
+        /* Build the pdf in memory ... */
+    if (pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa, quality,
+                                     scalefactor, title,
+                                     &pdfdata, &pdfsize) != 0)
+        return ERROR_INT("pdf generation failure", procName, 1);
+
+        /* ... then write it out and release the buffer */
+    status = l_binaryWrite(fileout, "w", pdfdata, pdfsize);
+    if (pdfdata != NULL)
+        LEPT_FREE(pdfdata);
+    return status;
+}
+
+
+/*!
+ *  convertToPdfDataSegmented()
+ *
+ *      Input:  filein (input image file -- any format)
+ *              res (input image resolution; typ. 300 ppi; use 0 for default)
+ *              type (compression type for non-image regions; the
+ *                    image regions are always compressed with L_JPEG_ENCODE)
+ *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
+ *              boxa (<optional> image regions; can be null)
+ *              quality (used for jpeg image regions; 0 for default)
+ *              scalefactor (used for jpeg regions; must be <= 1.0)
+ *              title (<optional> pdf title; if null, uses filein)
+ *              &data (<return> pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If there are no image regions, set @boxa == NULL;
+ *          @quality and @scalefactor are ignored.
+ *      (2) Typically, @scalefactor is < 1.0.  If @boxa is given and
+ *          @scalefactor > 1.0, a warning is issued and 1.0 is used.
+ *      (3) @data and @nbytes are reset to NULL and 0 before any other
+ *          argument is checked.
+ */
+l_int32
+convertToPdfDataSegmented(const char  *filein,
+                          l_int32      res,
+                          l_int32      type,
+                          l_int32      thresh,
+                          BOXA        *boxa,
+                          l_int32      quality,
+                          l_float32    scalefactor,
+                          const char  *title,
+                          l_uint8    **pdata,
+                          size_t      *pnbytes)
+{
+l_int32  status;
+PIX     *pix;
+
+    PROCNAME("convertToPdfDataSegmented");
+
+        /* Clear the outputs first, so they are defined on every return */
+    if (pdata == NULL)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (pnbytes == NULL)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+
+    if (filein == NULL)
+        return ERROR_INT("filein not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+    if (boxa != NULL && scalefactor > 1.0) {
+        L_WARNING("setting scalefactor to 1.0\n", procName);
+        scalefactor = 1.0;
+    }
+
+        /* The input filename serves as the title when none is given */
+    if (title == NULL)
+        title = filein;
+
+    pix = pixRead(filein);
+    if (pix == NULL)
+        return ERROR_INT("pixs not made", procName, 1);
+
+    status = pixConvertToPdfDataSegmented(pix, res, type, thresh, boxa,
+                                          quality, scalefactor, title,
+                                          pdata, pnbytes);
+    pixDestroy(&pix);
+    return status;
+}
+
+
+/*!
+ *  pixConvertToPdfDataSegmented()
+ *
+ *      Input:  pixs (any depth, cmap OK)
+ *              res (input image resolution; typ. 300 ppi; use 0 for default)
+ *              type (compression type for non-image regions; the
+ *                    image regions are always compressed with L_JPEG_ENCODE)
+ *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
+ *              boxa (<optional> of image regions; can be null)
+ *              quality (used for jpeg image regions; 0 for default)
+ *              scalefactor (used for jpeg regions; must be <= 1.0)
+ *              title (<optional> pdf title; typically taken from the
+ *                     input file for the pix)
+ *              &data (<return> pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) See convertToPdfSegmented() for details.
+ *      (2) If @boxa is given and @scalefactor is not in (0.0 ... 1.0],
+ *          a warning is issued and 1.0 is used.
+ *      (3) The return value reflects the call to pixConvertToPdfData()
+ *          that is expected to emit the pdf (the only call in the simple
+ *          case; the one tagged L_LAST_IMAGE otherwise).  If that call
+ *          fails, or no data was produced, @data is returned as NULL,
+ *          @nbytes as 0, and the function returns 1.
+ *      (4) A failure to encode one of the earlier images is reported
+ *          with a warning; the remaining images are still submitted.
+ */
+l_int32
+pixConvertToPdfDataSegmented(PIX         *pixs,
+                             l_int32      res,
+                             l_int32      type,
+                             l_int32      thresh,
+                             BOXA        *boxa,
+                             l_int32      quality,
+                             l_float32    scalefactor,
+                             const char  *title,
+                             l_uint8    **pdata,
+                             size_t      *pnbytes)
+{
+l_int32      i, nbox, seq, bx, by, bw, bh, upscale, ret;
+l_float32    scale;
+BOX         *box, *boxc, *box2;
+PIX         *pix, *pixt1, *pixt2, *pixt3, *pixt4, *pixt5, *pixt6;
+PIXCMAP     *cmap;
+L_PDF_DATA  *lpd;
+
+    PROCNAME("pixConvertToPdfDataSegmented");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!pixs)
+        return ERROR_INT("pixs not defined", procName, 1);
+    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
+        type != L_FLATE_ENCODE)
+        return ERROR_INT("invalid conversion type", procName, 1);
+    if (boxa && (scalefactor <= 0.0 || scalefactor > 1.0)) {
+        L_WARNING("setting scalefactor to 1.0\n", procName);
+        scalefactor = 1.0;
+    }
+
+        /* Adjust scalefactor so that the product with res gives an integer */
+    if (res <= 0)
+        res = DEFAULT_INPUT_RES;
+    scale = (l_float32)((l_int32)(scalefactor * res + 0.5)) / (l_float32)res;
+    cmap = pixGetColormap(pixs);
+
+        /* Simple case: single image to be encoded */
+    if (!boxa || boxaGetCount(boxa) == 0) {
+        if (pixGetDepth(pixs) > 1 && type == L_G4_ENCODE) {
+                /* G4 needs 1 bpp: go to 8 bpp gray, then upscale 2x
+                 * and threshold; the resolution is doubled to match. */
+            if (cmap)
+                pixt1 = pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE);
+            else
+                pixt1 = pixConvertTo8(pixs, FALSE);
+            pixt2 = pixScaleGray2xLIThresh(pixt1, thresh);
+            ret = pixConvertToPdfData(pixt2, type, quality, pdata, pnbytes,
+                                      0, 0, 2 * res, title, NULL, 0);
+            pixDestroy(&pixt1);
+            pixDestroy(&pixt2);
+        } else {
+            ret = pixConvertToPdfData(pixs, type, quality, pdata, pnbytes,
+                                      0, 0, res, title, NULL, 0);
+        }
+        if (ret || !*pdata) {
+                /* Do not hand back partial output with an error status */
+            if (*pdata) LEPT_FREE(*pdata);
+            *pdata = NULL;
+            *pnbytes = 0;
+            return ERROR_INT("pdf data not made", procName, 1);
+        }
+        return 0;
+    }
+
+        /* Multiple images to be encoded.  If @type == L_G4_ENCODE,
+         * jpeg encode a version of pixs that is blanked in the non-image
+         * regions, and paint the scaled non-image part onto it through a mask.
+         * Otherwise, we must put the non-image part down first and
+         * then render all the image regions separately on top of it,
+         * at their own resolution. */
+    pixt1 = pixSetBlackOrWhiteBoxa(pixs, boxa, L_SET_WHITE);  /* non-image */
+    if (!pixt1)
+        return ERROR_INT("pixt1 not made", procName, 1);
+    nbox = boxaGetCount(boxa);
+
+        /* The accumulator is handed by address to every call below.
+         * Start it at NULL so that it is never read uninitialized,
+         * even if the L_FIRST_IMAGE call returns early. */
+    lpd = NULL;
+    ret = 0;
+    if (type == L_G4_ENCODE) {
+        pixt2 = pixCreateTemplate(pixs);  /* only image regions */
+        pixSetBlackOrWhite(pixt2, L_SET_WHITE);
+        for (i = 0; i < nbox; i++) {
+             box = boxaGetBox(boxa, i, L_CLONE);
+             pix = pixClipRectangle(pixs, box, &boxc);
+             boxGetGeometry(boxc, &bx, &by, &bw, &bh);
+             pixRasterop(pixt2, bx, by, bw, bh, PIX_SRC, pix, 0, 0);
+             pixDestroy(&pix);
+             boxDestroy(&box);
+             boxDestroy(&boxc);
+        }
+        pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
+        if (pixGetDepth(pixt3) == 1)
+            pixt4 = pixScaleToGray(pixt3, scale);
+        else
+            pixt4 = pixScale(pixt3, scale, scale);
+        if (pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes,
+                                0, 0, (l_int32)(scale * res), title,
+                                &lpd, L_FIRST_IMAGE))
+            L_WARNING("image regions not encoded\n", procName);
+
+        if (pixGetDepth(pixt1) == 1) {
+            pixt5 = pixClone(pixt1);
+            upscale = 1;
+        } else {
+            pixt6 = pixConvertTo8(pixt1, 0);
+            pixt5 = pixScaleGray2xLIThresh(pixt6, thresh);
+            pixDestroy(&pixt6);
+            upscale = 2;
+        }
+        ret = pixConvertToPdfData(pixt5, L_G4_ENCODE, quality, pdata, pnbytes,
+                                  0, 0, upscale * res, title,
+                                  &lpd, L_LAST_IMAGE);
+        pixDestroy(&pixt2);
+        pixDestroy(&pixt3);
+        pixDestroy(&pixt4);
+        pixDestroy(&pixt5);
+    } else {
+            /* Put the non-image part down first.  This is the full
+               size of the page, so we can use it to find the page
+               height in pixels, which is required for determining
+               the LL corner of the image relative to the LL corner
+               of the page. */
+        if (pixConvertToPdfData(pixt1, type, quality, pdata, pnbytes, 0, 0,
+                                res, title, &lpd, L_FIRST_IMAGE))
+            L_WARNING("non-image part not encoded\n", procName);
+        for (i = 0; i < nbox; i++) {
+            box = boxaGetBox(boxa, i, L_CLONE);
+            pixt2 = pixClipRectangle(pixs, box, &boxc);
+            pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
+            if (pixGetDepth(pixt3) == 1)
+                pixt4 = pixScaleToGray(pixt3, scale);
+            else
+                pixt4 = pixScale(pixt3, scale, scale);
+            box2 = boxTransform(boxc, 0, 0, scale, scale);
+            boxGetGeometry(box2, &bx, &by, NULL, &bh);
+            seq = (i == nbox - 1) ? L_LAST_IMAGE : L_NEXT_IMAGE;
+                /* After the loop, ret holds the status of the
+                 * L_LAST_IMAGE call */
+            ret = pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality,
+                                      pdata, pnbytes, bx, by,
+                                      (l_int32)(scale * res), title,
+                                      &lpd, seq);
+            if (ret && seq != L_LAST_IMAGE)
+                L_WARNING("image region not encoded\n", procName);
+            pixDestroy(&pixt2);
+            pixDestroy(&pixt3);
+            pixDestroy(&pixt4);
+            boxDestroy(&box);
+            boxDestroy(&boxc);
+            boxDestroy(&box2);
+        }
+    }
+
+    pixDestroy(&pixt1);
+    if (ret || !*pdata) {
+            /* Do not hand back partial output with an error status */
+        if (*pdata) LEPT_FREE(*pdata);
+        *pdata = NULL;
+        *pnbytes = 0;
+        return ERROR_INT("pdf data not made", procName, 1);
+    }
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ *                         Multi-page concatenation                    *
+ *---------------------------------------------------------------------*/
+/*!
+ *  concatenatePdf()
+ *
+ *      Input:  dirname (directory containing single-page pdf files)
+ *              substr (<optional> substring filter on filenames; can be NULL)
+ *              fileout (concatenated pdf file)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) If @substr is not NULL, only filenames that contain
+ *          the substring are used.  If @substr == NULL,
+ *          none of the filenames are filtered out.
+ *      (3) The files in the directory, after optional filtering by
+ *          the substring, are lexically sorted in increasing order
+ *          before concatenation.
+ */
+l_int32
+concatenatePdf(const char  *dirname,
+               const char  *substr,
+               const char  *fileout)
+{
+l_int32  status;
+SARRAY  *paths;
+
+    PROCNAME("concatenatePdf");
+
+    if (dirname == NULL)
+        return ERROR_INT("dirname not defined", procName, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+        /* Collect the sorted pathnames, then concatenate from the list */
+    paths = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (paths == NULL)
+        return ERROR_INT("sa not made", procName, 1);
+
+    status = saConcatenatePdf(paths, fileout);
+    sarrayDestroy(&paths);
+    return status;
+}
+
+
+/*!
+ *  saConcatenatePdf()
+ *
+ *      Input:  sarray (of pathnames for single-page pdf files)
+ *              fileout (concatenated pdf file)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) The concatenated pdf is built in memory by
+ *          saConcatenatePdfToData() and then written to @fileout.
+ */
+l_int32
+saConcatenatePdf(SARRAY      *sa,
+                 const char  *fileout)
+{
+l_uint8  *pdfdata;
+l_int32   status;
+size_t    pdfsize;
+
+    PROCNAME("saConcatenatePdf");
+
+    if (sa == NULL)
+        return ERROR_INT("sa not defined", procName, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+        /* Build the concatenated pdf in memory ... */
+    if (saConcatenatePdfToData(sa, &pdfdata, &pdfsize) != 0)
+        return ERROR_INT("pdf data not made", procName, 1);
+
+        /* ... then write it out and release the buffer */
+    status = l_binaryWrite(fileout, "w", pdfdata, pdfsize);
+    LEPT_FREE(pdfdata);
+    return status;
+}
+
+
+/*!
+ *  ptraConcatenatePdf()
+ *
+ *      Input:  ptra (array of pdf strings, each for a single-page pdf file)
+ *              fileout (concatenated pdf file)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) The concatenated pdf is built in memory by
+ *          ptraConcatenatePdfToData(), which is called here without
+ *          an sarray of filenames, and then written to @fileout.
+ */
+l_int32
+ptraConcatenatePdf(L_PTRA      *pa,
+                   const char  *fileout)
+{
+l_uint8  *pdfdata;
+l_int32   status;
+size_t    pdfsize;
+
+    PROCNAME("ptraConcatenatePdf");
+
+    if (pa == NULL)
+        return ERROR_INT("pa not defined", procName, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+        /* Build the concatenated pdf in memory ... */
+    if (ptraConcatenatePdfToData(pa, NULL, &pdfdata, &pdfsize) != 0)
+        return ERROR_INT("pdf data not made", procName, 1);
+
+        /* ... then write it out and release the buffer */
+    status = l_binaryWrite(fileout, "w", pdfdata, pdfsize);
+    LEPT_FREE(pdfdata);
+    return status;
+}
+
+
+/*!
+ *  concatenatePdfToData()
+ *
+ *      Input:  dirname (directory containing single-page pdf files)
+ *              substr (<optional> substring filter on filenames; can be NULL)
+ *              &data (<return> concatenated pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) If @substr is not NULL, only filenames that contain
+ *          the substring are used.  If @substr == NULL,
+ *          none of the filenames are filtered out.
+ *      (3) The files in the directory, after optional filtering by
+ *          the substring, are lexically sorted in increasing order
+ *          before concatenation.
+ *      (4) @data and @nbytes are reset to NULL and 0 before any other
+ *          argument is checked.
+ */
+l_int32
+concatenatePdfToData(const char  *dirname,
+                     const char  *substr,
+                     l_uint8    **pdata,
+                     size_t      *pnbytes)
+{
+l_int32  status;
+SARRAY  *paths;
+
+    PROCNAME("concatenatePdfToData");
+
+        /* Clear the outputs first, so they are defined on every return */
+    if (pdata == NULL)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (pnbytes == NULL)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (dirname == NULL)
+        return ERROR_INT("dirname not defined", procName, 1);
+
+        /* Collect the sorted pathnames, then concatenate from the list */
+    paths = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (paths == NULL)
+        return ERROR_INT("sa not made", procName, 1);
+
+    status = saConcatenatePdfToData(paths, pdata, pnbytes);
+    sarrayDestroy(&paths);
+    return status;
+}
+
+
+/*!
+ *  saConcatenatePdfToData()
+ *
+ *      Input:  sarray (of pathnames for single-page pdf files)
+ *              &data (<return> concatenated pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) Each file is read into an L_BYTEA; these are collected in a
+ *          ptra that is handed, along with the sarray, to
+ *          ptraConcatenatePdfToData().  The byte arrays are destroyed
+ *          here before returning.
+ *      (3) It is an error if the sarray is empty.
+ */
+l_int32
+saConcatenatePdfToData(SARRAY    *sa,
+                       l_uint8  **pdata,
+                       size_t    *pnbytes)
+{
+char     *path;
+l_int32   index, count, status;
+L_BYTEA  *ba;
+L_PTRA   *pages;  /* input pdf data for each page */
+
+    PROCNAME("saConcatenatePdfToData");
+
+        /* Clear the outputs first, so they are defined on every return */
+    if (pdata == NULL)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (pnbytes == NULL)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (sa == NULL)
+        return ERROR_INT("sa not defined", procName, 1);
+
+        /* Read the pdf files into memory */
+    count = sarrayGetCount(sa);
+    if (count == 0)
+        return ERROR_INT("no filenames found", procName, 1);
+    pages = ptraCreate(count);
+    index = 0;
+    while (index < count) {
+        path = sarrayGetString(sa, index, L_NOCOPY);
+        ba = l_byteaInitFromFile(path);
+        ptraAdd(pages, ba);
+        index++;
+    }
+
+    status = ptraConcatenatePdfToData(pages, sa, pdata, pnbytes);
+
+        /* Cleanup: some pages could have been removed, so get the
+         * count again before emptying the array */
+    ptraGetActualCount(pages, &count);
+    index = 0;
+    while (index < count) {
+        ba = (L_BYTEA *)ptraRemove(pages, index, L_NO_COMPACTION);
+        l_byteaDestroy(&ba);
+        index++;
+    }
+    ptraDestroy(&pages, FALSE, FALSE);
+    return status;
+}
+
+/* --------------------------------------------*/
+#endif  /* USE_PDFIO */
+/* --------------------------------------------*/