summaryrefslogtreecommitdiff
path: root/src/pdfio1.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/pdfio1.c')
-rw-r--r--src/pdfio1.c2135
1 files changed, 2135 insertions, 0 deletions
diff --git a/src/pdfio1.c b/src/pdfio1.c
new file mode 100644
index 0000000..ac3f553
--- /dev/null
+++ b/src/pdfio1.c
@@ -0,0 +1,2135 @@
+/*====================================================================*
+ - Copyright (C) 2001 Leptonica. All rights reserved.
+ -
+ - Redistribution and use in source and binary forms, with or without
+ - modification, are permitted provided that the following conditions
+ - are met:
+ - 1. Redistributions of source code must retain the above copyright
+ - notice, this list of conditions and the following disclaimer.
+ - 2. Redistributions in binary form must reproduce the above
+ - copyright notice, this list of conditions and the following
+ - disclaimer in the documentation and/or other materials
+ - provided with the distribution.
+ -
+ - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
+ - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *====================================================================*/
+
+/*
+ * pdfio1.c
+ *
+ * Higher-level operations for generating pdf.
+ *
+ * |=============================================================|
+ * | Important note |
+ * |=============================================================|
+ * | Some of these functions require libtiff, libjpeg, and libz |
+ * | If you do not have these libraries, you must set |
+ * | #define USE_PDFIO 0 |
+ * | in environ.h. This will link pdfiostub.c |
+ * |=============================================================|
+ *
+ * Set 1. These functions convert a set of image files
+ * to a multi-page pdf file, with one image on each page.
+ * All images are rendered at the same (input) resolution.
+ * The images can be specified as being in a directory, or they
+ * can be in an sarray. The output pdf can be either a file
+ * or an array of bytes in memory.
+ *
+ * Set 2. These functions are a special case of set 1, where
+ * no scaling or change in quality is required. For jpeg and
+ * jp2k images, the bytes in each jpeg file can be directly
+ * incorporated into the output pdf, and the wrapping up of
+ * multiple image files is very fast. For non-interlaced png,
+ * the data bytes including the predictors can also be written
+ * directly into the flate pdf data. For other image formats,
+ * transcoding is required, where the image data is first
+ * decompressed and then the G4 or Flate (gzip) encodings are generated.
+ *
+ * Set 3. These functions convert a set of images in memory
+ * to a multi-page pdf, with one image on each page. The pdf
+ * output can be either a file or an array of bytes in memory.
+ *
+ * Set 4. These functions implement a pdf output "device driver"
+ * for wrapping (encoding) any number of images on a single page
+ * in pdf. The input can be either an image file or a Pix;
+ * the pdf output can be either a file or an array of bytes in memory.
+ *
+ * Set 5. These "segmented" functions take a set of image
+ * files, along with optional segmentation information, and
+ * generate a multi-page pdf file, where each page consists
+ * in general of a mixed raster pdf of image and non-image regions.
+ * The segmentation information for each page can be input as
+ * either a mask over the image parts, or as a Boxa of those
+ * regions.
+ *
+ * Set 6. These "segmented" functions convert an image and
+ * an optional Boxa of image regions into a mixed raster pdf file
+ * for the page. The input image can be either a file or a Pix.
+ *
+ * Set 7. These functions take a set of single-page pdf files
+ * and concatenate them into a multi-page pdf.
+ * The input can be a set of single page pdf files, or of
+ * pdf 'strings' in memory. The output can be either a file or
+ * an array of bytes in memory.
+ *
+ * The images in the pdf file can be rendered using a pdf viewer,
+ * such as gv, evince, xpdf or acroread.
+ *
+ * Reference on the pdf file format:
+ * http://www.adobe.com/devnet/pdf/pdf_reference_archive.html
+ *
+ * 1. Convert specified image files to pdf (one image file per page)
+ * l_int32 convertFilesToPdf()
+ * l_int32 saConvertFilesToPdf()
+ * l_int32 saConvertFilesToPdfData()
+ * l_int32 selectDefaultPdfEncoding()
+ *
+ * 2. Convert specified image files to pdf without scaling
+ * l_int32 convertUnscaledFilesToPdf()
+ * l_int32 saConvertUnscaledFilesToPdf()
+ * l_int32 saConvertUnscaledFilesToPdfData()
+ * l_int32 convertUnscaledToPdfData()
+ *
+ * 3. Convert multiple images to pdf (one image per page)
+ * l_int32 pixaConvertToPdf()
+ * l_int32 pixaConvertToPdfData()
+ *
+ * 4. Single page, multi-image converters
+ * l_int32 convertToPdf()
+ * l_int32 convertImageDataToPdf()
+ * l_int32 convertToPdfData()
+ * l_int32 convertImageDataToPdfData()
+ * l_int32 pixConvertToPdf()
+ * l_int32 pixWriteStreamPdf()
+ * l_int32 pixWriteMemPdf()
+ *
+ * 5. Segmented multi-page, multi-image converter
+ * l_int32 convertSegmentedFilesToPdf()
+ * BOXAA *convertNumberedMasksToBoxaa()
+ *
+ * 6. Segmented single page, multi-image converters
+ * l_int32 convertToPdfSegmented()
+ * l_int32 pixConvertToPdfSegmented()
+ * l_int32 convertToPdfDataSegmented()
+ * l_int32 pixConvertToPdfDataSegmented()
+ *
+ * 7. Multipage concatenation
+ * l_int32 concatenatePdf()
+ * l_int32 saConcatenatePdf()
+ * l_int32 ptraConcatenatePdf()
+ * l_int32 concatenatePdfToData()
+ * l_int32 saConcatenatePdfToData()
+ *
+ * The top-level multi-image functions can be visualized as follows:
+ * Output pdf data to file:
+ * convertToPdf() and convertImageDataToPdf()
+ * --> pixConvertToPdf()
+ * --> pixConvertToPdfData()
+ *
+ * Output pdf data to array in memory:
+ * convertToPdfData() and convertImageDataToPdfData()
+ * --> pixConvertToPdfData()
+ *
+ * The top-level segmented image functions can be visualized as follows:
+ * Output pdf data to file:
+ * convertToPdfSegmented()
+ * --> pixConvertToPdfSegmented()
+ * --> pixConvertToPdfDataSegmented()
+ *
+ * Output pdf data to array in memory:
+ * convertToPdfDataSegmented()
+ * --> pixConvertToPdfDataSegmented()
+ *
+ * For multi-page concatenation, there are three different types of input
+ * (1) directory and optional filename filter
+ * (2) sarray of filenames
+ * (3) ptra of byte arrays of pdf data
+ * and two types of output for the concatenated pdf data
+ * (1) filename
+ * (2) data array and size
+ * High-level interfaces are given for each of the six combinations.
+ *
+ * Note: When wrapping small images into pdf, it is useful to give
+ * them a relatively low resolution value, to avoid rounding errors
+ * when rendering the images. For example, if you want an image
+ * of width w pixels to be 5 inches wide on a screen, choose a
+ * resolution w/5.
+ *
+ * The very fast functions in section (2) require neither transcoding
+ * nor parsing of the compressed jpeg file. With three types of image
+ * compression, the compressed strings can be incorporated into
+ * the pdf data without decompression and re-encoding: jpeg, jp2k
+ * and png. The DCTDecode and JPXDecode filters can handle the
+ * entire jpeg and jp2k encoded string as a byte array in the pdf file.
+ * The FlateDecode filter can handle the png compressed image data,
+ * including predictors that occur as the first byte in each
+ * raster line, but it is necessary to store only the png IDAT chunk
+ * data in the pdf array. The alternative for wrapping png images
+ * is to uncompress into a raster (a pix) and then gzip the raster data.
+ * This typically results in a larger pdf file, because it doesn't
+ * use the two-dimensional png predictor. Colormaps, which are found
+ * in png PLTE chunks, must always be pulled out and included separately
+ * in the pdf. For CCITT-G4 compression, you can not simply
+ * include a tiff G4 file -- you must either parse it and extract the
+ * G4 compressed data within it, or uncompress to a raster and
+ * G4 compress again.
+ */
+
+#include <string.h>
+#include <math.h>
+#include "allheaders.h"
+
+/* --------------------------------------------*/
+#if USE_PDFIO /* defined in environ.h */
+ /* --------------------------------------------*/
+
+ /* Default input resolution, in ppi (pixels/inch); 300 is typical for scanned images */
+static const l_int32 DEFAULT_INPUT_RES = 300;
+
+
+/*---------------------------------------------------------------------*
+ * Convert specified image files to pdf (one image file per page) *
+ *---------------------------------------------------------------------*/
+/*!
+ * convertFilesToPdf()
+ *
+ * Input: directory name (containing images)
+ * substr (<optional> substring filter on filenames; can be NULL)
+ * res (input resolution of all images)
+ * scalefactor (scaling factor applied to each image; > 0.0)
+ * type (encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ * L_FLATE_ENCODE, or 0 for default)
+ * quality (used for JPEG only; 0 for default (75))
+ * title (<optional> pdf title; if null, taken from the first
+ * image filename)
+ * fileout (pdf file of all images)
+ * Return: 0 if OK, 1 on error
+ *
+ * Notes:
+ * (1) If @substr is not NULL, only image filenames that contain
+ * the substring can be used. If @substr == NULL, all files
+ * in the directory are used.
+ * (2) The files in the directory, after optional filtering by
+ * the substring, are lexically sorted in increasing order
+ * before concatenation.
+ * (3) The scalefactor is applied to each image before encoding.
+ * If you enter a value <= 0.0, it will be set to 1.0.
+ * (4) Specifying one of the three encoding types for @type forces
+ * all images to be compressed with that type. Use 0 to have
+ * the type determined for each image based on depth and whether
+ * or not it has a colormap.
+ */
+l_int32
+convertFilesToPdf(const char  *dirname,
+                  const char  *substr,
+                  l_int32      res,
+                  l_float32    scalefactor,
+                  l_int32      type,
+                  l_int32      quality,
+                  const char  *title,
+                  const char  *fileout)
+{
+SARRAY  *pathnames;
+l_int32  status;
+
+    PROCNAME("convertFilesToPdf");
+
+        /* The input directory and the output file are both mandatory */
+    if (dirname == NULL)
+        return ERROR_INT("dirname not defined", procName, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+        /* Collect the pathnames in the directory, optionally
+         * filtered by @substr */
+    pathnames = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (pathnames == NULL)
+        return ERROR_INT("sa not made", procName, 1);
+
+        /* Hand the list to the sarray-based converter, then release it */
+    status = saConvertFilesToPdf(pathnames, res, scalefactor, type, quality,
+                                 title, fileout);
+    sarrayDestroy(&pathnames);
+    return status;
+}
+
+
+/*!
+ * saConvertFilesToPdf()
+ *
+ * Input: sarray (of pathnames for images)
+ * res (input resolution of all images)
+ * scalefactor (scaling factor applied to each image; > 0.0)
+ * type (encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ * L_FLATE_ENCODE, or 0 for default)
+ * quality (used for JPEG only; 0 for default (75))
+ * title (<optional> pdf title; if null, taken from the first
+ * image filename)
+ * fileout (pdf file of all images)
+ * Return: 0 if OK, 1 on error
+ *
+ * Notes:
+ * (1) See convertFilesToPdf().
+ */
+l_int32
+saConvertFilesToPdf(SARRAY      *sa,
+                    l_int32      res,
+                    l_float32    scalefactor,
+                    l_int32      type,
+                    l_int32      quality,
+                    const char  *title,
+                    const char  *fileout)
+{
+l_uint8  *data = NULL;
+l_int32   ret;
+size_t    nbytes = 0;
+
+    PROCNAME("saConvertFilesToPdf");
+
+    if (!sa)
+        return ERROR_INT("sa not defined", procName, 1);
+        /* Reject a missing output path before doing the expensive
+         * per-page encoding, as convertFilesToPdf() does. */
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+        /* Build the multi-page pdf in memory */
+    ret = saConvertFilesToPdfData(sa, res, scalefactor, type, quality,
+                                  title, &data, &nbytes);
+    if (ret) {
+        if (data) LEPT_FREE(data);
+        return ERROR_INT("pdf data not made", procName, 1);
+    }
+
+        /* ... and write it out; the buffer is ours to free either way */
+    ret = l_binaryWrite(fileout, "w", data, nbytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", procName);
+    return ret;
+}
+
+
+/*!
+ * saConvertFilesToPdfData()
+ *
+ * Input: sarray (of pathnames for images)
+ * res (input resolution of all images)
+ * scalefactor (scaling factor applied to each image; > 0.0)
+ * type (encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ * L_FLATE_ENCODE, or 0 for default)
+ * quality (used for JPEG only; 0 for default (75))
+ * title (<optional> pdf title; if null, taken from the first
+ * image filename)
+ * &data (<return> output pdf data (of all images)
+ * &nbytes (<return> size of output pdf data)
+ * Return: 0 if OK, 1 on error
+ *
+ * Notes:
+ * (1) See convertFilesToPdf().
+ */
+l_int32
+saConvertFilesToPdfData(SARRAY      *sa,
+                        l_int32      res,
+                        l_float32    scalefactor,
+                        l_int32      type,
+                        l_int32      quality,
+                        const char  *title,
+                        l_uint8    **pdata,
+                        size_t      *pnbytes)
+{
+char        *fname;
+const char  *pdftitle;
+l_uint8     *imdata;
+l_int32      i, n, ret, pagetype, npages, scaledres;
+size_t       imbytes;
+L_BYTEA     *ba;
+PIX         *pixs, *pix;
+L_PTRA      *pa_data;
+
+    PROCNAME("saConvertFilesToPdfData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!sa)
+        return ERROR_INT("sa not defined", procName, 1);
+    if (scalefactor <= 0.0) scalefactor = 1.0;
+    if (type < 0 || type > L_FLATE_ENCODE) {
+        L_WARNING("invalid compression type; using per-page default\n",
+                  procName);
+        type = 0;
+    }
+
+        /* Generate all the encoded pdf strings.  Bail out if the
+         * container cannot be made; otherwise the page count below
+         * would be read without ever having been set. */
+    n = sarrayGetCount(sa);
+    if ((pa_data = ptraCreate(n)) == NULL)
+        return ERROR_INT("pa_data not made", procName, 1);
+    pdftitle = NULL;
+    scaledres = (l_int32)(res * scalefactor);  /* same for every page */
+    for (i = 0; i < n; i++) {
+        if (i && (i % 10 == 0)) fprintf(stderr, ".. %d ", i);
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+        if ((pixs = pixRead(fname)) == NULL) {
+            L_ERROR("image not readable from file %s\n", procName, fname);
+            continue;
+        }
+
+            /* The title defaults to the name of the first readable image */
+        if (!pdftitle)
+            pdftitle = (title) ? title : fname;
+        if (scalefactor != 1.0)
+            pix = pixScale(pixs, scalefactor, scalefactor);
+        else
+            pix = pixClone(pixs);
+
+            /* Only @pix is used from here on.  Releasing @pixs now means
+             * that no error path below can leak it. */
+        pixDestroy(&pixs);
+        if (type != 0) {
+            pagetype = type;
+        } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
+            L_ERROR("encoding type selection failed for file %s\n",
+                    procName, fname);
+            pixDestroy(&pix);  /* was leaked on this path */
+            continue;
+        }
+        ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
+                                  0, 0, scaledres, pdftitle, NULL, 0);
+        pixDestroy(&pix);
+        if (ret) {
+            L_ERROR("pdf encoding failed for %s\n", procName, fname);
+            continue;
+        }
+
+            /* Copy the page into a bytea and store it in the ptra */
+        ba = l_byteaInitFromMem(imdata, imbytes);
+        if (imdata) LEPT_FREE(imdata);
+        ptraAdd(pa_data, ba);
+    }
+    ptraGetActualCount(pa_data, &npages);
+    if (npages == 0) {
+        L_ERROR("no pdf files made\n", procName);
+        ptraDestroy(&pa_data, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Concatenate them */
+    fprintf(stderr, "\nconcatenating ... ");
+    ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
+    fprintf(stderr, "done\n");
+
+        /* Free the single-page data, then the (now empty) container */
+    ptraGetActualCount(pa_data, &npages);  /* recalculate in case it changes */
+    for (i = 0; i < npages; i++) {
+        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
+        l_byteaDestroy(&ba);
+    }
+    ptraDestroy(&pa_data, FALSE, FALSE);
+    return ret;
+}
+
+
+/*!
+ * selectDefaultPdfEncoding()
+ *
+ * Input: pix
+ * &type (<return> L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ * Return: 0 if OK, 1 on error
+ *
+ * Notes:
+ * (1) This attempts to choose an encoding for the pix that results
+ * in the smallest file, assuming that if jpeg encoded, it will
+ * use quality = 75. The decision is approximate, in that
+ * (a) all colormapped images will be losslessly encoded with
+ * gzip (flate), and (b) an image with less than about 20 colors
+ * is likely to be smaller if flate encoded than if encoded
+ * as a jpeg (dct). For example, an image made by pixScaleToGray3()
+ * will have 10 colors, and flate encoding will give about
+ * twice the compression as jpeg with quality = 75.
+ */
+l_int32
+selectDefaultPdfEncoding(PIX      *pix,
+                         l_int32  *ptype)
+{
+l_int32   w, h, d, factor, ncolors;
+PIXCMAP  *cmap;
+
+    PROCNAME("selectDefaultPdfEncoding");
+
+    if (!pix)
+        return ERROR_INT("pix not defined", procName, 1);
+    if (!ptype)
+        return ERROR_INT("&type not defined", procName, 1);
+    *ptype = L_FLATE_ENCODE;  /* default universal encoding */
+    pixGetDimensions(pix, &w, &h, &d);
+    cmap = pixGetColormap(pix);
+    if (d == 8 && !cmap) {
+            /* Form the product in floating point: w * h computed as
+             * l_int32 can overflow for very large images. */
+        factor = L_MAX(1,
+                       (l_int32)sqrt((l_float64)w * (l_float64)h / 20000.));
+
+            /* Start from 0 so that a failed count is never read as
+             * garbage; it then selects the flate default. */
+        ncolors = 0;
+        pixNumColors(pix, factor, &ncolors);
+        if (ncolors < 20)
+            *ptype = L_FLATE_ENCODE;
+        else
+            *ptype = L_JPEG_ENCODE;
+    } else if (d == 1) {
+        *ptype = L_G4_ENCODE;
+    } else if (cmap || d == 2 || d == 4) {
+        *ptype = L_FLATE_ENCODE;
+    } else if (d == 8 || d == 32) {
+        *ptype = L_JPEG_ENCODE;
+    } else {
+            /* Any other depth without a colormap is not handled */
+        return ERROR_INT("type selection failure", procName, 1);
+    }
+
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ * Convert specified image files to pdf without scaling *
+ *---------------------------------------------------------------------*/
+/*!
+ * convertUnscaledFilesToPdf()
+ *
+ * Input: directory name (containing images)
+ * substr (<optional> substring filter on filenames; can be NULL)
+ * title (<optional> pdf title; if null, taken from the first
+ * image filename)
+ * fileout (pdf file of all images)
+ * Return: 0 if OK, 1 on error
+ *
+ * Notes:
+ * (1) If @substr is not NULL, only image filenames that contain
+ * the substring can be used. If @substr == NULL, all files
+ * in the directory are used.
+ * (2) The files in the directory, after optional filtering by
+ * the substring, are lexically sorted in increasing order
+ * before concatenation.
+ * (3) For jpeg and jp2k, this is very fast because the compressed
+ * data is wrapped up and concatenated. For png and tiffg4,
+ * the images must be read and recompressed.
+ */
+l_int32
+convertUnscaledFilesToPdf(const char  *dirname,
+                          const char  *substr,
+                          const char  *title,
+                          const char  *fileout)
+{
+SARRAY  *pathnames;
+l_int32  status;
+
+    PROCNAME("convertUnscaledFilesToPdf");
+
+        /* The input directory and the output file are both mandatory */
+    if (dirname == NULL)
+        return ERROR_INT("dirname not defined", procName, 1);
+    if (fileout == NULL)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+        /* Collect the pathnames in the directory, optionally
+         * filtered by @substr */
+    pathnames = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (pathnames == NULL)
+        return ERROR_INT("sa not made", procName, 1);
+
+        /* Hand the list to the sarray-based converter, then release it */
+    status = saConvertUnscaledFilesToPdf(pathnames, title, fileout);
+    sarrayDestroy(&pathnames);
+    return status;
+}
+
+
+/*!
+ * saConvertUnscaledFilesToPdf()
+ *
+ * Input: sarray (of pathnames for images)
+ * title (<optional> pdf title; if null, taken from the first
+ * image filename)
+ * fileout (pdf file of all images)
+ * Return: 0 if OK, 1 on error
+ *
+ * Notes:
+ * (1) See convertUnscaledFilesToPdf().
+ */
+l_int32
+saConvertUnscaledFilesToPdf(SARRAY      *sa,
+                            const char  *title,
+                            const char  *fileout)
+{
+l_uint8  *data = NULL;
+l_int32   ret;
+size_t    nbytes = 0;
+
+    PROCNAME("saConvertUnscaledFilesToPdf");
+
+    if (!sa)
+        return ERROR_INT("sa not defined", procName, 1);
+        /* Reject a missing output path before wrapping any images,
+         * as convertUnscaledFilesToPdf() does. */
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+        /* Build the multi-page pdf in memory */
+    ret = saConvertUnscaledFilesToPdfData(sa, title, &data, &nbytes);
+    if (ret) {
+        if (data) LEPT_FREE(data);
+        return ERROR_INT("pdf data not made", procName, 1);
+    }
+
+        /* ... and write it out; the buffer is ours to free either way */
+    ret = l_binaryWrite(fileout, "w", data, nbytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", procName);
+    return ret;
+}
+
+
+/*!
+ * saConvertUnscaledFilesToPdfData()
+ *
+ * Input: sarray (of pathnames for images)
+ * title (<optional> pdf title; if null, taken from the first
+ * image filename)
+ * &data (<return> output pdf data (of all images)
+ * &nbytes (<return> size of output pdf data)
+ * Return: 0 if OK, 1 on error
+ */
+l_int32
+saConvertUnscaledFilesToPdfData(SARRAY      *sa,
+                                const char  *title,
+                                l_uint8    **pdata,
+                                size_t      *pnbytes)
+{
+char     *fname;
+l_uint8  *imdata;
+l_int32   i, n, ret, npages;
+size_t    imbytes;
+L_BYTEA  *ba;
+L_PTRA   *pa_data;
+
+    PROCNAME("saConvertUnscaledFilesToPdfData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!sa)
+        return ERROR_INT("sa not defined", procName, 1);
+
+        /* Generate all the encoded pdf strings.  Bail out if the
+         * container cannot be made; otherwise the page count below
+         * would be read without ever having been set. */
+    n = sarrayGetCount(sa);
+    if ((pa_data = ptraCreate(n)) == NULL)
+        return ERROR_INT("pa_data not made", procName, 1);
+    for (i = 0; i < n; i++) {
+        if (i && (i % 10 == 0)) fprintf(stderr, ".. %d ", i);
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+
+            /* Generate the pdf data; files that fail are skipped */
+        if (convertUnscaledToPdfData(fname, title, &imdata, &imbytes))
+            continue;
+
+            /* ... and add it to the array of single page data */
+        ba = l_byteaInitFromMem(imdata, imbytes);
+        if (imdata) LEPT_FREE(imdata);
+        ptraAdd(pa_data, ba);
+    }
+    ptraGetActualCount(pa_data, &npages);
+    if (npages == 0) {
+        L_ERROR("no pdf files made\n", procName);
+        ptraDestroy(&pa_data, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Concatenate to generate a multipage pdf */
+    fprintf(stderr, "\nconcatenating ... ");
+    ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
+    fprintf(stderr, "done\n");
+
+        /* Clean up: free each page, then the (now empty) container */
+    ptraGetActualCount(pa_data, &npages);  /* maybe failed to read some files */
+    for (i = 0; i < npages; i++) {
+        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
+        l_byteaDestroy(&ba);
+    }
+    ptraDestroy(&pa_data, FALSE, FALSE);
+    return ret;
+}
+
+
+/*!
+ * convertUnscaledToPdfData()
+ *
+ * Input: fname (of image file)
+ * title (<optional> pdf title; can be NULL)
+ * &data (<return> output pdf data for image)
+ * &nbytes (<return> size of output pdf data)
+ * Return: 0 if OK, 1 on error
+ */
+l_int32
+convertUnscaledToPdfData(const char  *fname,
+                         const char  *title,
+                         l_uint8    **pdata,
+                         size_t      *pnbytes)
+{
+const char   *pdftitle = NULL;
+char         *tail = NULL;
+l_int32       format, ret;
+L_COMP_DATA  *cid;
+
+    PROCNAME("convertUnscaledToPdfData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!fname)
+        return ERROR_INT("fname not defined", procName, 1);
+
+        /* Skip files that are not images we can wrap */
+    findFileFormat(fname, &format);
+    if (format == IFF_UNKNOWN) {
+        L_WARNING("file %s format is unknown; skip\n", procName, fname);
+        return 1;
+    }
+    if (format == IFF_PS || format == IFF_LPDF) {
+        L_WARNING("file %s format is %d; skip\n", procName, fname, format);
+        return 1;
+    }
+
+        /* Generate the image data required for pdf generation, always
+         * in binary (not ascii85) coding; jpeg files are never transcoded. */
+    l_generateCIDataForPdf(fname, NULL, 0, &cid);
+    if (!cid) {
+        L_ERROR("file %s format is %d; unreadable\n", procName, fname, format);
+        return 1;
+    }
+
+        /* If @title == NULL, use the tail of @fname. */
+    if (title) {
+        pdftitle = title;
+    } else {
+        splitPathAtDirectory(fname, NULL, &tail);
+        pdftitle = tail;
+    }
+
+        /* Generate the pdf string for this page (image).  This destroys
+         * the cid by attaching it to an lpd and destroying the lpd.
+         * Report a failure here to the caller instead of claiming
+         * success with no pdf data. */
+    ret = cidConvertToPdfData(cid, pdftitle, pdata, pnbytes);
+    LEPT_FREE(tail);
+    if (ret)
+        return ERROR_INT("pdf data not made", procName, 1);
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ * Convert multiple images to pdf (one image per page) *
+ *---------------------------------------------------------------------*/
+/*!
+ * pixaConvertToPdf()
+ *
+ * Input: pixa (containing images all at the same resolution)
+ * res (override the resolution of each input image, in ppi;
+ * use 0 to respect the resolution embedded in the input)
+ * scalefactor (scaling factor applied to each image; > 0.0)
+ * type (encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ * L_FLATE_ENCODE, or 0 for default)
+ * quality (used for JPEG only; 0 for default (75))
+ * title (<optional> pdf title)
+ * fileout (pdf file of all images)
+ * Return: 0 if OK, 1 on error
+ *
+ * Notes:
+ * (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
+ * colormap and many colors, or 32 bpp; FLATE for anything else.
+ * (2) The scalefactor must be > 0.0; otherwise it is set to 1.0.
+ * (3) Specifying one of the three encoding types for @type forces
+ * all images to be compressed with that type. Use 0 to have
+ * the type determined for each image based on depth and whether
+ * or not it has a colormap.
+ */
+l_int32
+pixaConvertToPdf(PIXA        *pixa,
+                 l_int32      res,
+                 l_float32    scalefactor,
+                 l_int32      type,
+                 l_int32      quality,
+                 const char  *title,
+                 const char  *fileout)
+{
+l_uint8  *data = NULL;
+l_int32   ret;
+size_t    nbytes = 0;
+
+    PROCNAME("pixaConvertToPdf");
+
+    if (!pixa)
+        return ERROR_INT("pixa not defined", procName, 1);
+        /* Reject a missing output path before doing the expensive
+         * per-page encoding. */
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+        /* Build the multi-page pdf in memory */
+    ret = pixaConvertToPdfData(pixa, res, scalefactor, type, quality,
+                               title, &data, &nbytes);
+    if (ret) {
+        LEPT_FREE(data);
+        return ERROR_INT("conversion to pdf failed", procName, 1);
+    }
+
+        /* ... and write it out; the buffer is ours to free either way */
+    ret = l_binaryWrite(fileout, "w", data, nbytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", procName);
+    return ret;
+}
+
+
+/*!
+ * pixaConvertToPdfData()
+ *
+ * Input: pixa (containing images all at the same resolution)
+ * res (input resolution of all images)
+ * scalefactor (scaling factor applied to each image; > 0.0)
+ * type (encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
+ * L_FLATE_ENCODE, or 0 for default)
+ * quality (used for JPEG only; 0 for default (75))
+ * title (<optional> pdf title)
+ * &data (<return> output pdf data (of all images)
+ * &nbytes (<return> size of output pdf data)
+ * Return: 0 if OK, 1 on error
+ *
+ * Notes:
+ * (1) See pixaConvertToPdf().
+ */
+l_int32
+pixaConvertToPdfData(PIXA        *pixa,
+                     l_int32      res,
+                     l_float32    scalefactor,
+                     l_int32      type,
+                     l_int32      quality,
+                     const char  *title,
+                     l_uint8    **pdata,
+                     size_t      *pnbytes)
+{
+l_uint8  *pagedata;
+l_int32   index, count, status, outres, encoding;
+size_t    pagebytes;
+L_BYTEA  *page;
+PIX      *pixorig, *pixscaled;
+L_PTRA   *pages;
+
+    PROCNAME("pixaConvertToPdfData");
+
+        /* Clear the outputs as soon as each is known to be usable */
+    if (pdata == NULL)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (pnbytes == NULL)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (pixa == NULL)
+        return ERROR_INT("pixa not defined", procName, 1);
+
+        /* Fall back to defaults for unusable parameters */
+    if (scalefactor <= 0.0)
+        scalefactor = 1.0;
+    if (type < 0 || type > L_FLATE_ENCODE) {
+        L_WARNING("invalid compression type; using per-page default\n",
+                  procName);
+        type = 0;
+    }
+
+        /* Encode each image as a single-page pdf string */
+    count = pixaGetCount(pixa);
+    pages = ptraCreate(count);
+    for (index = 0; index < count; index++) {
+        pixorig = pixaGetPix(pixa, index, L_CLONE);
+        if (pixorig == NULL) {
+            L_ERROR("pix[%d] not retrieved\n", procName, index);
+            continue;
+        }
+        pixscaled = (scalefactor != 1.0)
+                        ? pixScale(pixorig, scalefactor, scalefactor)
+                        : pixClone(pixorig);
+        pixDestroy(&pixorig);
+        outres = (l_int32)(res * scalefactor);
+
+            /* A nonzero @type is forced on every page; otherwise
+             * the encoding is chosen for each image */
+        encoding = type;
+        if (encoding == 0 &&
+            selectDefaultPdfEncoding(pixscaled, &encoding) != 0) {
+            L_ERROR("encoding type selection failed for pix[%d]\n",
+                    procName, index);
+            pixDestroy(&pixscaled);
+            continue;
+        }
+        status = pixConvertToPdfData(pixscaled, encoding, quality,
+                                     &pagedata, &pagebytes,
+                                     0, 0, outres, title, NULL, 0);
+        pixDestroy(&pixscaled);
+        if (status != 0) {
+            L_ERROR("pdf encoding failed for pix[%d]\n", procName, index);
+            continue;
+        }
+
+            /* Copy the page into a bytea and store it in the ptra */
+        page = l_byteaInitFromMem(pagedata, pagebytes);
+        if (pagedata) LEPT_FREE(pagedata);
+        ptraAdd(pages, page);
+    }
+
+        /* Nothing to concatenate if every page failed */
+    ptraGetActualCount(pages, &count);
+    if (count == 0) {
+        L_ERROR("no pdf files made\n", procName);
+        ptraDestroy(&pages, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Join the single-page strings into one multi-page pdf */
+    status = ptraConcatenatePdfToData(pages, NULL, pdata, pnbytes);
+
+        /* Free each page, then the (now empty) container */
+    ptraGetActualCount(pages, &count);  /* recalculate in case it changes */
+    for (index = 0; index < count; index++) {
+        page = (L_BYTEA *)ptraRemove(pages, index, L_NO_COMPACTION);
+        l_byteaDestroy(&page);
+    }
+    ptraDestroy(&pages, FALSE, FALSE);
+    return status;
+}
+
+
+/*---------------------------------------------------------------------*
+ * Single page, multi-image converters *
+ *---------------------------------------------------------------------*/
+/*!
+ * convertToPdf()
+ *
+ * Input: filein (input image file -- any format)
+ * type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ * quality (used for JPEG only; 0 for default (75))
+ * fileout (output pdf file; only required on last image on page)
+ * x, y (location of lower-left corner of image, in pixels,
+ * relative to the PostScript origin (0,0) at
+ * the lower-left corner of the page)
+ * res (override the resolution of the input image, in ppi;
+ * use 0 to respect the resolution embedded in the input)
+ * title (<optional> pdf title; if null, taken from filein)
+ * &lpd (ptr to lpd, which is created on the first invocation
+ * and returned until last image is processed, at which
+ * time it is destroyed)
+ * position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ * L_LAST_IMAGE)
+ * Return: 0 if OK, 1 on error
+ *
+ * Notes:
+ * (1) To wrap only one image in pdf, input @plpd = NULL, and
+ * the value of @position will be ignored:
+ * convertToPdf(... type, quality, x, y, res, NULL, 0);
+ * (2) To wrap multiple images on a single pdf page, this is called
+ * once for each successive image. Do it this way:
+ * L_PDF_DATA *lpd;
+ * convertToPdf(... type, quality, x, y, res, &lpd, L_FIRST_IMAGE);
+ * convertToPdf(... type, quality, x, y, res, &lpd, L_NEXT_IMAGE);
+ * ...
+ * convertToPdf(... type, quality, x, y, res, &lpd, L_LAST_IMAGE);
+ * This will write the result to the value of @fileout specified
+ * in the first call; succeeding values of @fileout are ignored.
+ * On the last call: the pdf data bytes are computed and written
+ * to @fileout, lpd is destroyed internally, and the returned
+ * value of lpd is null. So the client has nothing to clean up.
+ * (3) (a) Set @res == 0 to respect the resolution embedded in the
+ * image file. If no resolution is embedded, it will be set
+ * to the default value.
+ * (b) Set @res to some other value to override the file resolution.
+ * (4) (a) If the input @res and the resolution of the output device
+ * are equal, the image will be "displayed" at the same size
+ * as the original.
+ * (b) If the input @res is 72, the output device will render
+ * the image at 1 pt/pixel.
+ * (c) Some possible choices for the default input pix resolution are:
+ * 72 ppi Render pix on any output device at one pt/pixel
+ * 96 ppi Windows default for generated display images
+ * 300 ppi Typical default for scanned images.
+ * We choose 300, which is sensible for rendering page images.
+ * However, images come from a variety of sources, and
+ * some are explicitly created for viewing on a display.
+ */
+l_int32
+convertToPdf(const char   *filein,
+             l_int32       type,
+             l_int32       quality,
+             const char   *fileout,
+             l_int32       x,
+             l_int32       y,
+             l_int32       res,
+             const char   *title,
+             L_PDF_DATA  **plpd,
+             l_int32       position)
+{
+l_uint8  *pdfdata;
+size_t    pdfsize;
+l_int32   writefile, failed;
+
+    PROCNAME("convertToPdf");
+
+        /* The pdf is written on this call when there is no lpd to
+         * carry state between calls, or when this is the last image */
+    writefile = (!plpd || (position == L_LAST_IMAGE));
+
+    if (filein == NULL)
+        return ERROR_INT("filein not defined", procName, 1);
+    if (writefile && fileout == NULL)
+        return ERROR_INT("fileout not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+
+    failed = convertToPdfData(filein, type, quality, &pdfdata, &pdfsize,
+                              x, y, res, title, plpd, position);
+    if (failed)
+        return ERROR_INT("pdf data not made", procName, 1);
+
+        /* Intermediate images on a page write nothing */
+    if (!writefile)
+        return 0;
+
+    failed = l_binaryWrite(fileout, "w", pdfdata, pdfsize);
+    LEPT_FREE(pdfdata);
+    if (failed)
+        return ERROR_INT("pdf data not written to file", procName, 1);
+    return 0;
+}
+
+
+/*!
+ *  convertImageDataToPdf()
+ *
+ *      Input:  imdata (formatted image data in memory; e.g., png, jpeg)
+ *              size (number of bytes in imdata)
+ *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              fileout (output pdf file; only required on last image on page)
+ *              x, y (location of lower-left corner of image, in pixels,
+ *                    relative to the PostScript origin (0,0) at
+ *                    the lower-left corner of the page)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title)
+ *              &lpd (ptr to lpd, which is created on the first invocation
+ *                    and returned until last image is processed, at which
+ *                    time it is destroyed)
+ *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                       L_LAST_IMAGE)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If @res == 0 and the input resolution field is 0,
+ *          this will use DEFAULT_INPUT_RES.
+ *      (2) The image data is decoded into a pix, which is then handed
+ *          to pixConvertToPdf(); the return value of that call is
+ *          passed back unchanged.
+ *      (3) See comments in convertToPdf().
+ */
+l_int32
+convertImageDataToPdf(l_uint8      *imdata,
+                      size_t        size,
+                      l_int32       type,
+                      l_int32       quality,
+                      const char   *fileout,
+                      l_int32       x,
+                      l_int32       y,
+                      l_int32       res,
+                      const char   *title,
+                      L_PDF_DATA  **plpd,
+                      l_int32       position)
+{
+l_int32  status, needfile;
+PIX     *pixin;
+
+    PROCNAME("convertImageDataToPdf");
+
+    if (!imdata)
+        return ERROR_INT("image data not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+
+        /* An output file is needed for a single image or the last one */
+    needfile = (!plpd || position == L_LAST_IMAGE);
+    if (needfile && !fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    pixin = pixReadMem(imdata, size);
+    if (!pixin)
+        return ERROR_INT("pix not read", procName, 1);
+
+    status = pixConvertToPdf(pixin, type, quality, fileout, x, y, res,
+                             title, plpd, position);
+    pixDestroy(&pixin);
+    return status;
+}
+
+
+/*!
+ *  convertToPdfData()
+ *
+ *      Input:  filein (input image file -- any format)
+ *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              &data (<return> pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *              x, y (location of lower-left corner of image, in pixels,
+ *                    relative to the PostScript origin (0,0) at
+ *                    the lower-left corner of the page)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title; if null, use filein)
+ *              &lpd (ptr to lpd, which is created on the first invocation
+ *                    and returned until last image is processed, at which
+ *                    time it is destroyed)
+ *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                       L_LAST_IMAGE)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If @res == 0 and the input resolution field is 0,
+ *          this will use DEFAULT_INPUT_RES.
+ *      (2) &data and &nbytes are cleared on entry, before any other
+ *          argument is validated.
+ *      (3) A failure in pixConvertToPdfData() is reported to the caller
+ *          as an error return; it is not silently treated as success.
+ *      (4) See comments in convertToPdf().
+ */
+l_int32
+convertToPdfData(const char   *filein,
+                 l_int32       type,
+                 l_int32       quality,
+                 l_uint8     **pdata,
+                 size_t       *pnbytes,
+                 l_int32       x,
+                 l_int32       y,
+                 l_int32       res,
+                 const char   *title,
+                 L_PDF_DATA  **plpd,
+                 l_int32       position)
+{
+l_int32  ret;
+PIX     *pix;
+
+    PROCNAME("convertToPdfData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!filein)
+        return ERROR_INT("filein not defined", procName, 1);
+    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
+        type != L_FLATE_ENCODE)
+        return ERROR_INT("invalid conversion type", procName, 1);
+
+    if ((pix = pixRead(filein)) == NULL)
+        return ERROR_INT("pix not made", procName, 1);
+
+        /* Fall back to the input filename when no title is given.
+         * The result must be propagated: callers such as convertToPdf()
+         * rely on it to decide whether there is data to write. */
+    ret = pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
+                              x, y, res, (title) ? title : filein,
+                              plpd, position);
+    pixDestroy(&pix);
+    if (ret)
+        return ERROR_INT("pdf data not made", procName, 1);
+    return 0;
+}
+
+
+/*!
+ *  convertImageDataToPdfData()
+ *
+ *      Input:  imdata (formatted image data in memory; e.g., png, jpeg)
+ *              size (number of bytes in imdata)
+ *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              &data (<return> pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *              x, y (location of lower-left corner of image, in pixels,
+ *                    relative to the PostScript origin (0,0) at
+ *                    the lower-left corner of the page)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title)
+ *              &lpd (ptr to lpd, which is created on the first invocation
+ *                    and returned until last image is processed, at which
+ *                    time it is destroyed)
+ *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                       L_LAST_IMAGE)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If @res == 0 and the input resolution field is 0,
+ *          this will use DEFAULT_INPUT_RES.
+ *      (2) &data and &nbytes are cleared on entry.  When &lpd is given
+ *          and @position is L_FIRST_IMAGE, *plpd is also reset to null
+ *          before the image is decoded.
+ *      (3) See comments in convertToPdf().
+ */
+l_int32
+convertImageDataToPdfData(l_uint8      *imdata,
+                          size_t        size,
+                          l_int32       type,
+                          l_int32       quality,
+                          l_uint8     **pdata,
+                          size_t       *pnbytes,
+                          l_int32       x,
+                          l_int32       y,
+                          l_int32       res,
+                          const char   *title,
+                          L_PDF_DATA  **plpd,
+                          l_int32       position)
+{
+l_int32  status;
+PIX     *pixin;
+
+    PROCNAME("convertImageDataToPdfData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!imdata)
+        return ERROR_INT("image data not defined", procName, 1);
+
+        /* First image of a multi-image invocation: start with no lpd */
+    if (plpd && position == L_FIRST_IMAGE)
+        *plpd = NULL;
+
+    pixin = pixReadMem(imdata, size);
+    if (!pixin)
+        return ERROR_INT("pix not read", procName, 1);
+
+    status = pixConvertToPdfData(pixin, type, quality, pdata, pnbytes,
+                                 x, y, res, title, plpd, position);
+    pixDestroy(&pixin);
+    return status;
+}
+
+
+/*!
+ *  pixConvertToPdf()
+ *
+ *      Input:  pix
+ *              type (L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              fileout (output pdf file; only required on last image on page)
+ *              x, y (location of lower-left corner of image, in pixels,
+ *                    relative to the PostScript origin (0,0) at
+ *                    the lower-left corner of the page)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title)
+ *              &lpd (ptr to lpd, which is created on the first invocation
+ *                    and returned until last image is processed)
+ *              position (in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
+ *                       L_LAST_IMAGE)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If @res == 0 and the input resolution field is 0,
+ *          this will use DEFAULT_INPUT_RES.
+ *      (2) Data is written to fileout only when no &lpd is supplied
+ *          or when this is the last image to be placed on the page.
+ *      (3) See comments in convertToPdf().
+ */
+l_int32
+pixConvertToPdf(PIX          *pix,
+                l_int32       type,
+                l_int32       quality,
+                const char   *fileout,
+                l_int32       x,
+                l_int32       y,
+                l_int32       res,
+                const char   *title,
+                L_PDF_DATA  **plpd,
+                l_int32       position)
+{
+l_uint8  *pdfdata;
+size_t    pdfsize;
+l_int32   towrite, status;
+
+    PROCNAME("pixConvertToPdf");
+
+    if (!pix)
+        return ERROR_INT("pix not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+
+        /* Decide up front whether this call ends with a file write */
+    towrite = (!plpd || position == L_LAST_IMAGE);
+    if (towrite && !fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    status = pixConvertToPdfData(pix, type, quality, &pdfdata, &pdfsize,
+                                 x, y, res, title, plpd, position);
+    if (status)
+        return ERROR_INT("pdf data not made", procName, 1);
+
+    if (!towrite)
+        return 0;
+
+    status = l_binaryWrite(fileout, "w", pdfdata, pdfsize);
+    LEPT_FREE(pdfdata);
+    if (status)
+        return ERROR_INT("pdf data not written to file", procName, 1);
+    return 0;
+}
+
+
+/*!
+ *  pixWriteStreamPdf()
+ *
+ *      Input:  fp (stream opened for writing)
+ *              pix (all depths, cmap OK)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title; taken from the first image
+ *                     placed on a page; e.g., an input image filename)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This is the simplest interface for writing a single image
+ *          with pdf encoding to a stream.  The pdf is first generated
+ *          in memory by pixWriteMemPdf(), which uses G4 encoding for
+ *          1 bpp, JPEG encoding for 8 bpp (no cmap) and 32 bpp, and
+ *          FLATE encoding for everything else.
+ *      (2) A short write to the stream is reported as an error.
+ */
+l_int32
+pixWriteStreamPdf(FILE        *fp,
+                  PIX         *pix,
+                  l_int32      res,
+                  const char  *title)
+{
+l_uint8  *pdfdata;
+size_t    pdfsize, nwritten;
+l_int32   status;
+
+    PROCNAME("pixWriteStreamPdf");
+
+    if (!fp)
+        return ERROR_INT("stream not opened", procName, 1);
+    if (!pix)
+        return ERROR_INT("pix not defined", procName, 1);
+
+    status = pixWriteMemPdf(&pdfdata, &pdfsize, pix, res, title);
+    if (status)
+        return ERROR_INT("pdf data not made", procName, 1);
+
+        /* The buffer is released before the byte count is checked,
+         * so it is freed on both the success and failure paths. */
+    nwritten = fwrite(pdfdata, 1, pdfsize, fp);
+    LEPT_FREE(pdfdata);
+    if (nwritten != pdfsize)
+        return ERROR_INT("failure writing pdf data to stream", procName, 1);
+    return 0;
+}
+
+
+/*!
+ *  pixWriteMemPdf()
+ *
+ *      Input:  &data (<return> pdf as byte array)
+ *              &nbytes (<return> number of bytes in pdf array)
+ *              pix (all depths, cmap OK)
+ *              res (override the resolution of the input image, in ppi;
+ *                   use 0 to respect the resolution embedded in the input)
+ *              title (<optional> pdf title; taken from the first image
+ *                     placed on a page; e.g., an input image filename)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This is the simplest interface for writing a single image
+ *          with pdf encoding to memory.  The encoding is chosen from
+ *          the pix itself:
+ *            * 1 bpp                        -->  G4
+ *            * colormapped, or 2, 4, 16 bpp -->  FLATE
+ *            * anything else (8 bpp without colormap, 32 bpp) --> JPEG
+ *      (2) JPEG quality is fixed at 75 and the image is placed at (0,0).
+ */
+l_int32
+pixWriteMemPdf(l_uint8    **pdata,
+               size_t      *pnbytes,
+               PIX         *pix,
+               l_int32      res,
+               const char  *title)
+{
+l_int32   depth, type, useflate, status;
+PIXCMAP  *colormap;
+
+    PROCNAME("pixWriteMemPdf");
+
+        /* Clear whichever outputs were supplied before validating */
+    if (pdata)
+        *pdata = NULL;
+    if (pnbytes)
+        *pnbytes = 0;
+    if (!(pdata && pnbytes))
+        return ERROR_INT("&data or &nbytes not defined", procName, 1);
+    if (!pix)
+        return ERROR_INT("pix not defined", procName, 1);
+
+    depth = pixGetDepth(pix);
+    colormap = pixGetColormap(pix);
+    if (depth == 1) {
+        type = L_G4_ENCODE;
+    } else {
+        useflate = (colormap != NULL) || depth == 2 || depth == 4 ||
+                   depth == 16;
+        type = (useflate) ? L_FLATE_ENCODE : L_JPEG_ENCODE;
+    }
+
+    status = pixConvertToPdfData(pix, type, 75, pdata, pnbytes,
+                                 0, 0, res, title, NULL, 0);
+    if (status != 0)
+        return ERROR_INT("pdf data not made", procName, 1);
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ * Segmented multi-page, multi-image converter *
+ *---------------------------------------------------------------------*/
+/*!
+ *  convertSegmentedFilesToPdf()
+ *
+ *      Input:  dirname (directory containing the images)
+ *              substr (<optional> substring filter on filenames; can be NULL)
+ *              res (input resolution of all images)
+ *              type (compression type for non-image regions; the
+ *                    image regions are always compressed with L_JPEG_ENCODE)
+ *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
+ *              baa (<optional> boxaa of image regions)
+ *              quality (used for JPEG only; 0 for default (75))
+ *              scalefactor (scaling factor applied to each image region)
+ *              title (<optional> pdf title; if null, taken from the first
+ *                     image filename)
+ *              fileout (pdf file of all images)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If @substr is not NULL, only image filenames that contain
+ *          the substring can be used.  If @substr == NULL, all files
+ *          in the directory are used.
+ *      (2) The files in the directory, after optional filtering by
+ *          the substring, are lexically sorted in increasing order
+ *          before concatenation.
+ *      (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
+ *          colormap and many colors, or 32 bpp; FLATE for anything else.
+ *      (4) The boxaa, if it exists, contains one boxa of "image regions"
+ *          for each image file.  The boxa must be aligned with the
+ *          sorted set of images.  If the boxaa holds fewer boxa than
+ *          there are pages, it is extended in place with empty boxa.
+ *      (5) The scalefactor is applied to each image region.  It is
+ *          typically < 1.0, to save bytes in the final pdf, because
+ *          the resolution is often not critical in non-text regions.
+ *      (6) If the non-image regions have pixel depth > 1 and the encoding
+ *          type is G4, they are automatically scaled up by 2x and
+ *          thresholded.  Otherwise, no scaling is performed on them.
+ *      (7) Note that this function can be used to generate multipage
+ *          G4 compressed pdf from any input, by using @boxaa == NULL
+ *          and @type == L_G4_ENCODE.
+ *      (8) A page whose encoding fails is reported and skipped; the
+ *          remaining pages are still concatenated.
+ */
+l_int32
+convertSegmentedFilesToPdf(const char  *dirname,
+                           const char  *substr,
+                           l_int32      res,
+                           l_int32      type,
+                           l_int32      thresh,
+                           BOXAA       *baa,
+                           l_int32      quality,
+                           l_float32    scalefactor,
+                           const char  *title,
+                           const char  *fileout)
+{
+char     *path;
+l_uint8  *pagedata, *data;
+l_int32   i, npages, nboxa, ret;
+size_t    pagebytes, databytes;
+BOXA     *boxa;
+L_BYTEA  *ba;
+L_PTRA   *pa_data;
+SARRAY   *sa;
+
+    PROCNAME("convertSegmentedFilesToPdf");
+
+    if (!dirname)
+        return ERROR_INT("dirname not defined", procName, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    sa = getNumberedPathnamesInDirectory(dirname, substr, 0, 0, 10000);
+    if (!sa)
+        return ERROR_INT("sa not made", procName, 1);
+    npages = sarrayGetCount(sa);
+
+        /* The boxaa is page-aligned with the image files; pad it
+         * with empty boxa if it is shorter than the set of images. */
+    if (baa) {
+        nboxa = boxaaGetCount(baa);
+        if (nboxa < npages) {
+            boxa = boxaCreate(1);
+            boxaaExtendWithInit(baa, npages, boxa);
+            boxaDestroy(&boxa);
+        }
+    }
+
+        /* Encode each page to pdf and store the bytes in the ptra */
+    pa_data = ptraCreate(npages);
+    for (i = 0; i < npages; i++) {
+        path = sarrayGetString(sa, i, L_NOCOPY);
+        if (strcmp(path, "") == 0)  /* no file for this page number */
+            continue;
+
+            /* An empty boxa is equivalent to having no image regions */
+        boxa = NULL;
+        if (baa) {
+            boxa = boxaaGetBoxa(baa, i, L_CLONE);
+            if (boxaGetCount(boxa) == 0)
+                boxaDestroy(&boxa);
+        }
+        ret = convertToPdfDataSegmented(path, res, type, thresh, boxa,
+                                        quality, scalefactor, title,
+                                        &pagedata, &pagebytes);
+        boxaDestroy(&boxa);  /* no-op if it was already destroyed */
+        if (ret) {
+            L_ERROR("pdf encoding failed for %s\n", procName, path);
+            continue;
+        }
+
+        ba = l_byteaInitFromMem(pagedata, pagebytes);
+        if (pagedata) LEPT_FREE(pagedata);
+        ptraAdd(pa_data, ba);
+    }
+    sarrayDestroy(&sa);
+
+        /* From here on, npages counts the pages actually stored */
+    ptraGetActualCount(pa_data, &npages);
+    if (npages == 0) {
+        L_ERROR("no pdf files made\n", procName);
+        ptraDestroy(&pa_data, FALSE, FALSE);
+        return 1;
+    }
+
+        /* Concatenate */
+    ret = ptraConcatenatePdfToData(pa_data, NULL, &data, &databytes);
+
+        /* Release the per-page byte arrays; the count is taken again
+         * in case the concatenation changed it. */
+    ptraGetActualCount(pa_data, &npages);
+    for (i = 0; i < npages; i++) {
+        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
+        l_byteaDestroy(&ba);
+    }
+    ptraDestroy(&pa_data, FALSE, FALSE);
+
+    if (ret) {
+        if (data) LEPT_FREE(data);
+        return ERROR_INT("pdf data not made", procName, 1);
+    }
+
+    ret = l_binaryWrite(fileout, "w", data, databytes);
+    LEPT_FREE(data);
+    if (ret)
+        L_ERROR("pdf data not written to file\n", procName);
+    return ret;
+}
+
+
+/*!
+ *  convertNumberedMasksToBoxaa()
+ *
+ *      Input:  dirname (directory containing mask images)
+ *              substr (<optional> substring filter on filenames; can be NULL)
+ *              numpre (number of characters in name before number)
+ *              numpost (number of characters in name after number, up
+ *                       to a dot before an extension)
+ *      Return: boxaa of mask regions, or null on error
+ *
+ *  Notes:
+ *      (1) This is conveniently used to generate the input boxaa
+ *          for convertSegmentedFilesToPdf().  It guarantees that the
+ *          boxa will be aligned with the page images, even if some
+ *          of the boxa are empty.
+ *      (2) The boxaa is first filled with empty boxa, one per page.
+ *          For each page that has a readable mask image, the empty
+ *          boxa is replaced by the bounding boxes of the 8-connected
+ *          components of that mask.
+ */
+BOXAA *
+convertNumberedMasksToBoxaa(const char  *dirname,
+                            const char  *substr,
+                            l_int32      numpre,
+                            l_int32      numpost)
+{
+char    *path;
+l_int32  i, npages;
+BOXA    *boxa;
+BOXAA   *baa;
+PIX     *pixmask;
+SARRAY  *sa;
+
+    PROCNAME("convertNumberedMasksToBoxaa");
+
+    if (!dirname)
+        return (BOXAA *)ERROR_PTR("dirname not defined", procName, NULL);
+
+    sa = getNumberedPathnamesInDirectory(dirname, substr, numpre,
+                                         numpost, 10000);
+    if (!sa)
+        return (BOXAA *)ERROR_PTR("sa not made", procName, NULL);
+
+        /* Start with one empty boxa for every page */
+    npages = sarrayGetCount(sa);
+    baa = boxaaCreate(npages);
+    boxa = boxaCreate(1);
+    boxaaInitFull(baa, boxa);
+    boxaDestroy(&boxa);
+
+        /* Replace the placeholder wherever a mask image exists */
+    for (i = 0; i < npages; i++) {
+        path = sarrayGetString(sa, i, L_NOCOPY);
+        if (strcmp(path, "") == 0)  /* no mask for this page number */
+            continue;
+        pixmask = pixRead(path);
+        if (!pixmask) {
+            L_WARNING("invalid image on page %d\n", procName, i);
+            continue;
+        }
+        boxa = pixConnComp(pixmask, NULL, 8);
+        boxaaReplaceBoxa(baa, i, boxa);
+        pixDestroy(&pixmask);
+    }
+
+    sarrayDestroy(&sa);
+    return baa;
+}
+
+
+/*---------------------------------------------------------------------*
+ * Segmented single page, multi-image converters *
+ *---------------------------------------------------------------------*/
+/*!
+ *  convertToPdfSegmented()
+ *
+ *      Input:  filein (input image file -- any format)
+ *              res (input image resolution; typ. 300 ppi; use 0 for default)
+ *              type (compression type for non-image regions; the
+ *                    image regions are always compressed with L_JPEG_ENCODE)
+ *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
+ *              boxa (<optional> of image regions; can be null)
+ *              quality (used for jpeg image regions; 0 for default)
+ *              scalefactor (used for jpeg regions; must be <= 1.0)
+ *              title (<optional> pdf title; typically taken from the
+ *                     input file for the pix)
+ *              fileout (output pdf file)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If there are no image regions, set @boxa == NULL;
+ *          @quality and @scalefactor are ignored.
+ *      (2) Typically, @scalefactor is < 1.0, because the image regions
+ *          can be rendered at a lower resolution (for better compression)
+ *          than the text regions.  If @scalefactor == 0, we use 1.0.
+ *          If the input image is 1 bpp and scalefactor < 1.0, we
+ *          use scaleToGray() to downsample the image regions to gray
+ *          before compressing them.
+ *      (3) If the compression type for non-image regions is L_G4_ENCODE
+ *          and bpp > 1, the image is upscaled 2x and thresholded
+ *          to 1 bpp.  That is the only situation where @thresh is used.
+ *      (4) The parameter @quality is only used for image regions.
+ *          If @type == L_JPEG_ENCODE, default jpeg quality (75) is
+ *          used for the non-image regions.
+ *      (5) Processing matrix for non-image regions.
+ *
+ *          Input           G4              JPEG                FLATE
+ *          ----------|---------------------------------------------------
+ *          1 bpp     |  1x, 1 bpp       1x flate, 1 bpp     1x, 1 bpp
+ *                    |
+ *          cmap      |  2x, 1 bpp       1x flate, cmap      1x, cmap
+ *                    |
+ *          2,4 bpp   |  2x, 1 bpp       1x flate            1x, 2,4 bpp
+ *          no cmap   |                  2,4 bpp
+ *                    |
+ *          8,32 bpp  |  2x, 1 bpp       1x (jpeg)           1x, 8,32 bpp
+ *          no cmap   |                  8,32 bpp
+ *
+ *          Summary:
+ *          (a) if G4 is requested, G4 is used, with 2x upscaling
+ *              for all cases except 1 bpp.
+ *          (b) if JPEG is requested, use flate encoding for all cases
+ *              except 8 bpp without cmap and 32 bpp (rgb).
+ *          (c) if FLATE is requested, use flate with no transformation
+ *              of the raster data.
+ *      (6) Calling options/sequence for these functions:
+ *              file  -->  file      (convertToPdfSegmented)
+ *                  pix  -->  file      (pixConvertToPdfSegmented)
+ *                      pix  -->  data      (pixConvertToPdfDataSegmented)
+ *              file  -->  data      (convertToPdfDataSegmented)
+ *                  pix  -->  data      (pixConvertToPdfDataSegmented)
+ *      (7) If @title is null, the input filename is used as the title.
+ */
+l_int32
+convertToPdfSegmented(const char  *filein,
+                      l_int32      res,
+                      l_int32      type,
+                      l_int32      thresh,
+                      BOXA        *boxa,
+                      l_int32      quality,
+                      l_float32    scalefactor,
+                      const char  *title,
+                      const char  *fileout)
+{
+l_int32      status;
+const char  *pdftitle;
+PIX         *pixin;
+
+    PROCNAME("convertToPdfSegmented");
+
+    if (!filein)
+        return ERROR_INT("filein not defined", procName, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+
+        /* Image regions are never upscaled */
+    if (boxa && scalefactor > 1.0) {
+        L_WARNING("setting scalefactor to 1.0\n", procName);
+        scalefactor = 1.0;
+    }
+
+    pixin = pixRead(filein);
+    if (!pixin)
+        return ERROR_INT("pixs not made", procName, 1);
+
+    pdftitle = (title) ? title : filein;
+    status = pixConvertToPdfSegmented(pixin, res, type, thresh, boxa,
+                                      quality, scalefactor, pdftitle,
+                                      fileout);
+    pixDestroy(&pixin);
+    return status;
+}
+
+
+/*!
+ *  pixConvertToPdfSegmented()
+ *
+ *      Input:  pixs (any depth, cmap OK)
+ *              res (input image resolution; typ. 300 ppi; use 0 for default)
+ *              type (compression type for non-image regions; the
+ *                    image regions are always compressed with L_JPEG_ENCODE)
+ *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
+ *              boxa (<optional> of image regions; can be null)
+ *              quality (used for jpeg image regions; 0 for default)
+ *              scalefactor (used for jpeg regions; must be <= 1.0)
+ *              title (<optional> pdf title; typically taken from the
+ *                     input file for the pix)
+ *              fileout (output pdf file)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) The pdf is generated in memory by
+ *          pixConvertToPdfDataSegmented() and then written to @fileout.
+ *      (2) See convertToPdfSegmented() for details.
+ */
+l_int32
+pixConvertToPdfSegmented(PIX         *pixs,
+                         l_int32      res,
+                         l_int32      type,
+                         l_int32      thresh,
+                         BOXA        *boxa,
+                         l_int32      quality,
+                         l_float32    scalefactor,
+                         const char  *title,
+                         const char  *fileout)
+{
+l_uint8  *pdfdata;
+size_t    pdfsize;
+l_int32   status;
+
+    PROCNAME("pixConvertToPdfSegmented");
+
+    if (!pixs)
+        return ERROR_INT("pixs not defined", procName, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+
+        /* Image regions are never upscaled */
+    if (boxa && scalefactor > 1.0) {
+        L_WARNING("setting scalefactor to 1.0\n", procName);
+        scalefactor = 1.0;
+    }
+
+    status = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa,
+                                          quality, scalefactor, title,
+                                          &pdfdata, &pdfsize);
+    if (status)
+        return ERROR_INT("pdf generation failure", procName, 1);
+
+        /* The result of the file write is returned as is */
+    status = l_binaryWrite(fileout, "w", pdfdata, pdfsize);
+    if (pdfdata) LEPT_FREE(pdfdata);
+    return status;
+}
+
+
+/*!
+ *  convertToPdfDataSegmented()
+ *
+ *      Input:  filein (input image file -- any format)
+ *              res (input image resolution; typ. 300 ppi; use 0 for default)
+ *              type (compression type for non-image regions; the
+ *                    image regions are always compressed with L_JPEG_ENCODE)
+ *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
+ *              boxa (<optional> image regions; can be null)
+ *              quality (used for jpeg image regions; 0 for default)
+ *              scalefactor (used for jpeg regions; must be <= 1.0)
+ *              title (<optional> pdf title; if null, uses filein)
+ *              &data (<return> pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) If there are no image regions, set @boxa == NULL;
+ *          @quality and @scalefactor are ignored.
+ *      (2) Typically, @scalefactor is < 1.0.  The image regions can
+ *          be rendered at a lower resolution (for better compression)
+ *          than the text regions.
+ *      (3) &data and &nbytes are cleared on entry.
+ *      (4) See convertToPdfSegmented() for details.
+ */
+l_int32
+convertToPdfDataSegmented(const char  *filein,
+                          l_int32      res,
+                          l_int32      type,
+                          l_int32      thresh,
+                          BOXA        *boxa,
+                          l_int32      quality,
+                          l_float32    scalefactor,
+                          const char  *title,
+                          l_uint8    **pdata,
+                          size_t      *pnbytes)
+{
+l_int32      status;
+const char  *pdftitle;
+PIX         *pixin;
+
+    PROCNAME("convertToPdfDataSegmented");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!filein)
+        return ERROR_INT("filein not defined", procName, 1);
+    if (!(type == L_G4_ENCODE || type == L_JPEG_ENCODE ||
+          type == L_FLATE_ENCODE))
+        return ERROR_INT("invalid conversion type", procName, 1);
+
+        /* Image regions are never upscaled */
+    if (boxa && scalefactor > 1.0) {
+        L_WARNING("setting scalefactor to 1.0\n", procName);
+        scalefactor = 1.0;
+    }
+
+    pixin = pixRead(filein);
+    if (!pixin)
+        return ERROR_INT("pixs not made", procName, 1);
+
+    pdftitle = (title) ? title : filein;
+    status = pixConvertToPdfDataSegmented(pixin, res, type, thresh, boxa,
+                                          quality, scalefactor, pdftitle,
+                                          pdata, pnbytes);
+    pixDestroy(&pixin);
+    return status;
+}
+
+
+/*!
+ *  pixConvertToPdfDataSegmented()
+ *
+ *      Input:  pixs (any depth, cmap OK)
+ *              res (input image resolution; typ. 300 ppi; use 0 for default)
+ *              type (compression type for non-image regions; the
+ *                    image regions are always compressed with L_JPEG_ENCODE)
+ *              thresh (used for converting gray --> 1 bpp with L_G4_ENCODE)
+ *              boxa (<optional> of image regions; can be null)
+ *              quality (used for jpeg image regions; 0 for default)
+ *              scalefactor (used for jpeg regions; must be <= 1.0)
+ *              title (<optional> pdf title; typically taken from the
+ *                     input file for the pix)
+ *              &data (<return> pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) See convertToPdfSegmented() for details.
+ *      (2) &data and &nbytes are cleared on entry.
+ *      (3) With image regions present, @scalefactor outside (0.0, 1.0]
+ *          is replaced by 1.0, with a warning.
+ *      (4) The scale actually used is adjusted so that its product
+ *          with @res is an integer; that integer is the resolution
+ *          given to the jpeg encoded image regions.
+ *      (5) Any failure of pixConvertToPdfData() is returned as an error.
+ *          In the multi-image case, no further images are added to
+ *          the page after the first failure.
+ */
+l_int32
+pixConvertToPdfDataSegmented(PIX         *pixs,
+                             l_int32      res,
+                             l_int32      type,
+                             l_int32      thresh,
+                             BOXA        *boxa,
+                             l_int32      quality,
+                             l_float32    scalefactor,
+                             const char  *title,
+                             l_uint8    **pdata,
+                             size_t      *pnbytes)
+{
+l_int32      i, nbox, seq, bx, by, bw, bh, upscale, scaledres, ret;
+l_float32    scale;
+BOX         *box, *boxc, *box2;
+PIX         *pix, *pixt1, *pixt2, *pixt3, *pixt4, *pixt5, *pixt6;
+PIXCMAP     *cmap;
+L_PDF_DATA  *lpd = NULL;  /* never hand an indeterminate ptr to the callee */
+
+    PROCNAME("pixConvertToPdfDataSegmented");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!pixs)
+        return ERROR_INT("pixs not defined", procName, 1);
+    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
+        type != L_FLATE_ENCODE)
+        return ERROR_INT("invalid conversion type", procName, 1);
+    if (boxa && (scalefactor <= 0.0 || scalefactor > 1.0)) {
+        L_WARNING("setting scalefactor to 1.0\n", procName);
+        scalefactor = 1.0;
+    }
+
+        /* Adjust scalefactor so that the product with res gives an
+         * integer.  That integer is kept and used directly as the
+         * resolution of the scaled regions; recomputing it later as
+         * (l_int32)(scale * res) can truncate to one less. */
+    if (res <= 0)
+        res = DEFAULT_INPUT_RES;
+    scaledres = (l_int32)(scalefactor * res + 0.5);
+    scale = (l_float32)scaledres / (l_float32)res;
+    cmap = pixGetColormap(pixs);
+
+        /* Simple case: single image to be encoded */
+    if (!boxa || boxaGetCount(boxa) == 0) {
+        if (pixGetDepth(pixs) > 1 && type == L_G4_ENCODE) {
+                /* G4 needs 1 bpp: go to 8 bpp gray, then upscale 2x
+                 * and threshold; the resolution doubles accordingly. */
+            if (cmap)
+                pixt1 = pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE);
+            else
+                pixt1 = pixConvertTo8(pixs, FALSE);
+            pixt2 = pixScaleGray2xLIThresh(pixt1, thresh);
+            ret = pixConvertToPdfData(pixt2, type, quality, pdata, pnbytes,
+                                      0, 0, 2 * res, title, NULL, 0);
+            pixDestroy(&pixt1);
+            pixDestroy(&pixt2);
+        } else {
+            ret = pixConvertToPdfData(pixs, type, quality, pdata, pnbytes,
+                                      0, 0, res, title, NULL, 0);
+        }
+        if (ret)
+            return ERROR_INT("pdf data not made", procName, 1);
+        return 0;
+    }
+
+        /* Multiple images to be encoded.  If @type == L_G4_ENCODE,
+         * jpeg encode a version of pixs that is blanked in the non-image
+         * regions, and paint the scaled non-image part onto it through
+         * a mask.  Otherwise, we must put the non-image part down first
+         * and then render all the image regions separately on top of it,
+         * at their own resolution. */
+    pixt1 = pixSetBlackOrWhiteBoxa(pixs, boxa, L_SET_WHITE);  /* non-image */
+    nbox = boxaGetCount(boxa);
+    if (type == L_G4_ENCODE) {
+        pixt2 = pixCreateTemplate(pixs);  /* only image regions */
+        pixSetBlackOrWhite(pixt2, L_SET_WHITE);
+        for (i = 0; i < nbox; i++) {
+            box = boxaGetBox(boxa, i, L_CLONE);
+            pix = pixClipRectangle(pixs, box, &boxc);
+            boxGetGeometry(boxc, &bx, &by, &bw, &bh);
+            pixRasterop(pixt2, bx, by, bw, bh, PIX_SRC, pix, 0, 0);
+            pixDestroy(&pix);
+            boxDestroy(&box);
+            boxDestroy(&boxc);
+        }
+        pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
+        if (pixGetDepth(pixt3) == 1)
+            pixt4 = pixScaleToGray(pixt3, scale);
+        else
+            pixt4 = pixScale(pixt3, scale, scale);
+        ret = pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata,
+                                  pnbytes, 0, 0, scaledres, title,
+                                  &lpd, L_FIRST_IMAGE);
+
+            /* Only add the G4 layer if the jpeg layer was placed;
+             * after a failure the state behind lpd cannot be trusted. */
+        if (!ret) {
+            if (pixGetDepth(pixt1) == 1) {
+                pixt5 = pixClone(pixt1);
+                upscale = 1;
+            } else {
+                pixt6 = pixConvertTo8(pixt1, 0);
+                pixt5 = pixScaleGray2xLIThresh(pixt6, thresh);
+                pixDestroy(&pixt6);
+                upscale = 2;
+            }
+            ret = pixConvertToPdfData(pixt5, L_G4_ENCODE, quality, pdata,
+                                      pnbytes, 0, 0, upscale * res, title,
+                                      &lpd, L_LAST_IMAGE);
+            pixDestroy(&pixt5);
+        }
+        pixDestroy(&pixt2);
+        pixDestroy(&pixt3);
+        pixDestroy(&pixt4);
+    } else {
+            /* Put the non-image part down first.  This is the full
+               size of the page, so we can use it to find the page
+               height in pixels, which is required for determining
+               the LL corner of the image relative to the LL corner
+               of the page. */
+        ret = pixConvertToPdfData(pixt1, type, quality, pdata, pnbytes,
+                                  0, 0, res, title, &lpd, L_FIRST_IMAGE);
+
+            /* Stop at the first failure; see the G4 branch above */
+        for (i = 0; i < nbox && !ret; i++) {
+            box = boxaGetBox(boxa, i, L_CLONE);
+            pixt2 = pixClipRectangle(pixs, box, &boxc);
+            pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
+            if (pixGetDepth(pixt3) == 1)
+                pixt4 = pixScaleToGray(pixt3, scale);
+            else
+                pixt4 = pixScale(pixt3, scale, scale);
+            box2 = boxTransform(boxc, 0, 0, scale, scale);
+            boxGetGeometry(box2, &bx, &by, NULL, &bh);
+            seq = (i == nbox - 1) ? L_LAST_IMAGE : L_NEXT_IMAGE;
+            ret = pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata,
+                                      pnbytes, bx, by, scaledres, title,
+                                      &lpd, seq);
+            pixDestroy(&pixt2);
+            pixDestroy(&pixt3);
+            pixDestroy(&pixt4);
+            boxDestroy(&box);
+            boxDestroy(&boxc);
+            boxDestroy(&box2);
+        }
+    }
+
+    pixDestroy(&pixt1);
+    if (ret)
+        return ERROR_INT("pdf data not made", procName, 1);
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ * Multi-page concatenation *
+ *---------------------------------------------------------------------*/
+/*!
+ *  concatenatePdf()
+ *
+ *      Input:  dirname (directory containing single-page pdf files)
+ *              substr (<optional> substring filter on filenames; can be NULL)
+ *              fileout (concatenated pdf file)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) If @substr is not NULL, only filenames that contain
+ *          the substring can be returned.  If @substr == NULL,
+ *          none of the filenames are filtered out.
+ *      (3) The files in the directory, after optional filtering by
+ *          the substring, are lexically sorted in increasing order
+ *          before concatenation.
+ *      (4) The sorted pathnames are handed to saConcatenatePdf(),
+ *          whose return value is passed back.
+ */
+l_int32
+concatenatePdf(const char  *dirname,
+               const char  *substr,
+               const char  *fileout)
+{
+l_int32  status;
+SARRAY  *sapaths;
+
+    PROCNAME("concatenatePdf");
+
+    if (!dirname)
+        return ERROR_INT("dirname not defined", procName, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    sapaths = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (!sapaths)
+        return ERROR_INT("sa not made", procName, 1);
+
+    status = saConcatenatePdf(sapaths, fileout);
+    sarrayDestroy(&sapaths);
+    return status;
+}
+
+
+/*!
+ *  saConcatenatePdf()
+ *
+ *      Input:  sa (sarray of pathnames for single-page pdf files)
+ *              fileout (concatenated pdf file)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) The concatenated pdf is built in memory by
+ *          saConcatenatePdfToData() and then written to @fileout;
+ *          the result of the write is returned.
+ */
+l_int32
+saConcatenatePdf(SARRAY      *sa,
+                 const char  *fileout)
+{
+l_uint8  *pdfdata;
+size_t    pdfsize;
+l_int32   status;
+
+    PROCNAME("saConcatenatePdf");
+
+    if (!sa)
+        return ERROR_INT("sa not defined", procName, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    if (saConcatenatePdfToData(sa, &pdfdata, &pdfsize) != 0)
+        return ERROR_INT("pdf data not made", procName, 1);
+
+    status = l_binaryWrite(fileout, "w", pdfdata, pdfsize);
+    LEPT_FREE(pdfdata);
+    return status;
+}
+
+
+/*!
+ *  ptraConcatenatePdf()
+ *
+ *      Input:  pa (ptra array of pdf strings, each for a single-page
+ *                  pdf file)
+ *              fileout (concatenated pdf file)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) The concatenated pdf is built in memory by
+ *          ptraConcatenatePdfToData() and then written to @fileout;
+ *          the result of the write is returned.
+ */
+l_int32
+ptraConcatenatePdf(L_PTRA      *pa,
+                   const char  *fileout)
+{
+l_uint8  *pdfdata;
+size_t    pdfsize;
+l_int32   status;
+
+    PROCNAME("ptraConcatenatePdf");
+
+    if (!pa)
+        return ERROR_INT("pa not defined", procName, 1);
+    if (!fileout)
+        return ERROR_INT("fileout not defined", procName, 1);
+
+    if (ptraConcatenatePdfToData(pa, NULL, &pdfdata, &pdfsize) != 0)
+        return ERROR_INT("pdf data not made", procName, 1);
+
+    status = l_binaryWrite(fileout, "w", pdfdata, pdfsize);
+    LEPT_FREE(pdfdata);
+    return status;
+}
+
+
+/*!
+ *  concatenatePdfToData()
+ *
+ *      Input:  dirname (directory containing single-page pdf files)
+ *              substr (<optional> substring filter on filenames; can be NULL)
+ *              &data (<return> concatenated pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) If @substr is not NULL, only filenames that contain
+ *          the substring can be returned.  If @substr == NULL,
+ *          none of the filenames are filtered out.
+ *      (3) The files in the directory, after optional filtering by
+ *          the substring, are lexically sorted in increasing order
+ *          before concatenation.
+ *      (4) &data and &nbytes are cleared on entry.
+ */
+l_int32
+concatenatePdfToData(const char  *dirname,
+                     const char  *substr,
+                     l_uint8    **pdata,
+                     size_t      *pnbytes)
+{
+l_int32  status;
+SARRAY  *sapaths;
+
+    PROCNAME("concatenatePdfToData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!dirname)
+        return ERROR_INT("dirname not defined", procName, 1);
+
+    sapaths = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (!sapaths)
+        return ERROR_INT("sa not made", procName, 1);
+
+    status = saConcatenatePdfToData(sapaths, pdata, pnbytes);
+    sarrayDestroy(&sapaths);
+    return status;
+}
+
+
+/*!
+ *  saConcatenatePdfToData()
+ *
+ *      Input:  sarray (of pathnames for single-page pdf files)
+ *              &data (<return> concatenated pdf data in memory)
+ *              &nbytes (<return> number of bytes in pdf data)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This only works with leptonica-formatted single-page pdf files.
+ *      (2) Each return value is reset (NULL, 0) as soon as its pointer
+ *          has been validated.
+ *      (3) Each pathname is loaded into an L_BYTEA, and the collection
+ *          is passed, along with @sa, to ptraConcatenatePdfToData().
+ *          A pathname for which l_byteaInitFromFile() returns NULL
+ *          contributes no page; the remaining files are still
+ *          concatenated.
+ *      (4) All byte arrays read here are destroyed before returning;
+ *          the caller owns only the returned @data.
+ */
+l_int32
+saConcatenatePdfToData(SARRAY    *sa,
+                       l_uint8  **pdata,
+                       size_t    *pnbytes)
+{
+char     *fname;
+l_int32   i, npages, ret;
+L_BYTEA  *bas;
+L_PTRA   *pa_data;  /* input pdf data for each page */
+
+    PROCNAME("saConcatenatePdfToData");
+
+    if (!pdata)
+        return ERROR_INT("&data not defined", procName, 1);
+    *pdata = NULL;
+    if (!pnbytes)
+        return ERROR_INT("&nbytes not defined", procName, 1);
+    *pnbytes = 0;
+    if (!sa)
+        return ERROR_INT("sa not defined", procName, 1);
+
+        /* Read the pdf files into memory */
+    if ((npages = sarrayGetCount(sa)) == 0)
+        return ERROR_INT("no filenames found", procName, 1);
+    if ((pa_data = ptraCreate(npages)) == NULL)
+        return ERROR_INT("pa_data not made", procName, 1);
+    for (i = 0; i < npages; i++) {
+        fname = sarrayGetString(sa, i, L_NOCOPY);
+
+            /* An unreadable file yields no byte array; skip it here
+             * rather than handing a NULL item to ptraAdd().
+             * NOTE(review): after a skip, later entries in pa_data no
+             * longer sit at the same index as their pathname in sa;
+             * confirm whether ptraConcatenatePdfToData() relies on
+             * that correspondence. */
+        if ((bas = l_byteaInitFromFile(fname)) == NULL)
+            continue;
+
+            /* If the array refuses the item we still own it */
+        if (ptraAdd(pa_data, bas) != 0)
+            l_byteaDestroy(&bas);
+    }
+
+    ret = ptraConcatenatePdfToData(pa_data, sa, pdata, pnbytes);
+
+        /* Cleanup: some pages could have been removed, so ask the
+         * array how many items it still holds and destroy those. */
+    ptraGetActualCount(pa_data, &npages);
+    for (i = 0; i < npages; i++) {
+        bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
+        l_byteaDestroy(&bas);
+    }
+    ptraDestroy(&pa_data, FALSE, FALSE);
+    return ret;
+}
+
+/* --------------------------------------------*/
+#endif /* USE_PDFIO */
+/* --------------------------------------------*/