1 files changed, 1255 insertions, 0 deletions
diff --git a/src/pageseg.c b/src/pageseg.c
new file mode 100644
index 0000000..b608989
--- /dev/null
+++ b/src/pageseg.c
@@ -0,0 +1,1255 @@
+/*====================================================================*
+ -  Copyright (C) 2001 Leptonica.  All rights reserved.
+ -
+ -  Redistribution and use in source and binary forms, with or without
+ -  modification, are permitted provided that the following conditions
+ -  are met:
+ -  1. Redistributions of source code must retain the above copyright
+ -     notice, this list of conditions and the following disclaimer.
+ -  2. Redistributions in binary form must reproduce the above
+ -     copyright notice, this list of conditions and the following
+ -     disclaimer in the documentation and/or other materials
+ -     provided with the distribution.
+ -
+ -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
+ -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *====================================================================*/
+
+/*
+ *   pageseg.c
+ *
+ *      Top level page segmentation
+ *          l_int32   pixGetRegionsBinary()
+ *
+ *      Halftone region extraction
+ *          PIX      *pixGenHalftoneMask()
+ *
+ *      Textline extraction
+ *          PIX      *pixGenTextlineMask()
+ *
+ *      Textblock extraction
+ *          PIX      *pixGenTextblockMask()
+ *
+ *      Location of page foreground
+ *          PIX      *pixFindPageForeground()
+ *
+ *      Extraction of characters from image with only text
+ *          l_int32   pixSplitIntoCharacters()
+ *          BOXA     *pixSplitComponentWithProfile()
+ *
+ *      Extraction of lines of text
+ *          PIXA     *pixExtractTextlines()
+ *
+ *      Decision text vs photo
+ *          l_int32   pixDecideIfText()
+ *          l_int32   pixFindThreshFgExtent()
+ */
+
+#include "allheaders.h"
+
+    /* These functions are not intended to work on very low-res images.
+     * The functions in this file that check their input dimensions
+     * report an error for any image with width < MinWidth or
+     * height < MinHeight (both in pixels). */
+static const l_int32  MinWidth = 100;
+static const l_int32  MinHeight = 100;
+
+    /* Compile-time debug switch, defined only when console I/O is
+     * available (NO_CONSOLE_IO not defined).
+     * NOTE(review): presumably gates debug output in code later in
+     * this file -- confirm against its users. */
+#ifndef  NO_CONSOLE_IO
+#define  DEBUG_LINES     0
+#endif  /* ~NO_CONSOLE_IO */
+
+/*------------------------------------------------------------------*
+ *                     Top level page segmentation                  *
+ *------------------------------------------------------------------*/
+/*!
+ *  pixGetRegionsBinary()
+ *
+ *      Input:  pixs (1 bpp, assumed to be 300 to 400 ppi)
+ *              &pixhm (<optional return> halftone mask)
+ *              &pixtm (<optional return> textline mask)
+ *              &pixtb (<optional return> textblock mask)
+ *              debug (flag: set to 1 for debug output)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) It is best to deskew the image before segmenting.
+ *      (2) The three masks are computed on a 2x reduction of pixs and
+ *          are expanded 2x by replication before being returned.
+ *      (3) Each requested output is first set to NULL.  If any of the
+ *          masks cannot be generated, this returns 1 and every output
+ *          stays NULL; masks are handed to the caller only on success.
+ *          Ownership of each returned mask passes to the caller.
+ *      (4) The debug flag enables a number of outputs.  These
+ *          are included to show how to generate and save/display
+ *          these results.  Some of them are written to /tmp/pageseg.
+ */
+l_int32
+pixGetRegionsBinary(PIX     *pixs,
+                    PIX    **ppixhm,
+                    PIX    **ppixtm,
+                    PIX    **ppixtb,
+                    l_int32  debug)
+{
+l_int32  w, h, ret;
+PIX     *pixr, *pix1, *pix2;
+PIX     *pixtext;  /* text pixels only */
+PIX     *pixhm2;   /* halftone mask; 2x reduction */
+PIX     *pixhm;    /* halftone mask */
+PIX     *pixtm2;   /* textline mask; 2x reduction */
+PIX     *pixtm;    /* textline mask */
+PIX     *pixvws;   /* vertical white space mask */
+PIX     *pixtb2;   /* textblock mask; 2x reduction */
+PIX     *pixtbf2;  /* textblock mask; 2x reduction; small comps filtered */
+PIX     *pixtb;    /* textblock mask */
+
+    PROCNAME("pixGetRegionsBinary");
+
+    if (ppixhm) *ppixhm = NULL;
+    if (ppixtm) *ppixtm = NULL;
+    if (ppixtb) *ppixtb = NULL;
+    if (!pixs || pixGetDepth(pixs) != 1)
+        return ERROR_INT("pixs undefined or not 1 bpp", procName, 1);
+    pixGetDimensions(pixs, &w, &h, NULL);
+    if (w < MinWidth || h < MinHeight) {
+        L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h);
+        return 1;
+    }
+
+        /* Everything released at 'cleanup' starts out NULL, so that
+         * the cleanup code is safe from any exit point. */
+    ret = 1;
+    pixtext = pixvws = NULL;
+    pixhm2 = pixtm2 = pixtb2 = pixtbf2 = NULL;
+    pixhm = pixtm = pixtb = NULL;
+
+        /* 2x reduce, to 150 -200 ppi */
+    if ((pixr = pixReduceRankBinaryCascade(pixs, 1, 0, 0, 0)) == NULL)
+        return ERROR_INT("pixr not made", procName, 1);
+    pixDisplayWrite(pixr, debug);
+
+        /* Get the halftone mask */
+    pixhm2 = pixGenHalftoneMask(pixr, &pixtext, NULL, debug);
+
+        /* Get the textline mask from the text pixels */
+    pixtm2 = pixGenTextlineMask(pixtext, &pixvws, NULL, debug);
+
+        /* Get the textblock mask from the textline mask */
+    pixtb2 = pixGenTextblockMask(pixtm2, pixvws, debug);
+    pixDestroy(&pixr);
+    pixDestroy(&pixtext);
+    pixDestroy(&pixvws);
+    if (!pixhm2 || !pixtm2 || !pixtb2) {
+        L_ERROR("masks at 2x reduction not all made\n", procName);
+        goto cleanup;
+    }
+
+        /* Remove small components from the mask, where a small
+         * component is defined as one with both width and height < 60 */
+    pixtbf2 = pixSelectBySize(pixtb2, 60, 60, 4, L_SELECT_IF_EITHER,
+                              L_SELECT_IF_GTE, NULL);
+    pixDestroy(&pixtb2);
+    if (!pixtbf2) {
+        L_ERROR("filtered textblock mask not made\n", procName);
+        goto cleanup;
+    }
+    pixDisplayWriteFormat(pixtbf2, debug, IFF_PNG);
+
+        /* Expand all masks to full resolution, and do filling or
+         * small dilations for better coverage. */
+    pixhm = pixExpandReplicate(pixhm2, 2);
+    pix1 = pixSeedfillBinary(NULL, pixhm, pixs, 8);
+    pixOr(pixhm, pixhm, pix1);
+    pixDestroy(&pix1);
+    pixDisplayWriteFormat(pixhm, debug, IFF_PNG);
+
+    pix1 = pixExpandReplicate(pixtm2, 2);
+    pixtm = pixDilateBrick(NULL, pix1, 3, 3);
+    pixDestroy(&pix1);
+    pixDisplayWriteFormat(pixtm, debug, IFF_PNG);
+
+    pix1 = pixExpandReplicate(pixtbf2, 2);
+    pixtb = pixDilateBrick(NULL, pix1, 3, 3);
+    pixDestroy(&pix1);
+    pixDisplayWriteFormat(pixtb, debug, IFF_PNG);
+
+    pixDestroy(&pixhm2);
+    pixDestroy(&pixtm2);
+    pixDestroy(&pixtbf2);
+    if (!pixhm || !pixtm || !pixtb) {
+        L_ERROR("full resolution masks not all made\n", procName);
+        goto cleanup;
+    }
+
+        /* Debug: identify objects that are neither text nor halftone image */
+    if (debug) {
+        pix1 = pixSubtract(NULL, pixs, pixtm);  /* remove text pixels */
+        pix2 = pixSubtract(NULL, pix1, pixhm);  /* remove halftone pixels */
+        pixDisplayWriteFormat(pix2, 1, IFF_PNG);
+        pixDestroy(&pix1);
+        pixDestroy(&pix2);
+    }
+
+        /* Debug: display textline components with random colors */
+    if (debug) {
+        l_int32  wtm, htm;
+        BOXA    *boxa;
+        PIXA    *pixa;
+        boxa = pixConnComp(pixtm, &pixa, 8);
+        pixGetDimensions(pixtm, &wtm, &htm, NULL);
+        pix1 = pixaDisplayRandomCmap(pixa, wtm, htm);
+        pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255);
+        pixDisplay(pix1, 100, 100);
+        pixDisplayWriteFormat(pix1, 1, IFF_PNG);
+        pixaDestroy(&pixa);
+        boxaDestroy(&boxa);
+        pixDestroy(&pix1);
+    }
+
+        /* Debug: identify the outlines of each textblock.  The directory
+         * made here is also used for the boxa files written below. */
+    if (debug) {
+        PIXCMAP  *cmap;
+        PTAA     *ptaa;
+        ptaa = pixGetOuterBordersPtaa(pixtb);
+        lept_mkdir("pageseg");
+        ptaaWrite("/tmp/pageseg/tb_outlines.ptaa", ptaa, 1);
+        pix1 = pixRenderRandomCmapPtaa(pixtb, ptaa, 1, 16, 1);
+        cmap = pixGetColormap(pix1);
+        pixcmapResetColor(cmap, 0, 130, 130, 130);
+        pixDisplay(pix1, 500, 100);
+        pixDisplayWriteFormat(pix1, 1, IFF_PNG);
+        pixDestroy(&pix1);
+        ptaaDestroy(&ptaa);
+    }
+
+        /* Debug: get b.b. for all mask components */
+    if (debug) {
+        BOXA  *bahm, *batm, *batb;
+        bahm = pixConnComp(pixhm, NULL, 4);
+        batm = pixConnComp(pixtm, NULL, 4);
+        batb = pixConnComp(pixtb, NULL, 4);
+        boxaWrite("/tmp/pageseg/htmask.boxa", bahm);
+        boxaWrite("/tmp/pageseg/textmask.boxa", batm);
+        boxaWrite("/tmp/pageseg/textblock.boxa", batb);
+        boxaDestroy(&bahm);
+        boxaDestroy(&batm);
+        boxaDestroy(&batb);
+    }
+
+        /* Transfer ownership of the requested masks to the caller.
+         * Anything not requested is still held locally and is
+         * released in the cleanup below. */
+    if (ppixhm) {
+        *ppixhm = pixhm;
+        pixhm = NULL;
+    }
+    if (ppixtm) {
+        *ppixtm = pixtm;
+        pixtm = NULL;
+    }
+    if (ppixtb) {
+        *ppixtb = pixtb;
+        pixtb = NULL;
+    }
+    ret = 0;
+
+cleanup:
+    pixDestroy(&pixhm2);
+    pixDestroy(&pixtm2);
+    pixDestroy(&pixtb2);
+    pixDestroy(&pixtbf2);
+    pixDestroy(&pixhm);
+    pixDestroy(&pixtm);
+    pixDestroy(&pixtb);
+    return ret;
+}
+
+
+/*------------------------------------------------------------------*
+ *                    Halftone region extraction                    *
+ *------------------------------------------------------------------*/
+/*!
+ *  pixGenHalftoneMask()
+ *
+ *      Input:  pixs (1 bpp, assumed to be 150 to 200 ppi)
+ *              &pixtext (<optional return> text part of pixs; i.e., the
+ *                        pixels of pixs not under the halftone mask)
+ *              &htfound (<optional return> 1 if the mask is not empty)
+ *              debug (flag: 1 for debug output)
+ *      Return: pixd (halftone mask), or null on error
+ *
+ *  Notes:
+ *      (1) This is not intended to work on small thumbnails.  The
+ *          dimensions of pixs must be at least MinWidth x MinHeight.
+ *      (2) A seed is made from a rank reduction cascade followed by
+ *          a 5x5 opening; a filling mask is made by a 4x4 closing of pixs.
+ *          The halftone mask is the 4-connected fill of the seed into
+ *          that filling mask.
+ *      (3) If the halftone mask is empty, pixtext is a copy of pixs.
+ */
+PIX *
+pixGenHalftoneMask(PIX      *pixs,
+                   PIX     **ppixtext,
+                   l_int32  *phtfound,
+                   l_int32   debug)
+{
+l_int32  width, height, maskempty;
+PIX     *pixred, *pixopen, *pixseed, *pixfill, *pixd, *pixt;
+
+    PROCNAME("pixGenHalftoneMask");
+
+    if (ppixtext) *ppixtext = NULL;
+    if (phtfound) *phtfound = 0;
+    if (!pixs)
+        return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
+    if (pixGetDepth(pixs) != 1)
+        return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
+    pixGetDimensions(pixs, &width, &height, NULL);
+    if (!(width >= MinWidth && height >= MinHeight)) {
+        L_ERROR("pix too small: w = %d, h = %d\n", procName, width, height);
+        return NULL;
+    }
+
+        /* Seed for the halftone regions: reduce, open to keep only
+         * the solid parts, and expand back to the size of pixs */
+    pixred = pixReduceRankBinaryCascade(pixs, 4, 4, 3, 0);
+    pixopen = pixOpenBrick(NULL, pixred, 5, 5);
+    pixDestroy(&pixred);
+    pixseed = pixExpandReplicate(pixopen, 8);  /* back to 2x reduction */
+    pixDestroy(&pixopen);
+    pixDisplayWriteFormat(pixseed, debug, IFF_PNG);
+
+        /* Filling mask: connected regions of pixs after a small closing */
+    pixfill = pixCloseSafeBrick(NULL, pixs, 4, 4);
+    pixDisplayWriteFormat(pixfill, debug, IFF_PNG);
+
+        /* The halftone mask is the seed filled into the filling mask */
+    pixd = pixSeedfillBinary(NULL, pixseed, pixfill, 4);
+    pixDestroy(&pixseed);
+    pixDestroy(&pixfill);
+
+#if 0
+        /* Moderate opening to remove thin lines, etc. */
+    pixOpenBrick(pixd, pixd, 10, 10);
+    pixDisplayWrite(pixd, debug);
+#endif
+
+        /* Report whether any halftone pixels were found */
+    pixZero(pixd, &maskempty);
+    if (!maskempty && phtfound)
+        *phtfound = 1;
+
+        /* Optionally, return the pixels that are not under the mask */
+    if (ppixtext) {
+        pixt = (maskempty) ? pixCopy(NULL, pixs)
+                           : pixSubtract(NULL, pixs, pixd);
+        *ppixtext = pixt;
+        pixDisplayWriteFormat(pixt, debug, IFF_PNG);
+    }
+
+    return pixd;
+}
+
+
+/*------------------------------------------------------------------*
+ *                         Textline extraction                      *
+ *------------------------------------------------------------------*/
+/*!
+ *  pixGenTextlineMask()
+ *
+ *      Input:  pixs (1 bpp, assumed to be 150 to 200 ppi)
+ *              &pixvws (<return> vertical whitespace mask)
+ *              &tlfound (<optional return> 1 if the mask is not empty)
+ *              debug (flag: 1 for debug output)
+ *      Return: pixd (textline mask), or null on error
+ *
+ *  Notes:
+ *      (1) The input pixs should be deskewed.
+ *      (2) pixs should have no halftone pixels.
+ *      (3) This is not intended to work on small thumbnails.  The
+ *          dimensions of pixs must be at least MinWidth x MinHeight.
+ *      (4) Both the input image and the returned textline mask
+ *          are at the same resolution.
+ *      (5) &pixvws is required; it is an error if it is not supplied.
+ */
+PIX *
+pixGenTextlineMask(PIX      *pixs,
+                   PIX     **ppixvws,
+                   l_int32  *ptlfound,
+                   l_int32   debug)
+{
+l_int32  width, height, maskempty;
+PIX     *pixbg, *pixbig, *pixclosed, *pixd;
+
+    PROCNAME("pixGenTextlineMask");
+
+    if (ptlfound) *ptlfound = 0;
+    if (!ppixvws)
+        return (PIX *)ERROR_PTR("&pixvws not defined", procName, NULL);
+    *ppixvws = NULL;
+    if (!pixs)
+        return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
+    if (pixGetDepth(pixs) != 1)
+        return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
+    pixGetDimensions(pixs, &width, &height, NULL);
+    if (!(width >= MinWidth && height >= MinHeight)) {
+        L_ERROR("pix too small: w = %d, h = %d\n", procName, width, height);
+        return NULL;
+    }
+
+        /* The vertical whitespace mask is built from the background,
+         * so start with the inverted image. */
+    pixbg = pixInvert(NULL, pixs);
+
+        /* A whitespace mask taken directly from the bg would break
+         * textlines that have a large amount of white space above
+         * or below.  To prevent that, find the parts of the bg with
+         * large horizontal extent (bigger than the separation between
+         * columns) and significant vertical extent (bigger than the
+         * separation between textlines), and remove them from the bg. */
+    pixbig = pixMorphCompSequence(pixbg, "o80.60", 0);
+    pixSubtract(pixbg, pixbg, pixbig);
+    pixDestroy(&pixbig);
+    pixDisplayWriteFormat(pixbg, debug, IFF_PNG);
+
+        /* Open the remaining bg to get the vertical whitespace:
+         * o5.1 removes thin vertical bg lines and o1.200 extracts
+         * long vertical bg lines. */
+    *ppixvws = pixMorphCompSequence(pixbg, "o5.1 + o1.200", 0);
+    pixDestroy(&pixbg);
+    pixDisplayWriteFormat(*ppixvws, debug, IFF_PNG);
+
+        /* The textline mask is made in three steps:
+         *   (1) close the characters and words in the textlines
+         *   (2) open the vertical whitespace corridors back up
+         *   (3) small opening to remove noise    */
+    pixclosed = pixCloseSafeBrick(NULL, pixs, 30, 1);
+    pixDisplayWrite(pixclosed, debug);
+    pixd = pixSubtract(NULL, pixclosed, *ppixvws);
+    pixDestroy(&pixclosed);
+    pixOpenBrick(pixd, pixd, 3, 3);
+    pixDisplayWriteFormat(pixd, debug, IFF_PNG);
+
+        /* Optionally report whether any textline pixels were found */
+    if (!ptlfound)
+        return pixd;
+    pixZero(pixd, &maskempty);
+    if (!maskempty)
+        *ptlfound = 1;
+    return pixd;
+}
+
+
+/*------------------------------------------------------------------*
+ *                       Textblock extraction                       *
+ *------------------------------------------------------------------*/
+/*!
+ *  pixGenTextblockMask()
+ *
+ *      Input:  pixs (1 bpp, textline mask, assumed to be 150 to 200 ppi)
+ *              pixvws (vertical white space mask)
+ *              debug (flag: 1 for debug output)
+ *      Return: pixd (textblock mask), or null on error
+ *
+ *  Notes:
+ *      (1) Both the input masks (textline and vertical white space) and
+ *          the returned textblock mask are at the same resolution.
+ *      (2) This is not intended to work on small thumbnails.  The
+ *          dimensions of pixs must be at least MinWidth x MinHeight.
+ *      (3) The result is somewhat noisy, in that small "blocks" of
+ *          text may be included.  These can be removed by post-processing,
+ *          using, e.g.,
+ *             pixSelectBySize(pix, 60, 60, 4, L_SELECT_IF_EITHER,
+ *                             L_SELECT_IF_GTE, NULL);
+ */
+PIX *
+pixGenTextblockMask(PIX     *pixs,
+                    PIX     *pixvws,
+                    l_int32  debug)
+{
+l_int32  width, height;
+PIX     *pixjoin, *pixsolid, *pixcols, *pixd;
+
+    PROCNAME("pixGenTextblockMask");
+
+    if (!pixs)
+        return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
+    if (pixGetDepth(pixs) != 1)
+        return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
+    pixGetDimensions(pixs, &width, &height, NULL);
+    if (!(width >= MinWidth && height >= MinHeight)) {
+        L_ERROR("pix too small: w = %d, h = %d\n", procName, width, height);
+        return NULL;
+    }
+    if (!pixvws)
+        return (PIX *)ERROR_PTR("pixvws not defined", procName, NULL);
+
+        /* Merge the textlines vertically into a first textblock mask */
+    pixjoin = pixMorphSequence(pixs, "c1.10 + o4.1", 0);
+    pixDisplayWriteFormat(pixjoin, debug, IFF_PNG);
+
+        /* Make the mask solid: close and slightly dilate each
+         * connected component separately, then do a small horizontal
+         * closing between the components. */
+    pixsolid = pixMorphSequenceByComponent(pixjoin, "c30.30 + d3.3",
+                                           8, 0, 0, NULL);
+    pixDestroy(&pixjoin);
+    pixCloseSafeBrick(pixsolid, pixsolid, 10, 1);
+    pixDisplayWriteFormat(pixsolid, debug, IFF_PNG);
+
+        /* Open the white space between columns, again */
+    pixcols = pixSubtract(NULL, pixsolid, pixvws);
+    pixDestroy(&pixsolid);
+    pixDisplayWriteFormat(pixcols, debug, IFF_PNG);
+
+        /* Remove small components */
+    pixd = pixSelectBySize(pixcols, 25, 5, 8, L_SELECT_IF_BOTH,
+                           L_SELECT_IF_GTE, NULL);
+    pixDestroy(&pixcols);
+    pixDisplayWriteFormat(pixd, debug, IFF_PNG);
+
+    return pixd;
+}
+
+
+/*------------------------------------------------------------------*
+ *                    Location of page foreground                   *
+ *------------------------------------------------------------------*/
+/*!
+ *  pixFindPageForeground()
+ *
+ *      Input:  pixs (full resolution; any type or depth)
+ *              threshold (for binarization; typically about 128)
+ *              mindist (min distance of text from border to allow
+ *                       cleaning near border; at 2x reduction, this
+ *                       should be larger than 50; typically about 70)
+ *              erasedist (when conditions are satisfied, erase anything
+ *                         within this distance of the edge;
+ *                         typically 30 at 2x reduction)
+ *              pagenum (use for debugging when called repeatedly; labels
+ *                       debug images that are assembled into pdfdir)
+ *              showmorph (set to any nonzero value to have the two
+ *                         morphological sequences used for generating
+ *                         the masks write out their intermediate images;
+ *                         use 0 for no such output; this is typically
+ *                         used for debugging region extraction)
+ *              display (set to 1  to display mask and selected region
+ *                       for debugging a single page)
+ *              pdfdir (subdirectory of /tmp where images showing the
+ *                      result are placed when called repeatedly; use
+ *                      null if no output requested)
+ *      Return: box (region including foreground, with some pixel noise
+ *                   removed), or null if not found
+ *
+ *  Notes:
+ *      (1) This doesn't simply crop to the fg.  It attempts to remove
+ *          pixel noise and junk at the edge of the image before cropping.
+ *          The input @threshold is used if pixs is not 1 bpp.
+ *      (2) There are several debugging options, determined by the
+ *          last 4 arguments.
+ *      (3) This is not intended to work on small thumbnails.  The
+ *          dimensions of pixs must be at least MinWidth x MinHeight.
+ *      (4) If you want pdf output of results when called repeatedly,
+ *          the pagenum arg labels the images written, which go into
+ *          /tmp/lept/<pdfdir>/<pagenum>.png.  In that case,
+ *          you would clean out the /tmp directory before calling this
+ *          function on each page:
+ *              lept_rmdir("/lept/<pdfdir>");
+ *              lept_mkdir("/lept/<pdfdir>");
+ *      (5) The processing is done at 2x reduction; the returned box
+ *          is scaled back up by 2x.
+ *      (6) If the image path built from @pdfdir and @pagenum does not
+ *          fit in the internal buffer, an error is reported and that
+ *          image is not written; the box is still returned.
+ */
+BOX *
+pixFindPageForeground(PIX         *pixs,
+                      l_int32      threshold,
+                      l_int32      mindist,
+                      l_int32      erasedist,
+                      l_int32      pagenum,
+                      l_int32      showmorph,
+                      l_int32      display,
+                      const char  *pdfdir)
+{
+char     dirbuf[512], pathbuf[512];
+l_int32  flag1, flag2, nbox, intersects, nchar;
+l_int32  w, h, bx, by, bw, bh, left, right, top, bottom;
+PIX     *pixb, *pixb2, *pixseed, *pixsf, *pixm, *pix1, *pixg2;
+BOX     *box, *boxfg, *boxin, *boxd;
+BOXA    *ba1, *ba2;
+
+    PROCNAME("pixFindPageForeground");
+
+    if (!pixs)
+        return (BOX *)ERROR_PTR("pixs not defined", procName, NULL);
+    pixGetDimensions(pixs, &w, &h, NULL);
+    if (w < MinWidth || h < MinHeight) {
+        L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h);
+        return NULL;
+    }
+
+        /* Debug flags for the two morphological sequences.  A negative
+         * value asks the sequence to write its intermediate images to
+         * a pdf (-1 gives /tmp/seq_output_1.pdf); 0 gives no output.
+         * Both flags must be 0 when showmorph is 0: deriving the second
+         * one as (flag - 1) would turn on output with debugging off. */
+    flag1 = (showmorph) ? -1 : 0;
+    flag2 = (showmorph) ? -2 : 0;
+
+        /* Binarize, downscale by 0.5, remove the noise to generate a seed,
+         * and do a seedfill back from the seed into those 8-connected
+         * components of the binarized image for which there was at least
+         * one seed pixel.  Also clear out any components that are within
+         * 10 pixels of the edge at 2x reduction. */
+    pixb = pixConvertTo1(pixs, threshold);
+    pixb2 = pixScale(pixb, 0.5, 0.5);
+    pixseed = pixMorphSequence(pixb2, "o1.2 + c9.9 + o3.5", flag1);
+    pixsf = pixSeedfillBinary(NULL, pixseed, pixb2, 8);
+    pixSetOrClearBorder(pixsf, 10, 10, 10, 10, PIX_SET);
+    pixm = pixRemoveBorderConnComps(pixsf, 8);
+    if (!pixm) {
+        pixDestroy(&pixb);
+        pixDestroy(&pixb2);
+        pixDestroy(&pixseed);
+        pixDestroy(&pixsf);
+        return (BOX *)ERROR_PTR("pixm not made", procName, NULL);
+    }
+    if (display) pixDisplay(pixm, 100, 100);
+
+        /* Now, where is the main block of text?  We want to remove noise near
+         * the edge of the image, but to do that, we have to be convinced that
+         * (1) there is noise and (2) it is far enough from the text block
+         * and close enough to the edge.  For each edge, if the block
+         * is more than mindist from that edge, then clean 'erasedist'
+         * pixels from the edge.  The block is taken to be the largest
+         * component after a big closing, and cleaning is only done if
+         * there is at least one other component. */
+    pix1 = pixMorphSequence(pixm, "c50.50", flag2);
+    ba1 = pixConnComp(pix1, NULL, 8);
+    ba2 = boxaSort(ba1, L_SORT_BY_AREA, L_SORT_DECREASING, NULL);
+    pixGetDimensions(pix1, &w, &h, NULL);  /* now at 2x reduction */
+    nbox = boxaGetCount(ba2);
+    if (nbox > 1) {
+        box = boxaGetBox(ba2, 0, L_CLONE);
+        boxGetGeometry(box, &bx, &by, &bw, &bh);
+        left = (bx > mindist) ? erasedist : 0;
+        right = (w - bx - bw > mindist) ? erasedist : 0;
+        top = (by > mindist) ? erasedist : 0;
+        bottom = (h - by - bh > mindist) ? erasedist : 0;
+        pixSetOrClearBorder(pixm, left, right, top, bottom, PIX_CLR);
+        boxDestroy(&box);
+    }
+    pixDestroy(&pix1);
+    boxaDestroy(&ba1);
+    boxaDestroy(&ba2);
+
+        /* Locate the foreground region; don't bother cropping */
+    boxfg = NULL;
+    pixClipToForeground(pixm, NULL, &boxfg);
+
+        /* Sanity check the fg region.  Make sure it's not confined
+         * to a thin boundary on the left and right sides of the image,
+         * in which case it is likely to be noise.  The test region is
+         * the central 80% of the width, over the full height. */
+    if (boxfg) {
+        boxin = boxCreate((l_int32)(0.1 * w), 0, (l_int32)(0.8 * w), h);
+        boxIntersects(boxfg, boxin, &intersects);
+        if (!intersects) {
+            L_INFO("found only noise on page %d\n", procName, pagenum);
+            boxDestroy(&boxfg);
+        }
+        boxDestroy(&boxin);
+    }
+
+    boxd = NULL;
+    if (!boxfg) {
+        L_INFO("no fg region found for page %d\n", procName, pagenum);
+    } else {
+        boxAdjustSides(boxfg, boxfg, -2, 2, -2, 2);  /* tiny expansion */
+        boxd = boxTransform(boxfg, 0, 0, 2.0, 2.0);  /* to full res */
+
+            /* Write image showing box for this page.  This is to be
+             * bundled up into a pdf of all the pages, which can be
+             * generated by convertFilesToPdf()  */
+        if (pdfdir) {
+                /* Build the (longer) file path first.  If it fits,
+                 * the directory name built below fits as well. */
+            nchar = snprintf(pathbuf, sizeof(pathbuf),
+                             "/tmp/lept/%s/%04d.png", pdfdir, pagenum);
+            if (nchar < 0 || nchar >= (l_int32)sizeof(pathbuf)) {
+                L_ERROR("pdfdir too long; no image written for page %d\n",
+                        procName, pagenum);
+            } else {
+                snprintf(dirbuf, sizeof(dirbuf), "lept/%s", pdfdir);
+                lept_mkdir(dirbuf);
+
+                pixg2 = pixConvert1To4Cmap(pixb);
+                pixRenderBoxArb(pixg2, boxd, 3, 255, 0, 0);
+                if (display) pixDisplay(pixg2, 700, 100);
+                pixWrite(pathbuf, pixg2, IFF_PNG);
+                pixDestroy(&pixg2);
+            }
+        }
+    }
+
+    pixDestroy(&pixb);
+    pixDestroy(&pixb2);
+    pixDestroy(&pixseed);
+    pixDestroy(&pixsf);
+    pixDestroy(&pixm);
+    boxDestroy(&boxfg);
+    return boxd;
+}
+
+
+/*------------------------------------------------------------------*
+ *         Extraction of characters from image with only text       *
+ *------------------------------------------------------------------*/
+/*!
+ *  pixSplitIntoCharacters()
+ *
+ *      Input:  pixs (1 bpp, contains only deskewed text)
+ *              minw (minimum component width for initial filtering; typ. 4)
+ *              minh (minimum component height for initial filtering; typ. 4)
+ *              &boxa (<optional return> character bounding boxes)
+ *              &pixa (<optional return> character images)
+ *              &pixdebug (<optional return> showing splittings)
+ *
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) This is a simple function that attempts to find split points
+ *          based on vertical pixel profiles.
+ *      (2) It should be given an image that has an arbitrary number
+ *          of text characters.
+ *      (3) The returned pixa includes the boxes from which the
+ *          (possibly split) components are extracted.
+ *      (4) The boxes are returned after a 2D sort that has been
+ *          flattened to a single boxa.
+ */
+l_int32
+pixSplitIntoCharacters(PIX     *pixs,
+                       l_int32  minw,
+                       l_int32  minh,
+                       BOXA   **pboxa,
+                       PIXA   **ppixa,
+                       PIX    **ppixdebug)
+{
+l_int32  ncc, index, dx, dy;
+BOXA    *boxacc, *boxaall, *boxalocal, *boxaglobal, *boxasorted;
+BOXAA   *baarows;
+PIX     *pixcomp, *pixfilt, *pixclosed, *pixdb;
+PIXA    *pixacc, *pixadb;
+
+    PROCNAME("pixSplitIntoCharacters");
+
+    if (pboxa) *pboxa = NULL;
+    if (ppixa) *ppixa = NULL;
+    if (ppixdebug) *ppixdebug = NULL;
+    if (!pixs)
+        return ERROR_INT("pixs not defined or not 1 bpp", procName, 1);
+    if (pixGetDepth(pixs) != 1)
+        return ERROR_INT("pixs not defined or not 1 bpp", procName, 1);
+
+        /* Filter out the small stuff, then consolidate what is left
+         * with a small vertical closing */
+    pixfilt = pixSelectBySize(pixs, minw, minh, 8, L_SELECT_IF_BOTH,
+                              L_SELECT_IF_GT, NULL);
+    pixclosed = pixMorphSequence(pixfilt, "c1.10", 0);
+    pixDestroy(&pixfilt);
+
+        /* Get the 8-connected components.  Only the pixa is used;
+         * the box of each component is taken from it below. */
+    boxacc = pixConnComp(pixclosed, &pixacc, 8);
+    boxaDestroy(&boxacc);
+    pixDestroy(&pixclosed);
+
+        /* Split each component, if obvious, and accumulate the
+         * resulting boxes in the coordinates of pixs */
+    ncc = pixaGetCount(pixacc);
+    boxaall = boxaCreate(ncc);
+    pixadb = NULL;
+    if (ppixdebug)
+        pixadb = pixaCreate(ncc);
+    for (index = 0; index < ncc; index++) {
+        pixcomp = pixaGetPix(pixacc, index, L_CLONE);
+        pixdb = NULL;
+        boxalocal = pixSplitComponentWithProfile(pixcomp, 10, 7,
+                                          (ppixdebug) ? &pixdb : NULL);
+        if (pixdb)  /* only possible when debug output was requested */
+            pixaAddPix(pixadb, pixdb, L_INSERT);
+        pixDestroy(&pixcomp);
+
+            /* Shift from component coordinates to those of pixs */
+        pixaGetBoxGeometry(pixacc, index, &dx, &dy, NULL, NULL);
+        boxaglobal = boxaTransform(boxalocal, dx, dy, 1.0, 1.0);
+        boxaDestroy(&boxalocal);
+        boxaJoin(boxaall, boxaglobal, 0, -1);
+        boxaDestroy(&boxaglobal);
+    }
+    pixaDestroy(&pixacc);
+
+        /* Generate the debug image, if there is anything to show */
+    if (ppixdebug) {
+        if (pixaGetCount(pixadb) > 0)
+            *ppixdebug = pixaDisplayTiledInRows(pixadb, 32, 1500,
+                                                1.0, 0, 20, 1);
+        pixaDestroy(&pixadb);
+    }
+
+        /* Do a 2D sort on the bounding boxes, and flatten the result to 1D */
+    baarows = boxaSort2d(boxaall, NULL, 0, 0, 5);
+    boxaDestroy(&boxaall);
+    boxasorted = boxaaFlattenToBoxa(baarows, NULL, L_CLONE);
+    boxaaDestroy(&baarows);
+
+        /* Optionally extract the pieces from the input image */
+    if (ppixa)
+        *ppixa = pixClipRectangles(pixs, boxasorted);
+    if (!pboxa)
+        boxaDestroy(&boxasorted);
+    else
+        *pboxa = boxasorted;
+    return 0;
+}
+
+
+/*!
+ *  pixSplitComponentWithProfile()
+ *
+ *      Input:  pixs (1 bpp, exactly one connected component)
+ *              delta (distance used in extrema finding in a numa; typ. 10)
+ *              mindel (minimum required difference between profile minimum
+ *                      and profile values +2 and -2 away; typ. 7)
+ *              &pixdebug (<optional return> debug image of splitting)
+ *      Return: boxa (of c.c. after splitting), or null on error
+ *
+ *  Notes:
+ *      (1) This will split the most obvious cases of touching characters.
+ *          The split points it is searching for are narrow and deep
+ *          minima in the vertical pixel projection profile, after a
+ *          large vertical closing has been applied to the component.
+ *      (2) If no split is found, the returned boxa has a single box
+ *          covering all of pixs, and no debug image is made.
+ *      (3) A minimum is only tested if the profile has values both
+ *          2 columns to its left and 2 columns to its right.
+ *      (4) When &pixdebug is given, the extrema and the values at each
+ *          candidate split are also written to stderr.
+ */
+BOXA *
+pixSplitComponentWithProfile(PIX     *pixs,
+                             l_int32  delta,
+                             l_int32  mindel,
+                             PIX    **ppixdebug)
+{
+l_int32   w, h, n2, i, firstmin, xmin, xshift;
+l_int32   nmin, nleft, nright, nsplit, isplit, ncomp;
+l_int32  *array1, *array2;
+BOX      *box;
+BOXA     *boxad;
+NUMA     *na1, *na2, *nasplit;
+PIX      *pix1, *pixdb;
+
+    PROCNAME("pixSplitComponentWithProfile");
+
+    if (ppixdebug) *ppixdebug = NULL;
+    if (!pixs || pixGetDepth(pixs) != 1)
+        return (BOXA *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
+    pixGetDimensions(pixs, &w, &h, NULL);
+
+        /* Closing to consolidate characters vertically */
+    pix1 = pixCloseSafeBrick(NULL, pixs, 1, 100);
+
+        /* Get extrema of column projections */
+    boxad = boxaCreate(2);
+    na1 = pixCountPixelsByColumn(pix1);  /* w elements */
+    pixDestroy(&pix1);
+    na2 = numaFindExtrema(na1, delta);
+    n2 = numaGetCount(na2);
+    if (n2 < 3) {  /* no split possible */
+        box = boxCreate(0, 0, w, h);
+        boxaAddBox(boxad, box, L_INSERT);
+        numaDestroy(&na1);
+        numaDestroy(&na2);
+        return boxad;
+    }
+
+        /* Look for sufficiently deep and narrow minima.
+         * All minima of interest must be surrounded by max on each
+         * side.  firstmin is the index of first possible minimum. */
+    array1 = numaGetIArray(na1);  /* column sums */
+    array2 = numaGetIArray(na2);  /* locations of the extrema */
+    if (!array1 || !array2) {
+        LEPT_FREE(array1);
+        LEPT_FREE(array2);
+        numaDestroy(&na1);
+        numaDestroy(&na2);
+        boxaDestroy(&boxad);
+        return (BOXA *)ERROR_PTR("arrays not made", procName, NULL);
+    }
+    if (ppixdebug) numaWriteStream(stderr, na2);
+    firstmin = (array1[array2[0]] > array1[array2[1]]) ? 1 : 2;
+    nasplit = numaCreate(n2);  /* will hold split locations */
+    for (i = firstmin; i < n2 - 1; i+= 2) {
+        xmin = array2[i];
+        nmin = array1[xmin];
+        if (xmin + 2 >= w) break;  /* no more splits possible */
+        if (xmin < 2) continue;  /* no profile value 2 to the left */
+        nleft = array1[xmin - 2];
+        nright = array1[xmin + 2];
+        if (ppixdebug) {
+            fprintf(stderr,
+                "Splitting: xmin = %d, w = %d; nl = %d, nmin = %d, nr = %d\n",
+                xmin, w, nleft, nmin, nright);
+        }
+        if (nleft - nmin >= mindel && nright - nmin >= mindel)  /* split */
+            numaAddNumber(nasplit, xmin);
+    }
+    nsplit = numaGetCount(nasplit);
+
+#if 0
+    if (ppixdebug && nsplit > 0)
+        gplotSimple1(na1, GPLOT_X11, "/tmp/splitroot", NULL);
+#endif
+
+    numaDestroy(&na1);
+    numaDestroy(&na2);
+    LEPT_FREE(array1);
+    LEPT_FREE(array2);
+
+    if (nsplit == 0) {  /* no splitting */
+        numaDestroy(&nasplit);  /* it must also be released on this path */
+        box = boxCreate(0, 0, w, h);
+        boxaAddBox(boxad, box, L_INSERT);
+        return boxad;
+    }
+
+        /* Use split points to generate b.b. after splitting.  The
+         * column at each split point is not included in any box. */
+    for (i = 0, xshift = 0; i < nsplit; i++) {
+        numaGetIValue(nasplit, i, &isplit);
+        box = boxCreate(xshift, 0, isplit - xshift, h);
+        boxaAddBox(boxad, box, L_INSERT);
+        xshift = isplit + 1;
+    }
+    box = boxCreate(xshift, 0, w - xshift, h);
+    boxaAddBox(boxad, box, L_INSERT);
+
+    numaDestroy(&nasplit);
+
+        /* Optionally render the boxes on a 32 bpp copy of pixs */
+    if (ppixdebug) {
+        pixdb = pixConvertTo32(pixs);
+        ncomp = boxaGetCount(boxad);
+        for (i = 0; i < ncomp; i++) {
+            box = boxaGetBox(boxad, i, L_CLONE);
+            pixRenderBoxBlend(pixdb, box, 1, 255, 0, 0, 0.5);
+            boxDestroy(&box);
+        }
+        *ppixdebug = pixdb;
+    }
+
+    return boxad;
+}
+
+
+/*------------------------------------------------------------------*
+ *                    Extraction of lines of text                   *
+ *------------------------------------------------------------------*/
+/*!
+ *  pixExtractTextlines()
+ *
+ *      Input:  pixs (any depth, assumed to have nearly horizontal text)
+ *              maxw, maxh (initial filtering: keep only the components
+ *                          in pixs with width < maxw and height < maxh)
+ *              minw, minh (final filtering: keep only the extracted
+ *                          'lines' with width >= minw and height >= minh)
+ *      Return: pixa (of textline images, including bounding boxes), or
+ *                    null on error or if pixs has no fg pixels
+ *
+ *  Notes:
+ *      (1) If pixs is deeper than 1 bpp, it is first converted to 8 bpp,
+ *          background-cleaned and thresholded to binary.
+ *      (2) This function assumes that textlines have sufficient
+ *          vertical separation and small enough skew so that a
+ *          horizontal dilation sufficient to join words will not join
+ *          textlines.  Images with multiple columns of text may have
+ *          the textlines join across the space between columns.
+ *      (3) The width of the closing that joins words is proportional to
+ *          the x-resolution of pixs: 60 pixels at 300 ppi, and at most
+ *          120.  If the resolution is not set, 300 ppi is assumed.
+ *      (4) For reasonable accuracy, the resolution of pixs should be
+ *          at least 100 ppi.  For reasonable efficiency, the resolution
+ *          should not exceed 600 ppi.
+ *      (5) This can be used to determine if some region of a scanned
+ *          image is horizontal text.
+ *      (6) As an example, reasonable parameters for a 300 ppi pix are:
+ *             pixExtractTextlines(pix, 150, 150, 10, 5);
+ */
+PIXA *
+pixExtractTextlines(PIX     *pixs,
+                    l_int32  maxw,
+                    l_int32  maxh,
+                    l_int32  minw,
+                    l_int32  minh)
+{
+char     buf[64];
+l_int32  i, n, res, csize, empty;
+BOX     *box;
+BOXA    *boxa1, *boxa2;
+PIX     *pix1, *pix2, *pix3, *pix4, *pix5;
+PIXA    *pixa1, *pixa2, *pixa3, *pixad;
+
+    PROCNAME("pixExtractTextlines");
+
+    if (!pixs)
+        return (PIXA *)ERROR_PTR("pixs not defined", procName, NULL);
+
+        /* Binarize if necessary; pix2 and pix3 are only temporaries here */
+    if (pixGetDepth(pixs) > 1) {
+        pix2 = pixConvertTo8(pixs, FALSE);
+        pix3 = pixCleanBackgroundToWhite(pix2, NULL, NULL, 1.0, 70, 190);
+        pix1 = pixThresholdToBinary(pix3, 150);
+        pixDestroy(&pix2);
+        pixDestroy(&pix3);
+    } else {
+        pix1 = pixClone(pixs);
+    }
+    pixZero(pix1, &empty);
+    if (empty) {
+        pixDestroy(&pix1);
+        L_INFO("no fg pixels in input image\n", procName);
+        return NULL;
+    }
+
+        /* Remove any very tall or very wide connected components */
+    pix2 = pixSelectBySize(pix1, maxw, maxh, 8, L_SELECT_IF_BOTH,
+                           L_SELECT_IF_LT, NULL);
+    pixDestroy(&pix1);
+
+        /* Solidify the text lines within the x-height region.  The closing
+         * (csize, scaled by res / 300 in floating point) bridges gaps between
+         * words.  The opening removes isolated bridges between textlines. */
+    if ((res = pixGetXRes(pixs)) == 0) {
+        L_INFO("Resolution is not set: setting to 300 ppi\n", procName);
+        res = 300;
+    }
+    csize = L_MIN(120., 60.0 * res / 300.0);
+    snprintf(buf, sizeof(buf), "c%d.1 + o20.1", csize);
+    pix3 = pixMorphCompSequence(pix2, buf, 0);
+
+        /* Extract the connected components.  These should be dilated lines */
+    boxa1 = pixConnComp(pix3, &pixa1, 4);
+    pixDestroy(&pix3);
+
+        /* Remove line components that are too small */
+    pixa2 = pixaSelectBySize(pixa1, minw, minh, L_SELECT_IF_BOTH,
+                           L_SELECT_IF_GTE, NULL);
+
+#if DEBUG_LINES
+    pix1 = pixaDisplayRandomCmap(pixa2, 0, 0);
+    pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255);
+    pixWrite("/tmp/lept/junklines.png", pix1, IFF_PNG);
+    pixDestroy(&pix1);
+#endif
+
+        /* Selectively AND with the version before dilation, and save */
+    boxa2 = pixaGetBoxa(pixa2, L_CLONE);
+    n = boxaGetCount(boxa2);
+    pixa3 = pixClipRectangles(pix2, boxa2);
+    pixad = pixaCreate(n);
+    for (i = 0; i < n; i++) {
+        pix4 = pixaGetPix(pixa2, i, L_CLONE);
+        pix5 = pixaGetPix(pixa3, i, L_COPY);
+        pixAnd(pix5, pix5, pix4);
+        pixaAddPix(pixad, pix5, L_INSERT);
+        box = boxaGetBox(boxa2, i, L_COPY);
+        pixaAddBox(pixad, box, L_INSERT);
+        pixDestroy(&pix4);
+    }
+
+    pixDestroy(&pix2);
+    pixaDestroy(&pixa1);
+    pixaDestroy(&pixa2);
+    pixaDestroy(&pixa3);
+    boxaDestroy(&boxa1);
+    boxaDestroy(&boxa2);
+    return pixad;
+}
+
+
+/*------------------------------------------------------------------*
+ *                      Decision text vs photo                      *
+ *------------------------------------------------------------------*/
+/*!
+ *  pixDecideIfText()
+ *
+ *      Input:  pixs (any depth)
+ *              box (<optional> if null, crop 10% from each side of pixs)
+ *              &istext (<return> 1 if text; 0 if photo; -1 if not determined)
+ *              pixadb (<optional> pre-allocated, for showing intermediate
+ *                      computation; use null to skip)
+ *      Return: 0 if OK, 1 on error
+ *
+ *  Notes:
+ *      (1) &istext is required.  It is initialized to -1 and stays -1
+ *          if the input is empty or the result can not be determined.
+ *      (2) It is assumed that pixs has the correct resolution set; if it
+ *          is 0, we use 300 and issue a warning.  If necessary, the
+ *          image is scaled to 300 ppi for most of the processing.
+ *      (3) Text is assumed to be in horizontal lines.
+ *      (4) Because thin vertical lines are removed before filtering for
+ *          text lines, this should identify tables as text.
+ *      (5) If @box is null and pixs contains both text lines and line art,
+ *          this function might return @istext == true.
+ */
+l_int32
+pixDecideIfText(PIX      *pixs,
+                BOX      *box,
+                l_int32  *pistext,
+                PIXA     *pixadb)
+{
+l_int32    i, empty, maxw, w, h, n1, n2, n3, minlines;
+l_int32    res, big_comp;
+l_float32  ratio1, ratio2, factor;
+L_BMF     *bmf;
+BOX       *box1;
+BOXA      *boxa1, *boxa2, *boxa3, *boxa4, *boxa5;
+PIX       *pix1, *pix2, *pix3, *pix4, *pix5, *pix5a;
+PIX       *pix6, *pix7, *pix8, *pix9, *pix10;
+PIXA      *pixa1;
+SEL       *sel1;
+
+    PROCNAME("pixDecideIfText");
+
+    if (!pistext)
+        return ERROR_INT("&istext not defined", procName, 1);
+    *pistext = -1;  /* init */
+    if (!pixs)
+        return ERROR_INT("pixs not defined", procName, 1);
+
+        /* Crop and convert to 1 bpp with adaptive background cleaning.
+         * If no box is given, use most of the image.  Removing the
+         * edges helps avoid false negatives from noise near the edges. */
+    if (box) {
+        pix1 = pixClipRectangle(pixs, box, NULL);
+    } else {
+        pixGetDimensions(pixs, &w, &h, NULL);
+        box1 = boxCreate(w / 10, h / 10, 4 * w / 5, 4 * h / 5);
+        pix1 = pixClipRectangle(pixs, box1, NULL);
+        boxDestroy(&box1);
+    }
+    pix2 = pixConvertTo8(pix1, 0);
+    pix3 = pixCleanBackgroundToWhite(pix2, NULL, NULL, 1.0, 70, 160);
+    pixDestroy(&pix1);
+    if (!pix3) {
+        pixDestroy(&pix2);
+        L_INFO("pix cleaning failed\n", procName);
+        return 1;
+    }
+    pix4 = pixThresholdToBinary(pix3, 200);
+    pixZero(pix4, &empty);
+    if (empty) {
+        pixDestroy(&pix2);
+        pixDestroy(&pix3);
+        pixDestroy(&pix4);
+        L_INFO("pix is empty\n", procName);
+        return 0;
+    }
+
+        /* Get the resolution, or guess, and scale the image to 300 ppi */
+    if ((res = pixGetXRes(pixs)) == 0) {
+        L_WARNING("Resolution is not set: using 300 ppi\n", procName);
+        res = 300;
+    }
+    if (res != 300) {
+        factor = 300. / res;
+        pix5 = pixScale(pix4, factor, factor);
+    } else {
+        pix5 = pixClone(pix4);
+    }
+    w = pixGetWidth(pix5);
+
+        /* Identify and remove tall, thin vertical lines (as found in tables)
+         * that are up to 9 pixels wide.  Make a hit-miss sel with an
+         * 81 pixel vertical set of hits and with 3 pairs of misses that
+         * are 10 pixels apart horizontally.  It is necessary to use a
+         * hit-miss transform; if we only opened with a vertical line of
+         * hits, we would remove solid regions of pixels that are not
+         * text or vertical lines. */
+    pix5a = pixCreate(11, 81, 1);
+    for (i = 0; i < 81; i++)
+        pixSetPixel(pix5a, 5, i, 1);
+    sel1 = selCreateFromPix(pix5a, 40, 5, NULL);
+    selSetElement(sel1, 20, 0, SEL_MISS);
+    selSetElement(sel1, 20, 10, SEL_MISS);
+    selSetElement(sel1, 40, 0, SEL_MISS);
+    selSetElement(sel1, 40, 10, SEL_MISS);
+    selSetElement(sel1, 60, 0, SEL_MISS);
+    selSetElement(sel1, 60, 10, SEL_MISS);
+    pix6 = pixHMT(NULL, pix5, sel1);
+    pix7 = pixSeedfillBinaryRestricted(NULL, pix6, pix5, 8, 5, 1000);
+    pix8 = pixXor(NULL, pix5, pix7);
+    pixDestroy(&pix5a);
+    selDestroy(&sel1);
+
+        /* Convert the text lines to separate long horizontal components */
+    pix9 = pixMorphCompSequence(pix8, "c30.1 + o15.1 + c60.1 + o2.2", 0);
+
+        /* Estimate the distance to the bottom of the significant region */
+    if (box) {  /* use full height */
+        pixGetDimensions(pix9, NULL, &h, NULL);
+    } else {  /* use height of region that has text lines */
+        pixFindThreshFgExtent(pix9, 400, NULL, &h);
+    }
+
+    if (pixadb) {
+        bmf = bmfCreate(NULL, 8);
+        pixaAddPixWithText(pixadb, pix2, 1, bmf, "initial 8 bpp",
+                           0x0000ff00, L_ADD_BELOW);
+        pixaAddPixWithText(pixadb, pix3, 1, bmf, "with background cleaning",
+                           0x0000ff00, L_ADD_BELOW);
+        pixaAddPixWithText(pixadb, pix4, 1, bmf, "threshold to binary",
+                           0x0000ff00, L_ADD_BELOW);
+        pixaAddPixWithText(pixadb, pix6, 2, bmf, "hit-miss for vertical line",
+                           0x0000ff00, L_ADD_BELOW);
+        pixaAddPixWithText(pixadb, pix7, 2, bmf, "restricted seed-fill",
+                           0x0000ff00, L_ADD_BELOW);
+        pixaAddPixWithText(pixadb, pix8, 2, bmf, "remove using xor",
+                           0x0000ff00, L_ADD_BELOW);
+        pixaAddPixWithText(pixadb, pix9, 2, bmf, "make long horiz components",
+                           0x0000ff00, L_ADD_BELOW);
+    }
+
+        /* Extract the connected components */
+    if (pixadb) {
+        boxa1 = pixConnComp(pix9, &pixa1, 8);
+        pix10 = pixaDisplayRandomCmap(pixa1, 0, 0);
+        pixcmapResetColor(pixGetColormap(pix10), 0, 255, 255, 255);
+        pixaAddPixWithText(pixadb, pix10, 2, bmf, "show connected components",
+                           0x0000ff00, L_ADD_BELOW);
+        pixDestroy(&pix10);
+        pixaDestroy(&pixa1);
+        bmfDestroy(&bmf);
+    } else {
+        boxa1 = pixConnComp(pix9, NULL, 8);
+    }
+
+        /* Analyze the connected components.  The following conditions
+         * at 300 ppi must be satisfied if the image is text:
+         * (1) There are no components that are wider than 400 pixels and
+         *     taller than 175 pixels.
+         * (2) The second longest component is at least 60% of the
+         *     (possibly cropped) image width.  This catches images
+         *     that don't have any significant content.
+         * (3) Of the components that are at least 40% of the length
+         *     of the longest (n2), at least 80% of them must not exceed
+         *     60 pixels in height.
+         * (4) The number of those long, thin components (n3) must
+         *     equal or exceed a minimum that scales linearly with the
+         *     image height.
+         * Most images that are not text fail more than one of these
+         * conditions. */
+    boxa2 = boxaSort(boxa1, L_SORT_BY_WIDTH, L_SORT_DECREASING, NULL);
+    boxaGetBoxGeometry(boxa2, 1, NULL, NULL, &maxw, NULL);  /* 2nd longest */
+    boxa3 = boxaSelectBySize(boxa1, 0.4 * maxw, 0, L_SELECT_WIDTH,
+                             L_SELECT_IF_GTE, NULL);
+    boxa4 = boxaSelectBySize(boxa3, 0, 60, L_SELECT_HEIGHT,
+                             L_SELECT_IF_LTE, NULL);
+    boxa5 = boxaSelectBySize(boxa1, 400, 175, L_SELECT_IF_BOTH,
+                             L_SELECT_IF_GT, NULL);
+    big_comp = (boxaGetCount(boxa5) == 0) ? 0 : 1;
+    n1 = boxaGetCount(boxa1);
+    n2 = boxaGetCount(boxa3);
+    n3 = boxaGetCount(boxa4);
+    ratio1 = (l_float32)maxw / (l_float32)w;
+    ratio2 = (n2 > 0) ? (l_float32)n3 / (l_float32)n2 : 0.0f;
+    minlines = L_MAX(2, h / 125);
+    if (big_comp || ratio1 < 0.6 || ratio2 < 0.8 || n3 < minlines)
+        *pistext = 0;
+    else
+        *pistext = 1;
+    if (pixadb) {
+        if (*pistext == 1) {
+            L_INFO("This is text: \n  n1 = %d, n2 = %d, n3 = %d, "
+                   "minlines = %d\n  maxw = %d, ratio1 = %4.2f, h = %d, "
+                   "big_comp = %d\n", procName, n1, n2, n3, minlines,
+                   maxw, ratio1, h, big_comp);
+        } else {
+            L_INFO("This is not text: \n  n1 = %d, n2 = %d, n3 = %d, "
+                   "minlines = %d\n  maxw = %d, ratio1 = %4.2f, h = %d, "
+                   "big_comp = %d\n", procName, n1, n2, n3, minlines,
+                   maxw, ratio1, h, big_comp);
+        }
+    }
+
+    boxaDestroy(&boxa1);
+    boxaDestroy(&boxa2);
+    boxaDestroy(&boxa3);
+    boxaDestroy(&boxa4);
+    boxaDestroy(&boxa5);
+    pixDestroy(&pix2);
+    pixDestroy(&pix3);
+    pixDestroy(&pix4);
+    pixDestroy(&pix5);
+    pixDestroy(&pix6);
+    pixDestroy(&pix7);
+    pixDestroy(&pix8);
+    pixDestroy(&pix9);
+    return 0;
+}
+
+
+/*!
+ *  pixFindThreshFgExtent()
+ *
+ *      Input:  pixs (1 bpp)
+ *              thresh (threshold number of fg pixels in a row)
+ *              &top (<optional return> first row with at least thresh
+ *                    fg pixels; 0 if there is none)
+ *              &bot (<optional return> last row with at least thresh
+ *                    fg pixels; 0 if there is none)
+ *      Return: 0 if OK, 1 on error (including neither &top nor &bot given)
+ */
+l_int32
+pixFindThreshFgExtent(PIX      *pixs,
+                      l_int32   thresh,
+                      l_int32  *ptop,
+                      l_int32  *pbot)
+{
+l_int32   i, n;
+l_int32  *array;
+NUMA     *na;
+
+    PROCNAME("pixFindThreshFgExtent");
+
+    if (ptop) *ptop = 0;
+    if (pbot) *pbot = 0;
+    if (!ptop && !pbot)
+        return ERROR_INT("nothing to determine", procName, 1);
+    if (!pixs || pixGetDepth(pixs) != 1)
+        return ERROR_INT("pixs not defined or not 1 bpp", procName, 1);
+
+    if ((na = pixCountPixelsByRow(pixs, NULL)) == NULL)
+        return ERROR_INT("na not made", procName, 1);
+    n = numaGetCount(na);
+    if ((array = numaGetIArray(na)) == NULL) {
+        numaDestroy(&na);
+        return ERROR_INT("array not made", procName, 1);
+    }
+
+    for (i = 0; ptop && i < n; i++) {  /* scan down for the top row */
+        if (array[i] < thresh) continue;
+        *ptop = i;
+        break;
+    }
+    for (i = n - 1; pbot && i >= 0; i--) {  /* scan up for the bottom row */
+        if (array[i] < thresh) continue;
+        *pbot = i;
+        break;
+    }
+    LEPT_FREE(array);
+    numaDestroy(&na);
+    return 0;
+}
+