summaryrefslogtreecommitdiff
path: root/tcejdb/lab/htmltotsv
blob: 3221a844a7663937af641af3a8e2a2b11ce62a14 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#! /usr/bin/perl

#================================================================
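# htmltotsv
# Generate TSV data from HTML documents
#
# Usage: htmltotsv [-kv] [-x] [path]
#   -kv   print only "key<TAB>title and body text" per document
#   -x    print record keys in upper-case hexadecimal
#   path  directory to scan for HTML files (default: current directory)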
#================================================================

use strict;
use warnings;
use Encode;
use Cwd 'realpath';

# Command-line settings and their defaults.
#   $path     - directory to scan; the last non-option argument wins
#   $mode_kv  - set by -kv: emit plain key/value records
#   $mode_hex - set by -x: format record keys as hexadecimal
my $path = '.';
my $mode_kv = 0;
my $mode_hex = 0;

foreach my $arg (@ARGV){
    # Anything that does not start with a dash is taken as the scan path.
    if($arg !~ /^-/){
        $path = $arg;
        next;
    }
    # Recognized switches; any other dash-prefixed argument is ignored.
    $mode_kv = 1 if($arg eq '-kv');
    $mode_hex = 1 if($arg eq '-x');
}

# Reduce an HTML fragment to plain text.
# Tags are replaced by a space, the entities &lt; &gt; &quot; &nbsp; &amp;
# are decoded, runs of whitespace collapse to one space, and leading and
# trailing spaces are dropped.  Returns the cleaned string.
sub trimhtml {
    my ($html) = @_;
    for ($html){
        s/<[^>]*>/ /g;      # each tag becomes a separator
        s/&lt;/</g;
        s/&gt;/>/g;
        s/&quot;/"/g;
        s/&nbsp;/ /g;
        s/&amp;/\&/g;       # decoded last so "&amp;lt;" yields "&lt;", not "<"
        s/\s+/ /g;
        s/^ *//;
        s/ *$//;
    }
    return $html;
}

# Scan $path for *.htm / *.html files and print one TSV record per document
# to STDOUT; progress messages go to STDERR.
#
# Default record:  key, then name/value column pairs url, size, mtime,
#                  title (only when non-empty) and body.
# With -kv:        key and the title + body text only.
#
# Documents whose title and body are both empty are skipped and do not
# consume a record id.

# Force the C locale for the child "find" process.
$ENV{LANG} = "C";
$ENV{LC_ALL} = "C";

# List-form pipe open: "find" is executed directly, without a shell, so a
# path containing spaces or shell metacharacters is passed through verbatim.
# The single-quoted pattern keeps the backslash, so the dot before "htm" is
# matched literally.
open(my $lfh, '-|', 'find', $path, '-type', 'f', '-iregex', '.*\.html?', '-print')
    or die("could not run find on $path: $!");
my @found = <$lfh>;
close($lfh);
chomp(@found);

my $id = 0;
# Perl's default sort compares strings bytewise, the same order that
# "sort" produces in the C locale.
for my $found (sort(@found)){
    my $file = realpath($found);
    next if(!defined($file));
    my @stat = stat($file);
    next if(scalar(@stat) < 10);
    my $mtime = $stat[9];

    # Three-argument open: the file name can never be parsed as a mode.
    open(my $ifh, '<', $file) or next;
    my $encname = "UTF-8";
    my @lines;
    while(defined(my $line = <$ifh>)){
        push(@lines, $line);
        # Pick up the charset from a <meta ... content-type ... charset=...>
        # line; if several lines match, the last one wins.
        if($line =~ /<meta.*content-type.*charset=/i){
            $line =~ s/.*charset=?//i;
            $line =~ s/[^-_a-zA-Z0-9].*//;
            chomp($line);
            $encname = $line if(length($line) > 0);
        }
    }
    close($ifh);

    my $text = join('', @lines);
    if($encname ne "UTF-8"){
        # decode() croaks on a name it does not know, which would abort the
        # whole run; check first and keep the raw bytes in that case.
        if(defined(Encode::find_encoding($encname))){
            $text = encode("UTF-8", decode($encname, $text));
        } else {
            printf STDERR ("%s: unknown encoding \"%s\", text left as is\n", $file, $encname);
        }
    }

    # Strip every comment, not just the first one.
    $text =~ s/<!--.*?-->//isg;

    my $title = "";
    if($text =~ /<title[^>]*>[^<]*<\/title>/i){
        # The greedy leading ".*" selects the last <title> element.
        $title = $text;
        $title =~ s/.*<title[^>]*>([^<]*)<\/title>.*/$1/is;
        $title = trimhtml($title);
    }

    # Keep only the body content, then drop all style sheets and scripts so
    # that their source does not end up in the extracted text.
    $text =~ s/.*<body[^>]*>(.*)<\/body>.*/$1/is;
    $text =~ s/<style[^>]*>.*?<\/style>//isg;
    $text =~ s/<script[^>]*>.*?<\/script>//isg;
    $text = trimhtml($text);
    next if(length($title) < 1 && length($text) < 1);

    $id++;
    my $key = $mode_hex ? sprintf("%X", $id) : $id;
    printf STDERR ("%d: saving: %s\n", $id, $file);
    if($mode_kv){
        $text = $title . " " . $text if(length($title) > 0);
        printf("%s\t%s\n", $key, $text);
    } else {
        printf("%s", $key);
        printf("\turl\t%s", $file);
        printf("\tsize\t%s", length($text));
        printf("\tmtime\t%s", $mtime);
        printf("\ttitle\t%s", $title) if(length($title) > 0);
        printf("\tbody\t%s", $text);
        printf("\n");
    }
}



# END OF FILE