#! /usr/bin/perl
#================================================================
# htmldoctotsv
# Generate TSV data from HTML documents
#================================================================
use strict;
use warnings;
use Encode;
use Cwd 'realpath';
# Command-line state.  Defaults: scan the current directory and print the
# multi-column record format with decimal record keys.
my $path = '.';
my $mode_kv = 0;
my $mode_hex = 0;
# Walk the arguments once.  Anything that does not start with a dash is taken
# as the start path (the last one given wins); "-kv" and "-x" switch on the
# key/value and hexadecimal-key modes; other dash arguments are ignored.
foreach my $arg (@ARGV){
  if($arg !~ /^-/){
    $path = $arg;
    next;
  }
  $mode_kv = 1 if($arg eq '-kv');
  $mode_hex = 1 if($arg eq '-x');
}
# Convert an HTML fragment into plain text on a single line.
#   - every tag is replaced by a space
#   - the character references lt, gt, quot, nbsp and amp are decoded
#   - runs of whitespace are collapsed into one space
#   - leading and trailing spaces are removed
# Takes the HTML string as its only argument and returns the plain text.
# An undefined argument yields an empty string.
sub trimhtml {
  my $text = shift;
  return "" if(!defined($text));
  $text =~ s/<[^>]*>/ /g;
  # Decode all supported references in one pass so that text produced by one
  # replacement is never decoded a second time (an escaped ampersand followed
  # by "lt;" must stay a literal reference, not become "<").
  my %chars = ('lt' => '<', 'gt' => '>', 'quot' => '"', 'nbsp' => ' ', 'amp' => '&');
  $text =~ s/&(lt|gt|quot|nbsp|amp);/$chars{$1}/g;
  $text =~ s/\s+/ /g;
  $text =~ s/^ *//;
  $text =~ s/ *$//;
  return $text;
}
# Run the external command in the C locale so that its behavior does not
# depend on the caller's environment.
$ENV{LANG} = "C";
$ENV{LC_ALL} = "C";
# List the candidate files with find(1).  The list form of open runs find
# without a shell, so a start path containing spaces or shell metacharacters
# is passed through intact, and the single-quoted pattern keeps the backslash
# that makes the dot before "htm" literal.
open(my $lfh, "-|", "find", $path, "-type", "f", "-iregex", '.*\.html?', "-print")
  || die("could not open");
my @candidates = <$lfh>;
close($lfh);
chomp(@candidates);
# Record counter; incremented only for documents that are actually printed.
my $id = 0;
# Perl's default sort compares strings bytewise, which is the order sort(1)
# produces in the C locale.
foreach my $candidate (sort(@candidates)){
  my $file = realpath($candidate);
  next if(!defined($file));
  my @stat = stat($file);
  next if(scalar(@stat) < 10);
  my $mtime = $stat[9];
  # Files that cannot be opened are skipped silently.
  open(my $ifh, "<", $file) || next;
  # Read the whole document and look for a charset declared in a
  # content-type meta tag; the last declaration found wins.
  my $encname = "UTF-8";
  my @lines;
  while(defined(my $line = <$ifh>)){
    push(@lines, $line);
    if($line =~ /<meta.*content-type.*charset=/i){
      $line =~ s/.*charset=?//i;
      $line =~ s/[^-_a-zA-Z0-9].*//;
      chomp($line);
      $encname = $line if(length($line) > 0);
    }
  }
  close($ifh);
  my $text = join('', @lines);
  # Normalize to UTF-8.  A charset name that Encode does not know would make
  # decode() die and abort the whole run, so such a document is reported and
  # processed with its bytes left as they are.
  if($encname ne "UTF-8"){
    if(find_encoding($encname)){
      $text = encode("UTF-8", decode($encname, $text));
    } else {
      printf STDERR ("%s: unknown encoding: %s\n", $file, $encname);
    }
  }
  # Drop every comment, not just the first one.
  $text =~ s/<!--.*?-->//gis;
  # Use the first title element of the document as the title.
  my $title = "";
  if($text =~ /<title[^>]*>([^<]*)<\/title>/i){
    my $rawtitle = $1;
    $title = trimhtml($rawtitle);
  }
  # Keep only the content of the body element when there is one, then drop
  # every style and script element so that their source does not end up in
  # the extracted text.
  $text =~ s/.*<body[^>]*>(.*)<\/body>.*/$1/is;
  $text =~ s/<style[^>]*>.*?<\/style>//gis;
  $text =~ s/<script[^>]*>.*?<\/script>//gis;
  $text = trimhtml($text);
  # Documents with neither a title nor any text produce no record.
  next if(length($title) < 1 && length($text) < 1);
  $id++;
  my $key = $mode_hex ? sprintf("%X", $id) : $id;
  printf STDERR ("%d: saving: %s\n", $id, $file);
  if($mode_kv){
    # Key/value mode: one line holding the key and the title plus the text.
    $text = $title . " " . $text if(length($title) > 0);
    printf("%s\t%s\n", $key, $text);
  } else {
    # Default mode: the key followed by tab-separated name/value pairs.
    printf("%s", $key);
    printf("\turl\t%s", $file);
    printf("\tsize\t%s", length($text));
    printf("\tmtime\t%s", $mtime);
    printf("\ttitle\t%s", $title) if(length($title) > 0);
    printf("\tbody\t%s", $text);
    printf("\n");
  }
}
# END OF FILE