summaryrefslogtreecommitdiff
path: root/tcejdb/lab/htmltotsv
diff options
context:
space:
mode:
Diffstat (limited to 'tcejdb/lab/htmltotsv')
-rwxr-xr-xtcejdb/lab/htmltotsv102
1 files changed, 102 insertions, 0 deletions
diff --git a/tcejdb/lab/htmltotsv b/tcejdb/lab/htmltotsv
new file mode 100755
index 0000000..3221a84
--- /dev/null
+++ b/tcejdb/lab/htmltotsv
@@ -0,0 +1,102 @@
+#! /usr/bin/perl
+
+#================================================================
+# htmldoctotsv
+# Generate TSV data from HTML documents
+#================================================================
+
+use strict;
+use warnings;
+use Encode;
+use Cwd 'realpath';
+
+my $path = '.';
+my $mode_kv = 0;
+my $mode_hex = 0;
+
+for(my $i = 0; $i < scalar(@ARGV); $i++){
+ my $arg = $ARGV[$i];
+ if($arg =~ /^-/){
+ if($arg eq '-kv'){
+ $mode_kv = 1;
+ } elsif($arg eq '-x'){
+ $mode_hex = 1;
+ }
+ } else {
+ $path = $arg;
+ }
+}
+
+sub trimhtml {
+ my $text = shift;
+ $text =~ s/<[^>]*>/ /g;
+ $text =~ s/&lt;/</g;
+ $text =~ s/&gt;/>/g;
+ $text =~ s/&quot;/"/g;
+ $text =~ s/&nbsp;/ /g;
+ $text =~ s/&amp;/\&/g;
+ $text =~ s/\s+/ /g;
+ $text =~ s/^ *//;
+ $text =~ s/ *$//;
+ return $text;
+}
+
+$ENV{LANG} = "C";
+$ENV{LC_ALL} = "C";
+open(my $lfh, "find $path -type f -iregex '.*\.html?' -print | sort |") || die("could not open");
+my $id = 0;
+while(defined($path = <$lfh>)){
+ chomp($path);
+ $path = realpath($path);
+ next if(!defined($path));
+ my @stat = stat($path);
+ next if(scalar(@stat) < 10);
+ my $mtime = $stat[9];
+ open(my $ifh, "<$path") || next;
+ my $encname = "UTF-8";
+ my @lines;
+ while(defined(my $line = <$ifh>)){
+ push(@lines, $line);
+ if($line =~ /<meta.*content-type.*charset=/i){
+ $line =~ s/.*charset=?//i;
+ $line =~ s/[^-_a-zA-Z0-9].*//;
+ chomp($line);
+ $encname = $line if(length($line) > 0);
+ }
+ }
+ my $text = join('', @lines);
+ $text = encode("UTF-8", decode($encname, $text)) if($encname ne "UTF-8");
+ $text =~ s/<!--.*?-->//is;
+ my $title = "";
+ if($text =~ /<title[^>]*>[^<]*<\/title>/i){
+ $title = $text;
+ $title =~ s/.*<title[^>]*>([^<]*)<\/title>.*/$1/is;
+ $title = trimhtml($title);
+ }
+ $text =~ s/.*<body[^>]*>(.*)<\/body>.*/$1/is;
+ $text =~ s/<style[^>]*>.*?<\/style>//is;
+ $text =~ s/<script[^>]*>.*?<\/script>//is;
+ $text = trimhtml($text);
+ next if(length($title) < 1 && length($text) < 1);
+ $id++;
+ my $key = $mode_hex ? sprintf("%X", $id) : $id;
+ printf STDERR ("%d: saving: %s\n", $id, $path);
+ if($mode_kv){
+ $text = $title . " " . $text if(length($title) > 0);
+ printf("%s\t%s\n", $key, $text);
+ } else {
+ printf("%s", $key);
+ printf("\turl\t%s", $path);
+ printf("\tsize\t%s", length($text));
+ printf("\tmtime\t%s", $mtime);
+ printf("\ttitle\t%s", $title) if(length($title) > 0);
+ printf("\tbody\t%s", $text);
+ printf("\n");
+ }
+ close($ifh);
+}
+close($lfh);
+
+
+
+# END OF FILE