diff options
Diffstat (limited to 'tcejdb/lab/htmltotsv')
-rwxr-xr-x | tcejdb/lab/htmltotsv | 102 |
1 files changed, 102 insertions, 0 deletions
diff --git a/tcejdb/lab/htmltotsv b/tcejdb/lab/htmltotsv new file mode 100755 index 0000000..3221a84 --- /dev/null +++ b/tcejdb/lab/htmltotsv @@ -0,0 +1,102 @@ +#! /usr/bin/perl + +#================================================================ +# htmldoctotsv +# Generate TSV data from HTML documents +#================================================================ + +use strict; +use warnings; +use Encode; +use Cwd 'realpath'; + +my $path = '.'; +my $mode_kv = 0; +my $mode_hex = 0; + +for(my $i = 0; $i < scalar(@ARGV); $i++){ + my $arg = $ARGV[$i]; + if($arg =~ /^-/){ + if($arg eq '-kv'){ + $mode_kv = 1; + } elsif($arg eq '-x'){ + $mode_hex = 1; + } + } else { + $path = $arg; + } +} + +sub trimhtml { + my $text = shift; + $text =~ s/<[^>]*>/ /g; + $text =~ s/</</g; + $text =~ s/>/>/g; + $text =~ s/"/"/g; + $text =~ s/ / /g; + $text =~ s/&/\&/g; + $text =~ s/\s+/ /g; + $text =~ s/^ *//; + $text =~ s/ *$//; + return $text; +} + +$ENV{LANG} = "C"; +$ENV{LC_ALL} = "C"; +open(my $lfh, "find $path -type f -iregex '.*\.html?' -print | sort |") || die("could not open"); +my $id = 0; +while(defined($path = <$lfh>)){ + chomp($path); + $path = realpath($path); + next if(!defined($path)); + my @stat = stat($path); + next if(scalar(@stat) < 10); + my $mtime = $stat[9]; + open(my $ifh, "<$path") || next; + my $encname = "UTF-8"; + my @lines; + while(defined(my $line = <$ifh>)){ + push(@lines, $line); + if($line =~ /<meta.*content-type.*charset=/i){ + $line =~ s/.*charset=?//i; + $line =~ s/[^-_a-zA-Z0-9].*//; + chomp($line); + $encname = $line if(length($line) > 0); + } + } + my $text = join('', @lines); + $text = encode("UTF-8", decode($encname, $text)) if($encname ne "UTF-8"); + $text =~ s/<!--.*?-->//is; + my $title = ""; + if($text =~ /<title[^>]*>[^<]*<\/title>/i){ + $title = $text; + $title =~ s/.*<title[^>]*>([^<]*)<\/title>.*/$1/is; + $title = trimhtml($title); + } + $text =~ s/.*<body[^>]*>(.*)<\/body>.*/$1/is; + $text =~ s/<style[^>]*>.*?<\/style>//is; + $text =~ s/<script[^>]*>.*?<\/script>//is; + $text = trimhtml($text); + next if(length($title) < 1 && length($text) < 1); + $id++; + my $key = $mode_hex ? sprintf("%X", $id) : $id; + printf STDERR ("%d: saving: %s\n", $id, $path); + if($mode_kv){ + $text = $title . " " . $text if(length($title) > 0); + printf("%s\t%s\n", $key, $text); + } else { + printf("%s", $key); + printf("\turl\t%s", $path); + printf("\tsize\t%s", length($text)); + printf("\tmtime\t%s", $mtime); + printf("\ttitle\t%s", $title) if(length($title) > 0); + printf("\tbody\t%s", $text); + printf("\n"); + } + close($ifh); +} +close($lfh); + + + +# END OF FILE |