#!/usr/bin/perl
# This file can find requirements of html and jhtml files (cgi, gif,
# java dependencies). It is a bit of a hack but it turns out to work
# well. We track only dependencies between relative URLs; absolute
# URLs are assumed to be external to the RPM system. We do not
# parse the HTML but look through the set of strings (text surrounded
# by quotes) for something which looks like a reference. This avoids
# writing a full HTML parser and tends to work really well. In this
# manner we can track dependencies for: href, src, action and other
# HTML tags which have not been invented yet.
# The reference:
#
# href="http://www.perl.org/images/arrow.gif"
#
# does not create a dependency but the reference
#
# href="images/arrow.gif"
#
# will create a dependency.
# Additionally this program will find the requirements for Sun jhtml
# (html with embedded java). Since jhtml is deprecated, so is this
# part of the code.
use File::Basename;
# Pattern of file extensions which are reported as requirements.  It is
# kept as a package global because process_file() matches quoted strings
# against it.
$DEPS_PAT = '\.((cgi)|(ps)|(pdf)|(png)|(jpg)|(gif)|(tiff)|(tif)|(xbm)|(html)|(htm)|(shtml)|(jhtml))$'; #'

# The files to scan are named either on the command line or, when no
# arguments are given, one per line on standard input.
#
# Test the element count of @ARGV, not the truth of the joined string
# "@ARGV": a lone argument of "0" (or "") is a false string, which would
# send us down the stdin branch, where <> would open that file and treat
# its *contents* as the list of filenames.
if (@ARGV) {
    # Use a lexical loop variable: process_file() assigns to the global
    # $_, and aliasing $_ to the elements of @ARGV would let it overwrite
    # the argument list with file contents.
    foreach my $file (@ARGV) {
        process_file($file);
    }
} else {
    # notice we are passed a list of filenames NOT as common in unix the
    # contents of the file.  With no arguments <> reads standard input,
    # so name STDIN explicitly and read it one line at a time;
    # process_file() chomps the trailing newline itself.
    while (defined(my $file = <STDIN>)) {
        process_file($file);
    }
}

# Print every distinct requirement that process_file() recorded as a key
# of %seen, one per line, sorted so the output order is stable.
foreach my $key (sort keys %seen) {
    print "$key\n";
}
sub process_file {
my ($file) = @_;
chomp $file;
open(FILE, "<$file")||
die("$0: Could not open file: '$file' : $!\n");
# we have to suck in the whole file at once because too many people
# split lines around tags.
my (@file) = ;
$_= "@file";
# ignore line based comments ( careful although it has two slashes
# 'http://www.yahoo.com' is not a comment! )
s!^\s*//.*$!!mg;
s!//\s.*$!!mg;
s!\s//.*$!!mg;
# ignore multi-line comments
# (use non greedy operators)
s!/\*.*?\*/!!g;
s///g;
# html references other html documents inside strings. Ignore non
# relative references since these dependencies can not be met. (ie,
# no package you install will ever provide 'http://www.yahoo.com').
# I use basename since I have seen too many http references which
# begin with '../'; this would just kill the dependency tracking
# mechanism.
while ( m{\"([^\"]+)\"}g ) {
my $string = $1;
chomp $string;
if ( ( $string !~ m!http://! ) &&
( $string =~ m!$DEPS_PAT! ) ) {
$string = basename($string);
$string =~ s!\s+!!g;
$seen{"http(${string})"} = 1;
}
}
{
# This section is only for use with (Sun) jhtml dependencies, and
# since jhtml is deprecated so is this code.
# java imports in jhtml (may have stars for leaf class)
# these may span several lines
while ( m!\s*([^<]+)\s*