#!/usr/bin/perl -w
# tree.pl. Creates a structured HTML list ('sitemap') of files.
# Copyright (C) 1998-2000 Daniel Naber <dnaber@mini.gt.owl.de>
# version 1.19, 2000-08-27 (version number is independent from java version)
# Latest version: http://www.ev-stift-gymn.guetersloh.de/server/tree_e.html
#
# HOW TO USE THIS:
# 1. On Unix, check if the path to perl in the very first line is correct.
# 2. Go to the configuration section and adapt the values. Note that
#  the most important values are at the top, so if something
#  fails the error is probably there.
# 3. On Unix, set execute permission: chmod u+x tree.pl
# 4. Call the script from command line: ./tree.pl [htmldir] >output.html
#  or call it as CGI (set $cgi = 1): http://yourdomain/cgi-bin/tree.pl
#  (if it doesn't work as CGI, see:
#  http://www.perl.com/CPAN-local/doc/FAQs/cgi/idiots-guide.html)
#
# TODO/BUGS/PROBLEMS:
# -add options $listdate, $listtime
# -links beneath $htmldir will be ignored
# -due to stupid program design, you cannot make a tree that only 
#	consists of files given in @indexfiles
# -html output has a small flaw: <ul>\n</ul>, i.e. an "empty list",
#	as weblint complaints - most browsers donīt care
# -ignores meta tags for robots
#
# CHANGES (important ones):
# 1997-09-07: first version
# 1999-08-29: added $filetag_end, thanks to Doug Melton; small html cleanup
# 1999-09-15: put out more $filetag_end, thanks  to Georg Bischof <bischof@dkrz.de>
# 1999-12-03: trailing slash now auotmatically chopped off, thanks to
#	Mark Williams <miw@SLAC.Stanford.EDU>
# 2000-04-04: print out warning if $htmldir isn't an absolute path;
#	getdate() now uses a more "modern" way to get the date
# 2000-05-10: files without <title> are now also linked (only if
#	$listwithouttitle = 1, thanks to SKiNNY <skinny1@xs4all.nl>);
#	added 'BASE' documentation to tree-template.html
# 2000-08-09: URLs with special chars are now encoded, thanks
#	 to Martin Konicsek <M.Konicsek@logics.de>
#
# COPYRIGHT:
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

# =================================================================================================
# === user-configurable options ===================================================================
# =================================================================================================

if( $ARGV[0] ) {
	$htmldir = $ARGV[0];
} else {
	# directory with the html files (may be overriden by command line argument)
	# on Windows: use the slash (/) instead of double backslash (\\)
	$htmldir = "/raid3/new062001";
}

# take this file as a template to build the output page (try to use an absolute path
# if the script fails, escpecially as CGI):
$templatefile = "/raid1/cdroms/sitemap.programs/tree-template.html";
# comment in the next line (unmodified) and you'll just get the list:
#$templatefile = "";

$cgi = 0;		# use this as a cgi script? (0=no, 1=yes)
$baseurl = "http://rubens.anu.edu.au/new.japan";		# this will be the first part of any URL (normally you can leave it empty)

@indexfiles = ('index.html', 'index.shtml');		# default-files' names
@inchtml = ('shtml', 'html', 'htm');			# take files with these suffixes as HTML files
#@incpics = ('gif', 'jpg', 'jpeg', 'png');		# include pictures with these suffixes
@incpics = ();						# don't include pictures
$listsize = 0;						# include size in kb for every file? (0=no, 1=yes)

# set the following options both to 1 to generate a list you can use offline:
$offline = 0;		# make links offline-enabled (use from local disk) (0=no, 1=yes)
$indexrefs = 1;		# make links to 'dir/index.html' etc. (instead of 'dir/') (0=no, 1=yes)

# do only include these files/directories, use '*' as a wildcard,
# use '@includefiles = ();' to  include all files except those in @excludefiles:
@includefiles = ();
# do not include these files/directories:
@excludefiles = ('/secret/*', '/anothersecret/*');

# include html files without <title>...</title> resp. with empty title? (0=no, 1=yes):
$listwithouttitle = 1;

# output file. Set path relative to $htmldir. This is only so that the generated list
# won't have a link to itself:
$self = "/about/sitemap.html";
$selftitle = "Sitemap";			# how to call this script's output in the sitemap

#$date = "YEAR-MONTH-DAY";		# date according to ISO 8601
$date = "DAY.MONTH.YEAR";		# german format
#$date = "MONTH/DAY/YEAR";		# american format

# for those of you who like the plain output:
#$dirtag      = '<ul>';
#$dirtag_end  = '</ul>';
#$foldertag   = '<li>';
#$htmltag     = '<li>';
#$pictag      = '<li>';
#$nolinktag   = '<li>';
#$filetag_end = '</li>';

# for those who like output with an icon in front of every item
# (you better also set the width and height):
$dirtag      = '<dl>';
$dirtag_end  = '</dl>';
$foldertag   = '<dt><img src="ballred.gif" height=8>';
#$htmltag     = '<dt><img src="tree_img/generic.gif" alt="html file">';
#$pictag      = '<dt><img src="tree_img/image2.gif" alt="picture">';
#$nolinktag   = '<dt><img src="tree_img/folder.open.gif" alt="other file">';
$filetag_end = '</dt>';

# how to mark files that changed not long ago:
#$modifiedtag = '<img src="/images/new.gif" alt="page updated recently" hspace="3">';
$modifiedtime = 3*24;		# mark files that are not older than $modifiedtime hours (0 = option off)

# =================================================================================================
# === no configuration below ======================================================================
# =================================================================================================

# the prototypes:
sub getdate();
sub init();
sub first_part_output();
sub doperfile();
sub list_output();
sub last_part_output();
sub getdefaultfile($);
sub getinfo($$);
sub is_it_modified($);
sub getsize($);
sub isfileclass($@);
sub isfile($@);
sub load_part($);
sub load($);
sub minzero($);

# $st = time();		# comment in if you're interested in runtime
use URI::Escape;
use File::Find;
$depth = 0;
($htmlct, $htmlsize, $picct, $picsize) = (0, 0, 0, 0);	# count size und number
$partlist = "";
getdate();
init();
first_part_output();
find(\&doperfile, $htmldir);
list_output();
last_part_output();
# $diff = time() - $st; print STDERR "time: $diff secs\n";	# see above
exit;

# --------------------------------------------------------------------------

# Sets the global variable $date to the date that's now.
sub getdate() {
	use Time::localtime;
	my $year = localtime->year() + 1900;
	my $mon = localtime->mon() + 1;
	my $day = localtime->mday();
	$mon = "0".$mon if( length($mon) == 1 );
	$day = "0".$day if( length($day) == 1 );
	$date =~ s#YEAR#$year#i;
	$date =~ s#MONTH#$mon#i;
	$date =~ s#DAY#$day#i;
}

# Does some checks and notifies the user of problems.
sub init() {
	if( $cgi ) {
		select(STDOUT); $| = 1;
		$nph = 1 if( $0 =~ m#nph-tree# );
		print "HTTP/1.0 200 OK\n" if( $nph );
		print "Content-Type: text/html\n\n";
	}
	if( ! -d $htmldir ) {
		die "Error: $0: '$htmldir' does not exist, is not a directory or is not readable.\n";
	}
	if( -l $htmldir ) {
		die "Error: $0: cannot start on a symbolic link (symbolic links\n below the html directory will be ignored anyway): '$htmldir'.\n";
		exit;
	}
	if( $htmldir !~ m#^/# ) {
		print STDERR "Warning: $0: Try setting \$htmldir to an absolute path for better results.\n";
	}
	if( $htmldir =~ m#/$# ) {		# possibly remove trailing slash
		chop($htmldir);
	}
	my $expat;				# enable '*' as wildcard in @excludefiles
	foreach $expat (@excludefiles) {
		$expat =~ s#\*#.*?#g;
	}
	foreach $expat (@includefiles) {	# the same in @includefiles
		$expat =~ s#\*#.*?#g;
	}
}

# Print out the first part of the list. Loads template file, substitutes date from the template.
sub first_part_output() {
	if( $templatefile ) {
		$output = load($templatefile);
	} else {
		$output = "";
	}
	$output =~ s#<!-- \$date -->#$date#igs;
	my ($first_part) = ($output =~ m#^(.*?)<!-- \$list -->#is);
	$first_part = "" if ( ! defined($first_part) );		# avoid warning
	print $first_part;
}

# This is the function connected to 'find', i.e. it's called once for every file in the tree.
sub doperfile() {
	my $thisfile = $File::Find::name;			# the filename that Find provides for us
	$thisfile .= "/" if( -d $thisfile );
	my ($thisfile_rel) = ($thisfile =~ m#^$htmldir(/.*)#);	# part after $htmldir
	if( ! $thisfile_rel ) {			# works better with relative path in $htmldir
		$thisfile_rel = ".";
	}
	my $expat;
	# include only files from @includefiles:
	if( scalar(@includefiles) >= 1 ) {
		my $do_use = 0;
		foreach $expat (@includefiles) {
			if( $thisfile_rel =~ m#^$expat$# ) {
				$do_use = 1;
				last;
			}
		}
		return if( ! $do_use );
	}
	
	# exclude files from @excludefiles:
	foreach $expat (@excludefiles) {
		return if( $thisfile_rel =~ m#^$expat$# );
	}

	if( ! isfile($thisfile, @indexfiles)
		&& ($thisfile =~ m#/$# || isfileclass($thisfile, @inchtml) || isfileclass($thisfile, @incpics)) ) {
		push(@filelist, $thisfile);
	}
}

# Make the list.
sub list_output() {
	my $thisfile;
	my $dirsdone = "";			# have we been here already?
	my $thisdir = "";
	my $dirtag_ct = 0;
	@filelist = sort(@filelist);
	print "$dirtag\n";
	$dirtag_ct++;
	# This is the loop which generated the nested list:
	foreach $thisfile (@filelist) {
		($url) = ($thisfile =~ m#$htmldir(/.*)#i);
		$olddepth = $depth;
		$depth = ($url =~ s#/#/#gi);    		# 1 = html-root
		$olddir = $thisdir;
		($thisdir) = ($url =~ m#(.*/).*?#i);
		if( $thisdir ne $olddir && ! ($dirsdone =~ m#^\Q$thisdir\E$#m) ) {	# deeper level or same level
			$dirsdone .= "$thisdir\n";
			$partlist .= "$dirtag_end\n" x ($olddepth-$depth+1);
			$dirtag_ct -= minzero($olddepth-$depth+1);
			$partlist .= " $nolinktag$baseurl$url$filetag_end\n" if( ! getdefaultfile("$htmldir$thisdir") );
			$partlist .= "$dirtag\n";
			$dirtag_ct++;
		} elsif( ! ($thisdir =~ m#\Q$olddir\E#i) ) {	# higher level
			$partlist .= "$dirtag_end\n" x ($olddepth-$depth);
			$dirtag_ct -= minzero($olddepth-$depth);
			$partlist .= getinfo($thisfile, 0);
		} else {					# same level as before
			$partlist .=  getinfo($thisfile, 0);
			$partlist =~ s#$dirtag\n$dirtag_end\n##ig;	# clean up HTML
			print $partlist;
			$partlist = "";
		}
	}
	print $partlist;
	# close list correctly:
	print "$dirtag_end\n" x $dirtag_ct;
}

# Prints the last part of the list. Substitutes some strings that might be in the template.
sub last_part_output() {
	$htmlsize = int($htmlsize/1000);		# size in kB
	$picsize = int($picsize/1000);
	$output =~ s#<!-- \$htmlct -->#$htmlct#igs;
	$output =~ s#<!-- \$htmlsize -->#$htmlsize#igs;
	$output =~ s#<!-- \$picsct -->#$picct#igs;
	$output =~ s#<!-- \$picsize -->#$picsize#igs;
	my ($last_part) = ($output =~ m#<!-- \$list -->(.*)$#is);
	$last_part = "" if ( ! defined($last_part) );	# avoid warning
	print $last_part;
}

# --------------------------------------------------------------------------

# Get default file (e.g. index.html) for a directory. Return 0 if there is no default file.
sub getdefaultfile($) {
	my $dir = shift;
	my $item;
	foreach $item (@indexfiles) {
	 	if( -e "$dir$item" ) {		# there's a defaultfile
			$partlist .= getinfo("$dir$item", 1);
			return $item;
		}
	}
	return 0;
}

# Get the string that will make up a file's entry in the list.
sub getinfo($$) {
	my $thisfile = shift;
	my $isindexfile = shift;
	my ($suffix) = ($thisfile =~ m#.*\.(.*)#);
	my ($size, $exactsize) = getsize($thisfile);
	my $entry = "";
	my $linkurl;
	$offline ? ($linkurl = $htmldir.$url) : ($linkurl = $url);
	$linkurl = $baseurl.$linkurl;
	if( isfileclass($thisfile, @inchtml) ) {
		$htmlsize += $exactsize;
		$htmlct++;
		my $string = load_part($thisfile);
		my $icontag = " $htmltag";
		$icontag = " $foldertag" if( $isindexfile );
		if( $thisfile eq "$htmldir$self" ) {			# output file itself
			$entry .= " $nolinktag$selftitle$filetag_end";
		} else {
			my ($titlestring) = ($string =~ m#<title>(.*?)</title>#is);	# common case
			if( $listwithouttitle && (! $titlestring || $titlestring =~ m/^\s*$/) ) {
				$titlestring = $baseurl.$url;
			}
			if( $titlestring && $titlestring !~ m/^\s*$/ ) {
				$entry .= $icontag;
				$entry .= $modifiedtag if( is_it_modified($thisfile) );
				$entry .= "<a href=\"".uri_escape($linkurl)."\">$titlestring";
				$entry .= " ($size&nbsp;kB)" if( $listsize );
				$entry .= "</a>$filetag_end";
			}
		}
		$entry .= "\n";
	} elsif( isfileclass($thisfile, @incpics) ) {
		$picsize += $exactsize;
		$picct++;
		my ($filenameonly) = ($url =~ m#.*/(.*)#i);
		$entry .= " $pictag";
		$entry .= $modifiedtag if( is_it_modified($thisfile) );
		$entry .= "<a href=\"".uri_escape($linkurl)."\">$filenameonly";
		$entry .= " ($size&nbsp;kB)" if( $listsize );
		$entry .= "</a>$filetag_end\n";
	}

	# links to dir/ or to dir/index.html (see configuration section)
	if( $indexrefs ) {
		my ($filepart) = ($thisfile =~ m#.*/(.*)#);
		$entry =~ s#href="(.*?/)"#href="$1$filepart"#i;
	}

	return $entry;
}

# Check if file was modified recently.
sub is_it_modified($) {
	my $filename = shift;
	($mtime) = (stat($filename))[9];
	if( $modifiedtime && ((time() - $mtime) < ($modifiedtime*60*60)) ) {
		return 1;
	} else {
		return 0;
	}
}

# Get file's size in (kB, bytes).
sub getsize($) {
	my $file = shift;
	my $exactsize = -s ($file);
	my $size = int($exactsize/1000);
	$size = 1 if( $size == 0 );
	return $size, $exactsize;
}

# Check if file's suffix is in an array of suffixes.
sub isfileclass($@) {
	my $file = shift;
	my @fileclass = @_;
	my $item;
	foreach $item (@fileclass) {
		return 1 if( $file =~ m#\.$item$# );
	}
	return 0;
}

# Check filename, e.g. if it's an index.html-like file.
sub isfile($@) {
	my $file = shift;
	my @files = @_;
	my $item;
	foreach $item (@files) {
		return 1 if( $file =~ m#/$item$# );
	}
	return 0;
}

# Only load part of a file till </title> is reached.
sub load_part($) {
	my $file = shift;
	my $string = "";
	open(INPUT, "<$file") || die "Error: $0: Cannot open '$file': $!";
	while(<INPUT>) {
		$string .= $_;
		last if( $_ =~ m#</title>#i );
	}
	close(INPUT);
	$string = "" if( ! defined($string) && $file =~ m#^$htmldir/$self$# );	# avoid warning
	return $string;
}

# Load a file. Takes filename, returns file's contents.
sub load($) {
	my $file = shift;
	my $string;
	open(INPUT, "<$file") || die "Error: $0: Cannot open '$file': $!";
	undef $/;
	$string = (<INPUT>);
	$/ = "\n";
	close(INPUT);
	return $string;
}

# Returns 0 if argument is < 0, returns the argument otherwise.
sub minzero($) {
	my $var = shift;
	if( $var > 0 ) {
		return $var;
	} else {
		return 0;
	}
}
