# INSTALL:
# You need to be running a web server following the common logfile format
# - CLF - on a modern unix system with Perl 5.002 minimum. If your www
# server is not CFL try edit the $TRANSFER_LOG_FORMAT_REGEXP variable and
# places where it is used in the source.
# Just put this script in a cgi directory (/cgi-bin) with the name
# 'top' (and permissions r-xr-xr-x) and edit the server dependant
# variables a few lines below. Then test it and send me a mail to
# congratulate me or to insult me (I'll prefear the first one).
# This script had been tested only on NCSA & Apache server but
# should work with any server respecting the `common logfiles format'
# and running under a modern Unix system. If you modify it to run
# under something different please be kind enough to inform me of the
# changes you made.
# HISTORY:
# - 3.6: Set Cookie debug again
# Local/extern stuff debug
# - 3.5: Debug the proxy stuff.
# Debug of the parameters passing (ie: the &, + ... stuff)
# Debug the start date pb
# More configurations variables
# use FileHandle sometimes
# Better CL management but this is a gross tricky thing
# New `help' argument prints ... some help
# Do not load the whole file now, but line by line reading.
# - 3.4: Bug when logfile is empty corrected.
# Bug of incorrect detection of re arg corrected.
# Adding showconf arg - for local users only.
# Cleaning of some names
# - 3.3: Maintainance version:
# misc debug and improvements, cleaning ...
# - 3.2: Adding & as args separator (for use in FORMS)
# Booleans args could have values 1 or on & 0 or off.
# - 3.1: Some new spiders added.
# Bug correction for use with server cache.
# Display of size of accessed documents
# Many strings comparisons are now case insensitive
# Yes some people still use gopher
# Some HTLM corrections
# - 3.0: Some minor debugging.
# - 2.9: Presentation could be filtered by regexp
# - 2.8: Cookies debug
# - 2.7: Better setting of server port#
# Trying Cookies.
# Unused MAIN caches cleaned when we've time.
# - 2.6: Normalizing urls with #stuff in it
# Filtering of request to web server cache proxy.
# - 2.5: CGI-BIN dir configurable
# More minor Cosmetics changes.
# - 2.4: Callers regexp matchings are now case insensitive.
# Bug correction in filename normalization.
# Configurable WWW Server port number.
# - 2.3: Minor bugs correction in filter list.
# New args: max, recache.
# Parano>1 do not display errors.
# Cosmetics changes.
# More comments in code.
# More portable way to extract date and robots infos.
# - 2.2: Dealing with errors.
# - 2.1: First public release
# TODO:
# - Using DB instead of plain files for CACHES?
# - Need some performances improvements
# ------------------------------------------------------------------------
#use diagnostics -verbose;
#use strict;
use Shell qw(date domainname hostname);
use FileHandle;
require 5.002;
my $author_email = "boubaker\@cena.dgac.fr";
my $author_home = "http://www.cenatls.cena.dgac.fr/~boubaker/";
my $script_version = "3.6";
my $script_name = "top";
my $script_home = $author_home . "WWW/Top.html";
# ------------------------------------------------------------------------
# Server dependant configuration variables: webmaster should modify these
# ------------------------------------------------------------------------
# Root directory of your WWW server (the httpd ServerRoot in httpd.conf)
my $SERVER_ROOT = "/var/www/server";
#my $SERVER_ROOT = "/alt/etc/httpd"; #.www
# Root directory of your WWW server logs files
my $LOGS_ROOT = $SERVER_ROOT . "/logs";
#my $LOGS_ROOT = "/var/adm/httpd"; #.www
# The access log file from the SERVER_ROOT (the httpd TransferLog in httpd.conf)
my $TRANSFER_LOG = $LOGS_ROOT . "/access_log";
#my $TRANSFER_LOG = $LOGS_ROOT . "/access"; #.www
# File to use as a pre-written HTML directory index
# (the httpd DirectoryIndex in srm.conf)
my $DIRECTORY_INDEX = "index.html";
# Regexp to normalize files. That is remove / at end of
# directories and remove /$DIRECTORY_INDEX so that
# all directories access will be /path/dir instead of
# /path/dir/ or /path/dir/index.html
my $FILENAME_NORMALIZE_REGEXP = "\\/\$\|\\/" . quotemeta($DIRECTORY_INDEX) . "\$";
# The domain of your server. Check if domainname return something
# if not do it by hand: my $LOCAL_DOMAIN = "my.local.domain";
my $LOCAL_DOMAIN = domainname(); chomp $LOCAL_DOMAIN;
# Name of the web server machine, Check if hostname return something
# if not do it by hand: my $WWW_SERVER_NAME = "www";
my $WWW_SERVER_NAME = "www";
#my $WWW_SERVER_NAME = hostname() || "www"; chomp $WWW_SERVER_NAME;
# The port number to access the server
#my $WWW_SERVER_PORT = ":88"; #:88
my $WWW_SERVER_PORT = "";
if ( $ENV{'SERVER_PORT'} && $ENV{'SERVER_PORT'} != 80 ) {
$WWW_SERVER_PORT = ":" . $ENV{'SERVER_PORT'};
}
# Fully qualified server address & url
my $WWW_SERVER_ADDRESS = $WWW_SERVER_NAME . "." . $LOCAL_DOMAIN;
my $WWW_SERVER_URL = "http://" . $WWW_SERVER_ADDRESS . $WWW_SERVER_PORT;
# Server Inet address (the server's IP address)
my $WWW_SERVER_INET_ADDR = join( '.', unpack 'C4', ((gethostbyname $WWW_SERVER_NAME)[4])[0] );
# E-Mail address of this site webmaster
my $WEBMASTER_EMAIL = "webmaster\@$WWW_SERVER_ADDRESS";
# cgi-bin dir
my $WWW_SERVER_CGIBIN = "/cgi-bin";
# cgi-bin to read detailled log access files.
# This is a cgi called with a file name as argument
# whose purpose is to print more detailled file access
# me I use readlog (if you don't have it you could ask me)
# but you can use any one you want or leave it blank.
my $FILELOG_CGI = "";
#my $FILELOG_CGI = $WWW_SERVER_CGIBIN . "/readlog?";
# the full url of this script - top -
my $SCRIPT_PATH = $ENV{"SCRIPT_NAME"} || $WWW_SERVER_CGIBIN . "/" . $script_name;
my $SCRIPT_URL = $WWW_SERVER_URL . $SCRIPT_PATH;
# Paranoiac mode? 0=unconscious, 1=little paranoiac, 2=should see a psychiatrist
# parano >0 = only local users could activate
# the nofilter & nocache arguments
# parano >1 = parano 1 + the nb hits are not differencied
# between local and extern if not a local user,
# the filter by regexp is disabled and there
# is no printing of errors and no size
# informations.
my $PARANO_MODE = 1;
# Local Inet mask used to regexp match
# with cgi env REMOTE_ADDR, this do not
# prevent IP spoofing anyway.
#my $LOCAL_ADDRESS_MASK = "^" . quotemeta( join( '', $WWW_SERVER_INET_ADDR =~ /(^\d+\.\d+\.)\d+\.\d+$/ ));
my $LOCAL_ADDRESS_MASK = "^" . quotemeta( join( '', $WWW_SERVER_INET_ADDR =~ /(^\d+\.\d+\.\d+\.)\d+$/ ));
# Regext to apply to a hostname to see if it comes from local
my $IS_LOCAL_REGEXP = "^[^.]+\$\|" . quotemeta( $LOCAL_DOMAIN ) . "\$\|" . $LOCAL_ADDRESS_MASK;
#my $IS_LOCAL_REGEXP = "^[^.]+\$\|\\.cena\\.dgac\\.fr\$\|" . $LOCAL_ADDRESS_MASK; #.www
# List of Filters Regexp, matched now with /i (case insensitive)
# TODO: Compile as explained in perl faq
my @FILTERS_LIST =
(
"\.\(gif\|jpg\|x[bp]m\|rgba?\)\$", # images files
"/i\(mages\|cons\)[^\w]?", # images dirs
"\.class\$", # Java classes
"\.\(au\|aif[fc]?\|wav\|snd\)\$", # sound files
"^\/robots\.txt\$", # The robots file
"^\/?" . quotemeta( $WWW_SERVER_CGIBIN ), # cgi scripts
"^\/?\/bin", # other scripts
"\.\(bin\|exe\)\$", # exe files
"\/private", # private stuff
);
# Top # MAX & # default number
my $TOP_MAX = 100;
my $TOP_NTH_DEFAULT = 10;
# flock operations, see: man flock
my $LOCK_EX = 2; # exclusive lock
my $LOCK_UN = 8; # unlock
# A fast simple grep program, called as follow
# PG 'strint to match' file
my $FAST_GREP_PG = "fgrep";
# The minimal period to refresh
# the cache update in days
my $CACHE_REFRESH_PERIOD = 1;
# The cache directory
# /tmp is a good place because it is(should be) cleaned at machine reboot
my $TMP_DIR = $ENV{"TMPDIR"} || $ENV{"TEMPDIR"} || $ENV{"TMP"} || $ENV{"TEMP"} || "/tmp";
my $CACHE_DIR = $TMP_DIR . "/" . $script_name . "-" . $script_version . "\@" . $WWW_SERVER_NAME . $WWW_SERVER_PORT;
# The TRANSFER_LOG file format, should extract fields:
# (caller, id, user, date, request, status, bytes)
# where:
# caller = machine.name, (host name or IP address)
# id = identd user name, (user name from identd or -)
# user = http auth user name, (user name from http auth or -)
# date = d/M/Y:T GMT,
# file = Url,
# status = number,
# bytes = number or -
# The *REQUIRED* fields are: caller, date, file & status.
# If your WWW server follow the "Common Log Format" (CLF:
# http://www.w3.org/pub/WWW/Daemon/User/Config/Logging.html#common_logfile_format)
# or if you NOT changed it in the httpd configuration files (LogFormat var) you should have no
# problems with the default regexp (NCSA & Apache servers does), if not just modify it yourself.
# Here is an example of modified regexp for a server which had its LogFormat modified as
# follows (LogFormat "%h %l %u %{%d:%m:%Y}t {%r} %s %b")
my $TRANSFER_LOG_FORMAT_REGEXP = '^(\S+)\s+(\S+)\s+(\S+)\s+\[([^\]]*)\]\s+"GET\s+(\S+)\s+\S+"\s+(\d+)\s+(\S+)';
# ^-^caller ^----^date ^-^file ^-^status^^bytes
#my $TRANSFER_LOG_FORMAT_REGEXP = '^(\S+)\s+(\S+)\s+(\S+)\s+([0-9:]+)\s+{GET\s+(\S+)\s+\S+}\s+(\d+)\s+(\S+)'; #.www
# The following is the same as above but extract only the date
my $TRANSFER_LOG_GETDATE_REGEXP = '^\S+\s+\S+\s+\S+\s+\[([^\]]*)\]\s*';
#my $TRANSFER_LOG_GETDATE_REGEXP = '^\S+\s+\S+\s+\S+\s+([0-9:]+)\s*'; #.www
# HTML code inserted in page output, you can change it
# to specify some include virtual for example if your
# server allows it.
my $HTML_HEAD = "\n";
# Title is put here ...
my $HTML_BODY = "\n\n
";
# Body is put here ...
my $HTML_END = "
\n\n";
# Set umask so that files perms are rw-r-----
umask((umask \& 077) | 7);
# ------------------------------------------------------------------------
# You should have nothing to do below this line
# ------------------------------------------------------------------------
# start measuring time from now
my $top_start_time = $^T;
# Is the user `local' or `alien'
my $top_local_user = 0;
# Top called from cl or as a cgi script
my $top_called_from_cl = 0;
# shut up!
my $top_silent = 0;
# The top # variable
my $top_arg_nth = $TOP_NTH_DEFAULT;
# The Nb entries to skip
my $top_arg_skip = 0;
# Unique user ID
my $top_user_id = 0;
# Use cache?
my $top_arg_nocache = 0;
# Rebuild the cache ?
my $top_arg_recache = 0;
# Filter stuff?
my $top_arg_nofilter = 0;
# User have seen enough
my $top_arg_enough = 0;
# Print config
my $top_arg_showconf = 0;
my $top_arg_help = 0;
# Filter by regexp
my $top_arg_regexp_value = "";
my $top_arg_regexp_is_set = 0;
# Cookies
my $top_cookie_is_ok = 0;
my $top_cookie_name = $script_name . "_UID" . "@" . $WWW_SERVER_NAME . $WWW_SERVER_PORT;
my $top_cookie_value = 0;
# expires in $CACHE_REFRESH_PERIOD days
my $top_cookie_expiration_date = topMs2DateGMT( $top_start_time + ($CACHE_REFRESH_PERIOD*24*3600));
# Allow nocache, nofilter & recache args?,
# Disabled in parano mode >0 if caller
# is not from a local address
my $top_args_allowed = 1;
# Allow printing information about local and
# extern callers, files sizes and errors infos?,
# Disabled in parano mode >1 if caller
# is not from a local address
my $top_print_allstuff = 1;
# do the log file contain size informations
my $top_logs_have_sizes = 1;
# Are we displaying the html now?
my $top_currently_displaying_html = 0;
# The requested range
my $top_low_range = 0;
my $top_high_range = 0;
# Cache vars
# the cache file for the process
my $top_use_cache_MAIN = 0;
my $top_cache_name_MAIN = "";
my $top_cache_file_MAIN = 0;
# the global cache files for all processes
# The GLOBAL & MAIN cache contain informations:
# [file nb_total_hits nb_extern_hits nb_local_hits nb_errors file_size total_size]
my $top_create_cache_GLOBAL = 0;
my $top_cache_name_GLOBAL = "";
my $top_cache_file_GLOBAL = 0;
# cache for logs dates
my $top_use_cache_DATES = 0;
my $top_create_cache_DATES = 0;
my $top_cache_name_DATES = "";
my $top_cache_file_DATES = 0;
# cache for robots access
my $top_use_cache_ROBOTS = 0;
my $top_create_cache_ROBOTS = 0;
my $top_cache_name_ROBOTS = "";
my $top_cache_file_ROBOTS = 0;
# The list of all accessed file (file, [nbhits total, external, local, nerrors])
my %top_all_files_hash = ();
my $top_total_files = 0;
my $top_nb_pages_listed = 0;
my $top_total_nb_pages_listed = 0;
# The list of all accessed files sorted by nbhits total
# (nbhits [file ...])
my %top_ordered_files_hash = ();
my $top_nb_ordered = 0;
# The array of all nb hits sorted from hight to low
my @top_nbhits_sorted_tab;
# The array of top nbhits and the indices
# at which it had been extracted from the
# whole list of sorted nbhits.
my @top_nbhits_subtab;
my $top_nbhits_beg_idx;
my $top_nbhits_end_idx;
# List of seach engine [name, nbhits], initialised with some know
# one and dynamicaly filled with some other (those which read /robots.txt file).
my %top_search_engines_hash =
(
"scooter.pa-x.dec.com" => 1, # Altavista
"tarantula.av.pa-x.dec.com" => 1, # Altavista
"vscooter.av.pa-x.dec.com" => 1, # Altavista
"crimpshrine.atext.com" => 1, # Excite
"crawl2.atext.com" => 1, # Excite
"beastie.atext.com" => 1, # Excite
"france.ecila.com" => 1, # Ecilia
"anodin.ecila.com" => 1, # Ecilia
"josephus.netmind.com" => 1, # URL Minder
"leftcoast.netmind.com" => 1, # URL Minder
"galore-bbn.infoseek.com" => 1, # InfoSeek
"backdraft-bbn.infoseek.com" => 1, # InfoSeek
"southpaw-bbn.infoseek.com" => 1, # InfoSeek
"wilbur-bbn.infoseek.com" => 1, # InfoSeek
"j11.inktomi.com" => 1, # InKtomi
"j12.inktomi.com" => 1, # InKtomi
"lycosidae.lycos.com" => 1, # Lycos
"spider1.srv.pgh.lycos.com" => 1, # Lycos
"spider2.srv.pgh.lycos.com" => 1, # Lycos
"spider3.srv.pgh.lycos.com" => 1, # Lycos
"spider4.srv.pgh.lycos.com" => 1, # Lycos
"spider5.srv.pgh.lycos.com" => 1, # Lycos
"spider6.srv.pgh.lycos.com" => 1, # Lycos
"spider7.srv.pgh.lycos.com" => 1, # Lycos
"aspirateur.inria.fr" => 1, # Inria
"tripod.webcrawler.com" => 1, # WebCrawler
"deep.webcrawler.com" => 1, # WebCrawler
"alcorak.x-echo.com" => 1, # X-Echo
"actarus.x-echo.com" => 1, # X-Echo
);
my $top_search_engines_nb = 0;
# Guess what
my $the_robotstxt_file = "/robots.txt";
# Some Dates
my $top_logfile_creation_date = "";
my $top_date_now = "";
# The script's user
my $top_unknown_user_address = "unknown-user-address";
my $top_user_address = $ENV{"REMOTE_ADDR"} || $top_unknown_user_address;
# the method the script was called with
my $top_meth = $ENV{'REQUEST_METHOD'} || "Unknown method";
my $top_meth_post = ($top_meth eq "POST");
my $dummy = 0;
# output DBG stuff
my $B = "";
my $U = "";
if ( -t ) {
$B = "\033[1m";
$U = "\033[m";
}
# Sub topMs2DateGMT -------------------------------------------------------------
sub topMs2DateGMT {
# topMs2DateGMT( date_in_s_since_epoch );
# Used to set Cookies dates:
# http://home.netscape.com/newsref/std/cookie_spec.html
# This is based on RFC 822, RFC 850, RFC 1036, and RFC 1123,
# with the variations that the only legal time zone is GMT
# and the separators between the elements of the date must be dashes.
my @gm_time = gmtime( $_[0] );
return
(('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')[$gm_time[6]]) . ", " .
$gm_time[3] . "-" .
(('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
'Jul', 'Aug', 'Sep', 'Oct', 'Noc', 'Dec')[$gm_time[4]]) . "-" .
# This is a year 2000 potential problem!!!
(1900 + $gm_time[5]) . " " .
$gm_time[2] . ":" .
$gm_time[1] . ":" .
$gm_time[0] . " GMT";
}
# Sub topFooterHTML -------------------------------------------------------------
sub topFooterHTML {
# topFooterHTML();
return if $top_silent;
print << "EO_FOOTER";
$script_name v$script_version (c) 1996 - 1998 heddy Boubaker
C.E.N.A
Just ask me or
GO THERE if you want a Free copy of this script.
$HTML_END
EO_FOOTER
$top_currently_displaying_html = 0;
}
# Sub topWarn -------------------------------------------------------------
sub topWarn {
# topWarn( message );
print STDERR "$B WARNING $U: $_[0]\n";
}
# Sub topDBG -------------------------------------------------------------
sub topDBG {
# topDBG( message );
return if ( !$top_called_from_cl );
print "$B DBG $U: $_[0]\n" unless $top_silent;
}
# Sub topError -------------------------------------------------------------
sub topError {
# topError( message );
my ($message) = @_;
if ( $top_called_from_cl ) {
print STDERR "$B ERROR $U: $message\n";
exit( 0 );
}
# called as cgi script outpurt HTML
if ( $top_currently_displaying_html ) {
# flush html
print "