#!/usr/local/bin/perl -- # ------------------------------------------------------------------------ # Top # cgi-bin script: # (c) 1996 - 1998 heddy Boubaker C.E.N.A # Display the top # accessed pages of the WWW server. # VERSION: 3.6 # USAGE: # # # # # - number is the top# (default is 10, see $TOP_NTH_DEFAULT). # - number should be replaced by 'max' which will return the # max # (which is 100 by default, see $TOP_MAX). # - nskip is the number of first entries in the top to skip. # *Advanced users or webmasters should look at the sub # topParseCgiArgs for a complete list of all availables # arguments, see the declarations of related variable # to get a short description of their effect. # *Presentation of result could be filtered by regexp # &re='regexp'. This feature could be removed with parano mode. # *Argument `help' could be usefull too ... # # The script could be run from the command line too just # quote the arguments and pass them as a cgi-scrip, # for ex: top 'recache&nofilter' # - When called from CL there is a new argument 'silent' # which telll top to generate NO output, that could # be nice when putting top in a cron tab... # LICENCE: # This software is totaly freeware except for Microsoft employees. # In this last case the individual licence is 100000 US$ and do not include # any guaranties whatsoever nor any support of any kind nor any rights for any # further release, you've not right to look at the code too and any # little piece of this code is protected by anti-Microsoft copyright, # just pay and shut up ! # DISCLAIMER: USE AT YOUR OWN RISK ! # DETAILS: # Top have a system of dynamic cache, a system of results # filtering, a parano mode, a minimal errors management system # and son on. Everything is configurable look at the variables # below. # AVAILABILITY: # The script has it's own home page where you can find many more # informations about configuration, usage and so on @ # # The latest release could be found at # # The latest release could be seen et (this is on our web server which is # a very busy machine and that you access through the slowest Internet # link in the world, so if you encounter dramatic performances it is # not because of top). # # INSTALL: # You need to be running a web server following the common logfile format # - CLF - on a modern unix system with Perl 5.002 minimum. If your www # server is not CFL try edit the $TRANSFER_LOG_FORMAT_REGEXP variable and # places where it is used in the source. # Just put this script in a cgi directory (/cgi-bin) with the name # 'top' (and permissions r-xr-xr-x) and edit the server dependant # variables a few lines below. Then test it and send me a mail to # congratulate me or to insult me (I'll prefear the first one). # This script had been tested only on NCSA & Apache server but # should work with any server respecting the `common logfiles format' # and running under a modern Unix system. If you modify it to run # under something different please be kind enough to inform me of the # changes you made. # HISTORY: # - 3.6: Set Cookie debug again # Local/extern stuff debug # - 3.5: Debug the proxy stuff. # Debug of the parameters passing (ie: the &, + ... stuff) # Debug the start date pb # More configurations variables # use FileHandle sometimes # Better CL management but this is a gross tricky thing # New `help' argument prints ... some help # Do not load the whole file now, but line by line reading. # - 3.4: Bug when logfile is empty corrected. # Bug of incorrect detection of re arg corrected. # Adding showconf arg - for local users only. # Cleaning of some names # - 3.3: Maintainance version: # misc debug and improvements, cleaning ... # - 3.2: Adding & as args separator (for use in FORMS) # Booleans args could have values 1 or on & 0 or off. # - 3.1: Some new spiders added. # Bug correction for use with server cache. # Display of size of accessed documents # Many strings comparisons are now case insensitive # Yes some people still use gopher # Some HTLM corrections # - 3.0: Some minor debugging. # - 2.9: Presentation could be filtered by regexp # - 2.8: Cookies debug # - 2.7: Better setting of server port# # Trying Cookies. # Unused MAIN caches cleaned when we've time. # - 2.6: Normalizing urls with #stuff in it # Filtering of request to web server cache proxy. # - 2.5: CGI-BIN dir configurable # More minor Cosmetics changes. # - 2.4: Callers regexp matchings are now case insensitive. # Bug correction in filename normalization. # Configurable WWW Server port number. # - 2.3: Minor bugs correction in filter list. # New args: max, recache. # Parano>1 do not display errors. # Cosmetics changes. # More comments in code. # More portable way to extract date and robots infos. # - 2.2: Dealing with errors. # - 2.1: First public release # TODO: # - Using DB instead of plain files for CACHES? # - Need some performances improvements # ------------------------------------------------------------------------ #use diagnostics -verbose; #use strict; use Shell qw(date domainname hostname); use FileHandle; require 5.002; my $author_email = "boubaker\@cena.dgac.fr"; my $author_home = "http://www.cenatls.cena.dgac.fr/~boubaker/"; my $script_version = "3.6"; my $script_name = "top"; my $script_home = $author_home . "WWW/Top.html"; # ------------------------------------------------------------------------ # Server dependant configuration variables: webmaster should modify these # ------------------------------------------------------------------------ # Root directory of your WWW server (the httpd ServerRoot in httpd.conf) my $SERVER_ROOT = "/var/www/server"; #my $SERVER_ROOT = "/alt/etc/httpd"; #.www # Root directory of your WWW server logs files my $LOGS_ROOT = $SERVER_ROOT . "/logs"; #my $LOGS_ROOT = "/var/adm/httpd"; #.www # The access log file from the SERVER_ROOT (the httpd TransferLog in httpd.conf) my $TRANSFER_LOG = $LOGS_ROOT . "/access_log"; #my $TRANSFER_LOG = $LOGS_ROOT . "/access"; #.www # File to use as a pre-written HTML directory index # (the httpd DirectoryIndex in srm.conf) my $DIRECTORY_INDEX = "index.html"; # Regexp to normalize files. That is remove / at end of # directories and remove /$DIRECTORY_INDEX so that # all directories access will be /path/dir instead of # /path/dir/ or /path/dir/index.html my $FILENAME_NORMALIZE_REGEXP = "\\/\$\|\\/" . quotemeta($DIRECTORY_INDEX) . "\$"; # The domain of your server. Check if domainname return something # if not do it by hand: my $LOCAL_DOMAIN = "my.local.domain"; my $LOCAL_DOMAIN = domainname(); chomp $LOCAL_DOMAIN; # Name of the web server machine, Check if hostname return something # if not do it by hand: my $WWW_SERVER_NAME = "www"; my $WWW_SERVER_NAME = "www"; #my $WWW_SERVER_NAME = hostname() || "www"; chomp $WWW_SERVER_NAME; # The port number to access the server #my $WWW_SERVER_PORT = ":88"; #:88 my $WWW_SERVER_PORT = ""; if ( $ENV{'SERVER_PORT'} && $ENV{'SERVER_PORT'} != 80 ) { $WWW_SERVER_PORT = ":" . $ENV{'SERVER_PORT'}; } # Fully qualified server address & url my $WWW_SERVER_ADDRESS = $WWW_SERVER_NAME . "." . $LOCAL_DOMAIN; my $WWW_SERVER_URL = "http://" . $WWW_SERVER_ADDRESS . $WWW_SERVER_PORT; # Server Inet address (the server's IP address) my $WWW_SERVER_INET_ADDR = join( '.', unpack 'C4', ((gethostbyname $WWW_SERVER_NAME)[4])[0] ); # E-Mail address of this site webmaster my $WEBMASTER_EMAIL = "webmaster\@$WWW_SERVER_ADDRESS"; # cgi-bin dir my $WWW_SERVER_CGIBIN = "/cgi-bin"; # cgi-bin to read detailled log access files. # This is a cgi called with a file name as argument # whose purpose is to print more detailled file access # me I use readlog (if you don't have it you could ask me) # but you can use any one you want or leave it blank. my $FILELOG_CGI = ""; #my $FILELOG_CGI = $WWW_SERVER_CGIBIN . "/readlog?"; # the full url of this script - top - my $SCRIPT_PATH = $ENV{"SCRIPT_NAME"} || $WWW_SERVER_CGIBIN . "/" . $script_name; my $SCRIPT_URL = $WWW_SERVER_URL . $SCRIPT_PATH; # Paranoiac mode? 0=unconscious, 1=little paranoiac, 2=should see a psychiatrist # parano >0 = only local users could activate # the nofilter & nocache arguments # parano >1 = parano 1 + the nb hits are not differencied # between local and extern if not a local user, # the filter by regexp is disabled and there # is no printing of errors and no size # informations. my $PARANO_MODE = 1; # Local Inet mask used to regexp match # with cgi env REMOTE_ADDR, this do not # prevent IP spoofing anyway. #my $LOCAL_ADDRESS_MASK = "^" . quotemeta( join( '', $WWW_SERVER_INET_ADDR =~ /(^\d+\.\d+\.)\d+\.\d+$/ )); my $LOCAL_ADDRESS_MASK = "^" . quotemeta( join( '', $WWW_SERVER_INET_ADDR =~ /(^\d+\.\d+\.\d+\.)\d+$/ )); # Regext to apply to a hostname to see if it comes from local my $IS_LOCAL_REGEXP = "^[^.]+\$\|" . quotemeta( $LOCAL_DOMAIN ) . "\$\|" . $LOCAL_ADDRESS_MASK; #my $IS_LOCAL_REGEXP = "^[^.]+\$\|\\.cena\\.dgac\\.fr\$\|" . $LOCAL_ADDRESS_MASK; #.www # List of Filters Regexp, matched now with /i (case insensitive) # TODO: Compile as explained in perl faq my @FILTERS_LIST = ( "\.\(gif\|jpg\|x[bp]m\|rgba?\)\$", # images files "/i\(mages\|cons\)[^\w]?", # images dirs "\.class\$", # Java classes "\.\(au\|aif[fc]?\|wav\|snd\)\$", # sound files "^\/robots\.txt\$", # The robots file "^\/?" . quotemeta( $WWW_SERVER_CGIBIN ), # cgi scripts "^\/?\/bin", # other scripts "\.\(bin\|exe\)\$", # exe files "\/private", # private stuff ); # Top # MAX & # default number my $TOP_MAX = 100; my $TOP_NTH_DEFAULT = 10; # flock operations, see: man flock my $LOCK_EX = 2; # exclusive lock my $LOCK_UN = 8; # unlock # A fast simple grep program, called as follow # PG 'strint to match' file my $FAST_GREP_PG = "fgrep"; # The minimal period to refresh # the cache update in days my $CACHE_REFRESH_PERIOD = 1; # The cache directory # /tmp is a good place because it is(should be) cleaned at machine reboot my $TMP_DIR = $ENV{"TMPDIR"} || $ENV{"TEMPDIR"} || $ENV{"TMP"} || $ENV{"TEMP"} || "/tmp"; my $CACHE_DIR = $TMP_DIR . "/" . $script_name . "-" . $script_version . "\@" . $WWW_SERVER_NAME . $WWW_SERVER_PORT; # The TRANSFER_LOG file format, should extract fields: # (caller, id, user, date, request, status, bytes) # where: # caller = machine.name, (host name or IP address) # id = identd user name, (user name from identd or -) # user = http auth user name, (user name from http auth or -) # date = d/M/Y:T GMT, # file = Url, # status = number, # bytes = number or - # The *REQUIRED* fields are: caller, date, file & status. # If your WWW server follow the "Common Log Format" (CLF: # http://www.w3.org/pub/WWW/Daemon/User/Config/Logging.html#common_logfile_format) # or if you NOT changed it in the httpd configuration files (LogFormat var) you should have no # problems with the default regexp (NCSA & Apache servers does), if not just modify it yourself. # Here is an example of modified regexp for a server which had its LogFormat modified as # follows (LogFormat "%h %l %u %{%d:%m:%Y}t {%r} %s %b") my $TRANSFER_LOG_FORMAT_REGEXP = '^(\S+)\s+(\S+)\s+(\S+)\s+\[([^\]]*)\]\s+"GET\s+(\S+)\s+\S+"\s+(\d+)\s+(\S+)'; # ^-^caller ^----^date ^-^file ^-^status^^bytes #my $TRANSFER_LOG_FORMAT_REGEXP = '^(\S+)\s+(\S+)\s+(\S+)\s+([0-9:]+)\s+{GET\s+(\S+)\s+\S+}\s+(\d+)\s+(\S+)'; #.www # The following is the same as above but extract only the date my $TRANSFER_LOG_GETDATE_REGEXP = '^\S+\s+\S+\s+\S+\s+\[([^\]]*)\]\s*'; #my $TRANSFER_LOG_GETDATE_REGEXP = '^\S+\s+\S+\s+\S+\s+([0-9:]+)\s*'; #.www # HTML code inserted in page output, you can change it # to specify some include virtual for example if your # server allows it. my $HTML_HEAD = "\n"; # Title is put here ... my $HTML_BODY = "\n\n
"; # Body is put here ... my $HTML_END = "
\n\n"; # Set umask so that files perms are rw-r----- umask((umask \& 077) | 7); # ------------------------------------------------------------------------ # You should have nothing to do below this line # ------------------------------------------------------------------------ # start measuring time from now my $top_start_time = $^T; # Is the user `local' or `alien' my $top_local_user = 0; # Top called from cl or as a cgi script my $top_called_from_cl = 0; # shut up! my $top_silent = 0; # The top # variable my $top_arg_nth = $TOP_NTH_DEFAULT; # The Nb entries to skip my $top_arg_skip = 0; # Unique user ID my $top_user_id = 0; # Use cache? my $top_arg_nocache = 0; # Rebuild the cache ? my $top_arg_recache = 0; # Filter stuff? my $top_arg_nofilter = 0; # User have seen enough my $top_arg_enough = 0; # Print config my $top_arg_showconf = 0; my $top_arg_help = 0; # Filter by regexp my $top_arg_regexp_value = ""; my $top_arg_regexp_is_set = 0; # Cookies my $top_cookie_is_ok = 0; my $top_cookie_name = $script_name . "_UID" . "@" . $WWW_SERVER_NAME . $WWW_SERVER_PORT; my $top_cookie_value = 0; # expires in $CACHE_REFRESH_PERIOD days my $top_cookie_expiration_date = topMs2DateGMT( $top_start_time + ($CACHE_REFRESH_PERIOD*24*3600)); # Allow nocache, nofilter & recache args?, # Disabled in parano mode >0 if caller # is not from a local address my $top_args_allowed = 1; # Allow printing information about local and # extern callers, files sizes and errors infos?, # Disabled in parano mode >1 if caller # is not from a local address my $top_print_allstuff = 1; # do the log file contain size informations my $top_logs_have_sizes = 1; # Are we displaying the html now? my $top_currently_displaying_html = 0; # The requested range my $top_low_range = 0; my $top_high_range = 0; # Cache vars # the cache file for the process my $top_use_cache_MAIN = 0; my $top_cache_name_MAIN = ""; my $top_cache_file_MAIN = 0; # the global cache files for all processes # The GLOBAL & MAIN cache contain informations: # [file nb_total_hits nb_extern_hits nb_local_hits nb_errors file_size total_size] my $top_create_cache_GLOBAL = 0; my $top_cache_name_GLOBAL = ""; my $top_cache_file_GLOBAL = 0; # cache for logs dates my $top_use_cache_DATES = 0; my $top_create_cache_DATES = 0; my $top_cache_name_DATES = ""; my $top_cache_file_DATES = 0; # cache for robots access my $top_use_cache_ROBOTS = 0; my $top_create_cache_ROBOTS = 0; my $top_cache_name_ROBOTS = ""; my $top_cache_file_ROBOTS = 0; # The list of all accessed file (file, [nbhits total, external, local, nerrors]) my %top_all_files_hash = (); my $top_total_files = 0; my $top_nb_pages_listed = 0; my $top_total_nb_pages_listed = 0; # The list of all accessed files sorted by nbhits total # (nbhits [file ...]) my %top_ordered_files_hash = (); my $top_nb_ordered = 0; # The array of all nb hits sorted from hight to low my @top_nbhits_sorted_tab; # The array of top nbhits and the indices # at which it had been extracted from the # whole list of sorted nbhits. my @top_nbhits_subtab; my $top_nbhits_beg_idx; my $top_nbhits_end_idx; # List of seach engine [name, nbhits], initialised with some know # one and dynamicaly filled with some other (those which read /robots.txt file). my %top_search_engines_hash = ( "scooter.pa-x.dec.com" => 1, # Altavista "tarantula.av.pa-x.dec.com" => 1, # Altavista "vscooter.av.pa-x.dec.com" => 1, # Altavista "crimpshrine.atext.com" => 1, # Excite "crawl2.atext.com" => 1, # Excite "beastie.atext.com" => 1, # Excite "france.ecila.com" => 1, # Ecilia "anodin.ecila.com" => 1, # Ecilia "josephus.netmind.com" => 1, # URL Minder "leftcoast.netmind.com" => 1, # URL Minder "galore-bbn.infoseek.com" => 1, # InfoSeek "backdraft-bbn.infoseek.com" => 1, # InfoSeek "southpaw-bbn.infoseek.com" => 1, # InfoSeek "wilbur-bbn.infoseek.com" => 1, # InfoSeek "j11.inktomi.com" => 1, # InKtomi "j12.inktomi.com" => 1, # InKtomi "lycosidae.lycos.com" => 1, # Lycos "spider1.srv.pgh.lycos.com" => 1, # Lycos "spider2.srv.pgh.lycos.com" => 1, # Lycos "spider3.srv.pgh.lycos.com" => 1, # Lycos "spider4.srv.pgh.lycos.com" => 1, # Lycos "spider5.srv.pgh.lycos.com" => 1, # Lycos "spider6.srv.pgh.lycos.com" => 1, # Lycos "spider7.srv.pgh.lycos.com" => 1, # Lycos "aspirateur.inria.fr" => 1, # Inria "tripod.webcrawler.com" => 1, # WebCrawler "deep.webcrawler.com" => 1, # WebCrawler "alcorak.x-echo.com" => 1, # X-Echo "actarus.x-echo.com" => 1, # X-Echo ); my $top_search_engines_nb = 0; # Guess what my $the_robotstxt_file = "/robots.txt"; # Some Dates my $top_logfile_creation_date = ""; my $top_date_now = ""; # The script's user my $top_unknown_user_address = "unknown-user-address"; my $top_user_address = $ENV{"REMOTE_ADDR"} || $top_unknown_user_address; # the method the script was called with my $top_meth = $ENV{'REQUEST_METHOD'} || "Unknown method"; my $top_meth_post = ($top_meth eq "POST"); my $dummy = 0; # output DBG stuff my $B = ""; my $U = ""; if ( -t ) { $B = "\033[1m"; $U = "\033[m"; } # Sub topMs2DateGMT ------------------------------------------------------------- sub topMs2DateGMT { # topMs2DateGMT( date_in_s_since_epoch ); # Used to set Cookies dates: # http://home.netscape.com/newsref/std/cookie_spec.html # This is based on RFC 822, RFC 850, RFC 1036, and RFC 1123, # with the variations that the only legal time zone is GMT # and the separators between the elements of the date must be dashes. my @gm_time = gmtime( $_[0] ); return (('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')[$gm_time[6]]) . ", " . $gm_time[3] . "-" . (('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Noc', 'Dec')[$gm_time[4]]) . "-" . # This is a year 2000 potential problem!!! (1900 + $gm_time[5]) . " " . $gm_time[2] . ":" . $gm_time[1] . ":" . $gm_time[0] . " GMT"; } # Sub topFooterHTML ------------------------------------------------------------- sub topFooterHTML { # topFooterHTML(); return if $top_silent; print << "EO_FOOTER";

$script_name v$script_version (c) 1996 - 1998 heddy Boubaker C.E.N.A
Just ask me or GO THERE if you want a Free copy of this script.
$HTML_END EO_FOOTER $top_currently_displaying_html = 0; } # Sub topWarn ------------------------------------------------------------- sub topWarn { # topWarn( message ); print STDERR "$B WARNING $U: $_[0]\n"; } # Sub topDBG ------------------------------------------------------------- sub topDBG { # topDBG( message ); return if ( !$top_called_from_cl ); print "$B DBG $U: $_[0]\n" unless $top_silent; } # Sub topError ------------------------------------------------------------- sub topError { # topError( message ); my ($message) = @_; if ( $top_called_from_cl ) { print STDERR "$B ERROR $U: $message\n"; exit( 0 ); } # called as cgi script outpurt HTML if ( $top_currently_displaying_html ) { # flush html print "\n"; } # send error page print << "EO_ERROR"; Content-type: text/html Status: 500 CGI Error $HTML_HEAD CGI error $HTML_BODY

Error 500 in CGI script $script_name v$script_version

Top homepage


$message


Could you please be kind enough to contact the webmaster of this site or the author of the script, thanks.
EO_ERROR topFooterHTML(); # send error to error_log file print STDERR "Error 500 in CGI: $script_name \"$message\"\n"; exit( 0 ); } # Sub topThanks ------------------------------------------------------------- sub topThanks { # topThanks(); return if $top_silent; if ( $top_called_from_cl ) { print "${B} Thanks ${U}, bye bye!\n"; exit 0; } # cgi script output HTML print << "EO_THANKS"; Content-type: text/html $HTML_HEAD Thanks $HTML_BODY

Bye, see you

Top homepage


Thanks a lot for using $script_name v$script_version on server ${WWW_SERVER_URL}/.

Hope you enjoyed it! And that it was useful for you.

EO_THANKS # Print The Footer topFooterHTML(); exit( 0 ); } # Sub topHelp ------------------------------------------------------------- sub topHelp { return if $top_silent; if ( $top_called_from_cl ) { # Command line stuff print << "EO_HELP"; top 'cgi arguments' ... see @ EO_HELP } else { # cgi script print << "EO_HELP"; Content-type: text/html $HTML_HEAD Top@$WWW_SERVER_ADDRESS Help $HTML_BODY

Top@$WWW_SERVER_ADDRESS Help

Top homepage


Detailled up to date help is located at
Top Homepage

EO_HELP # Print The Footer topFooterHTML(); } exit( 0 ); } # Sub topShowConfig ------------------------------------------------------------- sub topShowConfig { return if $top_silent; if ( $top_called_from_cl ) { # Command line stuff print << "EO_SHOWCONF"; To see top configuration call the script as a cgi-script with argument 'showconf', thanks. EO_SHOWCONF } else { # cgi script stuff print << "EO_SHOWCONF"; Content-type: text/html $HTML_HEAD Top@$WWW_SERVER_ADDRESS Configuration $HTML_BODY

Top@$WWW_SERVER_ADDRESS Configuration

Top homepage


$script_name v$script_version called as ${SCRIPT_URL}?$ENV{ "QUERY_STRING" } (with method $top_meth) from $top_user_address

Configuration variables values:

EO_SHOWCONF # Print The Footer topFooterHTML(); } exit( 0 ); } # Sub topNormalizeFilename ---------------------------------------------------- sub topNormalizeFilename { # topNormalizeFilename( $filename ); # Cleanup filename so that the many way # to access a page would have only one syntax. # ex: /mydir/index.html => /mydir/ = /mydir # ex: stuff.html#thing => stuff.html # ex: http://www.local.domain/stuff.html => stuff.html # ex: http://www/stuff.html => stuff.html #|3.3| my $filename = @_[0]; my $filename = $_[0]; # Remove http:// if this is a request # to the local server. If not the # httpd serv should be a proxy cache too # and the request is for another server, # in this case we process it in topProcessFile() my ( $the_protocol, $the_server, $the_file ) = $filename =~ /^(http|ftp|gopher):\/\/(\S+)\/(\S*)/io; if ( $the_protocol ) { # TODO: Compile the regexp as explained int the perl faq unless ( $the_server =~ /quotemeta( $LOCAL_DOMAIN ) . "\$"/i || $the_server =~ /"^" . quotemeta( $WWW_SERVER_NAME ) . "\$"/i || $the_server =~ /"^" . quotemeta( $WWW_SERVER_ADDRESS ) . "\$"/i ) { # filename is not an access to a local file, # this is a remote file server via the local # proxy server. return $filename; } # this is an access to a local file via proxy # remove all the server part and made the file # start from root "/" $filename = "/" . $the_file; } # remove #thing at end of urls, this # should be handled only by the browser # but it seems to have exceptions ... or bugs $filename =~ s/\#[^\/]*\$//o; # see $FILENAME_NORMALIZE_REGEXP declaration $filename =~ s/$FILENAME_NORMALIZE_REGEXP//; # cleanup the ~user stuff $filename =~ s/\/\%7e/\/~/io; # the root page should have / return $filename?$filename:"/"; } # Sub topIsLocalCaller ---------------------------------------------------- sub topIsLocalCaller { # topIsLocalCaller( hostname ); # Check if a hostname is from local domain or not my ($the_hostname) = @_; if ( $the_hostname eq $top_unknown_user_address ) { # we could not get the caller's address # for security reasons asume it is not # a local address. if ( $top_called_from_cl ) { return 1; } else { return 0; } } elsif ( $the_hostname =~ /$IS_LOCAL_REGEXP/i ) { # the address match the local address regexp return 1; } else { # the default is that address is not local return 0; } } # Sub topFilterOutFile --------------------------------------------------------- sub topFilterOutFile { # topFilterOutFile( $filename, $caller_addr ); # Filter out a file with regexp or if caller # is a search engine (spider). my ($file_to_filter, $the_caller) = @_; if ( $top_arg_nofilter ) { return 0; } # filter with regexp # TODO: Compile the regexp list as explained in the perl faq my $the_filter; foreach $the_filter ( @FILTERS_LIST ) { if ( $file_to_filter =~ /$the_filter/i ) { return 1; } } # filter if accessor is a search engine my $nb_hits; if ( exists $top_search_engines_hash{ $the_caller }) { $nb_hits = $top_search_engines_hash{ $the_caller }; $top_search_engines_hash{ $the_caller } = ++$nb_hits; return 1; } return 0; } # Sub topProcessFile ------------------------------------------------------- sub topProcessFile { # topProcessFile( file, caller, status, nbytes ); # Add the file to the all files list after # filtering it. my ($the_file, $the_caller, $the_status, $the_bytes ) = @_; my @nb_hits; my $the_server; my $file_error = 0; # If this is an access to a file of kind: http://serv/file # there is chance the httpd we're running on is a proxy cache # server too. So this is not an access to a local file and we # should not process it! if ( $the_file =~ /^(http|ftp|gopher):\/\//io ) { # But if this is a request to the # local server the file had allready # been processed in topNormalizeFilename return; } # remove file according to filters unless ( $top_arg_nofilter ) { return if topFilterOutFile( $the_file, $the_caller ); } # Check any errors if ( # errors code from http://www.w3.org/pub/WWW/Protocols/HTTP/HTRESP.html $the_status == 501 || # not implemented $the_status == 500 || # server error $the_status == 404 || # file not found $the_status == 403 || # permission denied $the_status == 401 || # authentification failed $the_status == 400 # bad request ) { $file_error = 1; } # Check file size if ( $the_bytes eq '-' ) { # server do not log file size informations $the_bytes = 0; $top_logs_have_sizes = 0; } # Update file informations if ( exists $top_all_files_hash{ $the_file } ) { # allready in top_all_files_hash inc hits @nb_hits = @{ $top_all_files_hash{ $the_file }}; # update total hits $nb_hits[0] = ++$nb_hits[0]; # update local or extern nb hits if ( topIsLocalCaller( $the_caller )) { $nb_hits[2] = ++$nb_hits[2]; } else { $nb_hits[1] = ++$nb_hits[1]; } # update file errors $nb_hits[3] += $file_error; # update file size if ( $the_bytes > $nb_hits[4] ) { $nb_hits[4] = $the_bytes; } # update total size $nb_hits[5] += $the_bytes; # updates ... done $top_all_files_hash{ $the_file } = [ @nb_hits ]; } else { # The file is not yet into the list add it $top_total_files++; if ( topIsLocalCaller( $the_caller )) { $top_all_files_hash{ $the_file } = [1, 0, 1, $file_error, $the_bytes, $the_bytes]; } else { $top_all_files_hash{ $the_file } = [1, 1, 0, $file_error, $the_bytes, $the_bytes]; } } } # Sub topPrintFile ------------------------------------------------------ sub topPrintFile { # topPrintFile( $file, hits ) # Print infos about the file in html return if $top_silent; my ($the_file, $total_hits) = @_; my @hits = @{ $top_all_files_hash{ $the_file }}; my $file_size = $hits[4]; my $total_size = $hits[5]; my $local_hits = $hits[2]; my $local_percent = ($local_hits * 100) / $total_hits; my $local_hs = "#" x int ($local_percent / 2); my $ext_hits = $hits[1]; my $ext_percent = ($ext_hits * 100) / $total_hits; my $ext_hs = "#" x int( $ext_percent / 2 ); my $errors = $hits[3]; my $errors_percent = ($errors * 100) / $total_hits; my $errors_inf = ""; my $size_inf = ""; my $total_size_inf = ""; my $plural = $total_hits>1?"s":""; my $filelog_b = ""; my $filelog_e = ""; # Do we have a per file logs cgi-script? if ( $FILELOG_CGI ) { $filelog_b = ""; $filelog_e = ""; } # Are we allowed and able to print sizes? if ( $top_print_allstuff && $top_logs_have_sizes ) { $size_inf = "- [${file_size} bytes] -"; $total_size_inf = "- [${total_size} bytes transfered] -" } # Are we allowed to print errors if ( $PARANO_MODE < 2 && $errors ) { $errors_inf = sprintf " [$errors errors, %3.1f%% => %d real hits]", $errors_percent, $total_hits - $errors; } else { $errors_inf = ""; } # Ok do the print... print << "EO_FILE";
${the_file}:  ${size_inf} ${filelog_b}${total_hits} hit${plural}${filelog_e} ${total_size_inf} ${errors_inf}
EO_FILE # Print more infos if allowed to if ( $top_print_allstuff ) { if ( $local_hits ) { $local_percent = sprintf "%3.1f", $local_percent; if ( $ext_hits ) { print " Locals:  $local_hs ${local_percent}% [${local_hits}]
\n"; } else { print " Locals:  ${local_percent}%\n"; } } if ( $ext_hits ) { $ext_percent = sprintf "%3.1f", $ext_percent; if ( $local_hits ) { print " Extern:  $ext_hs ${ext_percent}% [${ext_hits}]
\n" } else { print " Extern:  ${ext_percent}%
\n" } } } $top_nb_pages_listed++; $top_total_nb_pages_listed++; } # Sub topParseCgiArgs ---------------------------------------------------- sub topParseCgiArgs { # Parse arguments of cgi script my $argsstr = "Args = \""; # split arg1&arg2&arg3 -> [arg1, arg2, arg3] my @args = split /&/, $_[0]; if ( $#args == 0 && $args[0] =~ /^\d+$/o ) { # only one arg which is the top # number $top_arg_nth = $args[0]; topDBG "${argsstr}${top_arg_nth}\""; return; } elsif ( $#args == 0 && $args[0] eq 'max' ) { # only one arg which is 'max' meaning max of top # numbers $top_arg_nth = $TOP_MAX; topDBG "${argsstr}max ($top_arg_nth)\""; return; } my @arg; ARG: foreach ( @args ) { # split arg=value -> [arg, value] @arg = split(/=/, $_ ); if ( $#arg == 0 ) { topDBG "Arg: $arg[0]"; # ags alones if ( $arg[0] eq 'showconf' ) { # Print top config $top_arg_showconf = 1; $argsstr .= "&showconf"; next ARG; } if ( $arg[0] eq 'help' ) { # Print top help $top_arg_help = 1; $argsstr .= "&help"; next ARG; } if ( $arg[0] eq 'nc' || $arg[0] eq 'nocache' ) { # Do not use thecache $top_arg_nocache = 1; $argsstr .= "&nocache"; next ARG; } if ( $arg[0] eq 'rc' || $arg[0] eq 'recache' ) { # Force rebuild the cache $top_create_cache_GLOBAL = 1; $top_arg_recache = 1; $argsstr .= "&recache"; next ARG; } if ( $arg[0] eq 'nf' || $arg[0] eq 'nofilter' ) { # Do not use the filters list $top_arg_nofilter = 1; $argsstr .= "&nofileter"; next ARG; } if ( $arg[0] =~ /^\d+$/ ) { # the top # number $top_arg_nth = $arg[0]; $argsstr .= "&$top_arg_nth"; next ARG; } if ( $arg[0] eq 'max' ) { # max of top # numbers $top_arg_nth = $TOP_MAX; $argsstr .= "&max"; next ARG; } if ( $arg[0] eq 'enough' ) { # end of use of the script, used with 'id' # to remove the cache. # only the script should use this one $top_arg_enough = 1; $argsstr .= "&enough"; next ARG; } if ( $top_called_from_cl && $arg[0] eq 'silent' ) { # Shut up top when calling it from cl $top_silent = 1; $argsstr .= "&silent"; next ARG; } next ARG; } else { # args with values my $value = $arg[1]; # cleanup value # Convert %XX from hex numbers to alphanumeric, but not escaped % (\%) $value =~ s/([^\\])%([A-Fa-f0-9]{2})/sprintf( "%s%s", $1, pack( "c", hex( $2 )))/ge; # Convert + to ` `, but not escaped + (\+) $value =~ s/([^\\])\+/$1 /g; topDBG "Arg: $arg[0]=\"$value\""; if ( $arg[0] eq 'nc' || $arg[0] eq 'nocache' ) { if ( $value eq '0' || $value eq 'off' ) { # DO NOT Do not use thecache $top_arg_nocache = 0; $argsstr .= "&nocache=off"; } else { # Do not use thecache $top_arg_nocache = 1; $argsstr .= "&nocache=on"; } next ARG; } if ( $arg[0] eq 'rc' || $arg[0] eq 'recache' ) { if ( $value eq '0' || $value eq 'off' ) { # DO NOT Force rebuild the cache $top_create_cache_GLOBAL = 0; $top_arg_recache = 1; $argsstr .= "&recache=off"; } else { # Force rebuild the cache $top_create_cache_GLOBAL = 1; $top_arg_recache = 1; $argsstr .= "&recache=on"; } next ARG; } if ( $arg[0] eq 'nf' || $arg[0] eq 'nofilter' ) { if ( $value eq '0' || $value eq 'off' ) { # DO NOT Do not use the filters list $top_arg_nofilter = 0; $argsstr .= "&nofilter=off"; } else { # Do not use the filters list $top_arg_nofilter = 1; $argsstr .= "&nofilter=on"; } next ARG; } if ( $arg[0] eq 'enough' ) { # end of use of the script, used with 'id' # to remove the cache. # only the script should use this one if ( $value eq '0' || $value eq 'off' ) { $top_arg_enough = 0; $argsstr .= "&enought=off"; } else { $top_arg_enough = 1; $argsstr .= "&enought=on"; } next ARG; } if ( $arg[0] eq 'help' ) { if ( $value eq '0' || $value eq 'off' ) { $top_arg_help = 0; $argsstr .= "&help=off"; } else { $top_arg_help = 1; $argsstr .= "&help=on"; } next ARG; } if ( $arg[0] eq 'n' || $arg[0] eq 'num' ) { # the top # number if ( $value eq 'max' ) { $top_arg_nth = $TOP_MAX; $argsstr .= "&num=max"; } elsif( $value =~ /^\d+$/o ) { # protection against non num args $top_arg_nth = $value; $argsstr .= "&num=$value"; } elsif( $value eq '' ) { $top_arg_nth = $TOP_NTH_DEFAULT; $argsstr .= "&num="; } else { # shoud be a number topError "Arg of num `${value}' should be a number or `max'"; } next ARG; } if ( $arg[0] eq 's' || $arg[0] eq 'skip' ) { # number of first entries to skip if( $value =~ /^\d+$/o ) { # protection against non num args $top_arg_skip = $value; $argsstr .= "&skip=$value"; } elsif( $value eq '' ) { $top_arg_skip = 0; $argsstr .= "&skip="; } else { # shoud be a number topError "Arg of skip `${value}' should be a number"; } next ARG; } if (($arg[0] eq 'i' || $arg[0] eq 'id') && $top_user_id == 0 ) { # unique user id, used to keep trace for cache # only the script should use this one $top_user_id = $value; $argsstr .= "&id=$value"; next ARG; } if ($arg[0] eq 're' || $arg[0] eq 'regexp') { # Filter by regexp $top_arg_regexp_is_set = 1; # clean up the regexp $top_arg_regexp_value = $value; $argsstr .= "®exp=\'$value\'"; next ARG; } if ( $arg[0] eq 'al' ) { # total of pages allready listed # only the script should use this one if( $value =~ /^\d+$/o ) { # protection against non num args $top_total_nb_pages_listed = $value; $argsstr .= "&al=$value"; } else { # shoud be a number topError "Arg of al `${value}' should be a number"; } next ARG; } } } topDBG $argsstr . "\""; } # Sub topGetLogsDates ---------------------------------------------------------- sub topGetLogsDates { # get the log file creation date if ( $top_use_cache_DATES ) { topDBG "Using cache for dates ..."; flock( $top_cache_file_DATES, $LOCK_EX ); # first line log file creation date $top_logfile_creation_date = <$top_cache_file_DATES>; chomp $top_logfile_creation_date; # 2nd line last date of cached infos from log file $top_date_now = <$top_cache_file_DATES>; chomp $top_date_now; close( $top_cache_file_DATES ); flock( $top_cache_file_DATES, $LOCK_UN ); topDBG "Using cache for dates ... done"; } else { topDBG "NOT Using cache for dates ..."; my $fh = new FileHandle "head -1 $TRANSFER_LOG |"; if ( !defined $fh ) { topError "Can't get head of TRANSFER_LOG: $TRANSFER_LOG"; } # get only the first line my $line = $fh->getline; $fh->close; chomp $line; topDBG "1st $TRANSFER_LOG line for date = \"$line\""; # extract the first date ($top_logfile_creation_date) = $line =~ /$TRANSFER_LOG_GETDATE_REGEXP/; chomp $top_logfile_creation_date; if ( !(defined $top_logfile_creation_date) || !$top_logfile_creation_date ) { # Could not get the date from the contents # of the file try to get date of creation... # but in fact that do not work correctly as the stat[10] # is the ctime of the file which do not correcpond of # the creation date of the file, and under Unix # there is NO way to get the creation time of a file.! topDBG "\"$line\" Do not match \"$TRANSFER_LOG_GETDATE_REGEXP\" (\"$top_logfile_creation_date\") stating $TRANSFER_LOG for creation date"; $top_logfile_creation_date = topMs2DateGMT(( stat( $TRANSFER_LOG ))[10] ); } # get the date of now $top_date_now = topMs2DateGMT( time ); if ( $top_create_cache_DATES ) { flock( $top_cache_file_DATES, $LOCK_EX ); print $top_cache_file_DATES $top_logfile_creation_date, "\n"; print $top_cache_file_DATES $top_date_now, "\n"; close( $top_cache_file_DATES ); flock( $top_cache_file_DATES, $LOCK_UN ); } topDBG "NOT Using cache for dates ... done"; } topDBG "Dates: from ${B} $top_logfile_creation_date ${U} to ${B} $top_date_now ${U}"; } # Sub topGetAllRobots -------------------------------------------------------- sub topGetAllRobots { # Search for www search engines in log file # by grepping for robots.txt access. my $filelog_b = ""; my $filelog_e = ""; my $line = ""; my $caller = ""; my $file = ""; my $hits = 0; if ( !$top_silent ) { if ( $FILELOG_CGI ) { $filelog_b = ""; $filelog_e = ""; } print "
\nList of the known WWW spiders (+those who access ${filelog_b}$the_robotstxt_file${filelog_e}) which will be removed from stats:\n

\n"; } # get the ROBOTS list if ( $top_use_cache_ROBOTS ) { topDBG "Using CACHE for robots ..."; flock( $top_cache_file_ROBOTS, $LOCK_EX ); while ( $line = <$top_cache_file_ROBOTS> ) { chomp $line; ($caller, $hits) = split(/ /, $line); $top_search_engines_hash{ $caller } = $hits; print "${caller}, " unless $top_silent; } flock( $top_cache_file_ROBOTS, $LOCK_UN ); close $top_cache_file_ROBOTS; topDBG "Using CACHE for robots ... done "; } else { topDBG "No CACHE for robots \"$FAST_GREP_PG '$the_robotstxt_file' $TRANSFER_LOG |\" ..."; my $fh = new FileHandle "$FAST_GREP_PG '$the_robotstxt_file' $TRANSFER_LOG |"; if ( !defined $fh ) { topError "Can't grep into TRANSFER_LOG: $TRANSFER_LOG"; } while ( $line = $fh->getline ) { chomp $line; # split access log fields ( $caller, undef, undef, undef, $file, undef, undef) = $line =~ /$TRANSFER_LOG_FORMAT_REGEXP/; # check if file is the robots.txt accessed only by # www search engines if ( $file eq $the_robotstxt_file ) { if ( exists $top_search_engines_hash{ $caller } ) { # Inc the nb of hits for this robot $hits = $top_search_engines_hash{ $caller }; if ( $hits == 1 ) { $top_search_engines_nb++; }; $top_search_engines_hash{ $caller } = ++$hits; } else { # Do not make a local caller a search engine unless ( topIsLocalCaller( $caller )) { # Add this new robot $top_search_engines_nb++; $top_search_engines_hash{ $caller } = 1; } }; }; } $fh->close; flock( $top_cache_file_ROBOTS, $LOCK_EX ); my $se = ""; while (($se, $hits) = each %top_search_engines_hash ) { print "${se}, " unless $top_silent; print $top_cache_file_ROBOTS "$se $hits\n" if $top_create_cache_ROBOTS; } flock( $top_cache_file_ROBOTS, $LOCK_UN ); close $top_cache_file_ROBOTS; } print "\n

\n
\n" unless $top_silent; topDBG "All robots got"; } # Sub topCheckCache ------------------------------------------------------- sub topCheckCache { # Check all caches files my $id = $_[0]; my $mode = 0750; # check cache dir unless ( -d $CACHE_DIR ) { #Create the cache dir unless ( mkdir( $CACHE_DIR, $mode )) { topError "Can't create cache dir $CACHE_DIR: $!"; return; } } # create caches filenames $top_cache_name_GLOBAL = $CACHE_DIR . "/CACHE"; $top_cache_name_GLOBAL .= "-nofilter" if ( $top_arg_nofilter ); $top_cache_name_DATES = $CACHE_DIR . "/DATES"; $top_cache_name_ROBOTS = $CACHE_DIR . "/ROBOTS"; $top_cache_name_MAIN = $CACHE_DIR . "/" . $id . "-CACHE"; # check cache file GLOBAL if ( !-e $top_cache_name_GLOBAL || -M $top_cache_name_GLOBAL > $CACHE_REFRESH_PERIOD || # set by recache arg $top_create_cache_GLOBAL ) { # need to be (re)built $top_create_cache_GLOBAL = 1; # delete all related files unlink $top_cache_name_GLOBAL; unless ( $top_arg_nofilter ) { unlink $top_cache_name_DATES; unlink $top_cache_name_ROBOTS; } } # check cache file DATES if ( -r $top_cache_name_DATES ) { # open the cache file if ( open( CACHE_FILE_DATES, $top_cache_name_DATES )) { $top_cache_file_DATES = CACHE_FILE_DATES; $top_use_cache_DATES = 1; } else { topWarn "Can't open cache file $top_cache_name_DATES: $!"; } } else { # Create the cache file if ( open( CACHE_FILE_DATES, ">" . $top_cache_name_DATES )) { $top_cache_file_DATES = CACHE_FILE_DATES; $top_create_cache_DATES = 1; } else { topWarn "Can't create cache file $top_cache_name_DATES: $!"; } } # check cache file ROBOTS if ( -r $top_cache_name_ROBOTS ) { # open the cache file if ( open( CACHE_FILE_ROBOTS, $top_cache_name_ROBOTS )) { $top_cache_file_ROBOTS = CACHE_FILE_ROBOTS; $top_use_cache_ROBOTS = 1; } else { topWarn "Can't open cache file $top_cache_name_ROBOTS: $!"; } } else { # Create the cache file if ( open( CACHE_FILE_ROBOTS, ">" . $top_cache_name_ROBOTS )) { $top_cache_file_ROBOTS = CACHE_FILE_ROBOTS; $top_create_cache_ROBOTS = 1; } else { topWarn "Can't create cache file $top_cache_name_ROBOTS: $!"; } } # check cache file MAIN if ((-M $top_cache_name_MAIN ) < (-M $top_cache_name_GLOBAL )) { # main cache file is older than global unlink $top_cache_name_MAIN; } if ( -r $top_cache_name_MAIN ) { # open the cache file if ( open( CACHE_FILE_MAIN, $top_cache_name_MAIN )) { $top_cache_file_MAIN = CACHE_FILE_MAIN; $top_use_cache_MAIN = 1; } else { topWarn "Can't open cache file $top_cache_name_MAIN: $!"; } } else { unless ( $top_create_cache_GLOBAL ) { # Create the cache file, this is a hard link # to the global cache so that others scripts # could work on the global (delete, ...) # without any problem for here. if ( link( $top_cache_name_GLOBAL, $top_cache_name_MAIN ) && open( CACHE_FILE_MAIN, $top_cache_name_MAIN )) { $top_cache_file_MAIN = CACHE_FILE_MAIN; $top_use_cache_MAIN = 1; } else { topWarn "Can't create cache file $top_cache_name_MAIN: $!"; } } } } # Sub topCleanup -------------------------------------------------------- sub topCleanup { # delete cache file topDBG "CleanUp $top_cache_name_MAIN ..."; if ( -e $top_cache_name_MAIN ) { unlink $top_cache_name_MAIN; } topDBG "CleanUp $top_cache_name_MAIN ... done"; } # Sub topCleanupCache ------------------------------------------------ sub topCleanupCache { my $cache_file = ""; topDBG "CleanUp CACHE ..."; while ( $cache_file = glob( $CACHE_DIR . "/*-CACHE" )) { topDBG "$cache_file Cleaning maybe ..."; if ( -A $cache_file > $CACHE_REFRESH_PERIOD ) { topDBG "$cache_file Cleaning maybe ... YES"; unlink $cache_file; } } topDBG "CleanUp CACHE ... done"; } # Sub topGetFileList ------------------------------------------------------ sub topGetFileList { #### Open the whole log file and store infos ### my $line = ""; my $file = ""; my $total_hits = 0; my $extern_hits = 0; my $local_hits = 0; my $errors = 0; my $file_size = 0; my $total_size = 0; if ( $top_use_cache_MAIN ) { topDBG "Using CACHE ..."; # read infos from cache, the cache # should be ordered by nb hits in # descending order. foreach $line ( <$top_cache_file_MAIN> ) { ($file, $total_hits, $extern_hits, $local_hits, $errors, $file_size, $total_size) = split ' ', $line; # fill the hash tables $top_all_files_hash{ $file } = [ $total_hits, $extern_hits, $local_hits, $errors, $file_size, $total_size]; if ( exists $top_ordered_files_hash{ $total_hits }) { # allready get a file with the same nbhits push @{ $top_ordered_files_hash{ $total_hits }}, $file; } else { # a new nb hits $top_ordered_files_hash{ $total_hits } = [ $file ]; push @top_nbhits_sorted_tab, $total_hits; } } close $top_cache_file_MAIN; topDBG "Using CACHE ... done"; } else { topDBG "NOT Using CACHE ..."; # no cache do all the work. my $fh = new FileHandle $TRANSFER_LOG, "r"; if ( !defined $fh ) { topError "Can't open TRANSFER_LOG file: $TRANSFER_LOG"; } # process it my $caller; my $status; my $file; my $bytes = 0; my $n1 = 0; my $n2 = 0; while ( $line = $fh->getline ) { $n1++; chomp $line; # split access log fields ( $caller, undef, undef, undef, $file, $status, $bytes ) = $line =~ /$TRANSFER_LOG_FORMAT_REGEXP/o; if ( $file && $caller && $status ) { # Normalize file name $file = topNormalizeFilename( $file ); # Add the file to file list topProcessFile( $file, $caller, $status, $bytes ); $n2++; } } $fh->close; topDBG "$n1 lines read, $n2 files processed"; # Build a hash table whose keys are nb hits foreach $file ( keys %top_all_files_hash ) { my $total_hits = @{ $top_all_files_hash{ $file }}[0]; if ( exists $top_ordered_files_hash{ $total_hits }) { push @{ $top_ordered_files_hash{ $total_hits }}, $file; } else { $top_ordered_files_hash{ $total_hits } = [ $file ]; } } # sort by keys @top_nbhits_sorted_tab = sort { $b <=> $a } keys %top_ordered_files_hash; topDBG "NOT Using CACHE ... done"; } $top_nb_ordered = $#top_nbhits_sorted_tab +1; # extract only the number we need: these are the return values $top_nbhits_beg_idx = $top_arg_skip>$#top_nbhits_sorted_tab?$#top_nbhits_sorted_tab:$top_arg_skip; $top_nbhits_end_idx = $top_high_range-1>$#top_nbhits_sorted_tab?$#top_nbhits_sorted_tab:$top_high_range -1; @top_nbhits_subtab = @top_nbhits_sorted_tab[$top_nbhits_beg_idx .. $top_nbhits_end_idx]; topDBG "... $TRANSFER_LOG processed"; } # Main Program. main() ---------------------------------------------------- ### Sub topMain() ### # topMain( ); ### sub topMain ( ) { } # end topMain(); # Autoflush stdout $| = 1; # Parse script arguments my $top_arguments = $ENV{ "QUERY_STRING" }; if ( $top_arguments || $ENV{"REMOTE_ADDR"}) { # called as a cgi script $top_called_from_cl = 0; topParseCgiArgs( $top_arguments ); } elsif ( $ARGV[0] ) { # called from command line for tests $top_called_from_cl = 1; topParseCgiArgs( $ARGV[0] ); } else { # should be called from command line topDBG "No arguments"; $top_called_from_cl = 1; } # Try to get cookie $top_cookie_is_ok = 0; $top_cookie_value = 0; if ( $ENV{'HTTP_COOKIE'} ) { my %CookieJar = split('[;=] *', $ENV{'HTTP_COOKIE'}); if (( $top_cookie_value = $CookieJar{ $top_cookie_name }) > 0 ) { $top_user_id = $top_cookie_value; $top_cookie_is_ok = 1; } } # Check permissions for user $top_local_user = topIsLocalCaller( $top_user_address ); if ( $PARANO_MODE && !$top_local_user ) { $top_args_allowed = 0 if ( $PARANO_MODE >0 ); $top_print_allstuff = 0 if ( $PARANO_MODE >1 ); } # args check: That is check good args usage if ( $top_arg_enough == 1 && $top_user_id == 0 ) { # who have enough??? topError ( "\`enough\' arg called without user id" ); } if ( !$top_local_user ) { # remove args that are always forbiden to remote users $top_arg_showconf = 0; } if ( $top_arg_nth > $TOP_MAX ) { $top_arg_nth = $TOP_MAX; } if ( $top_arg_nth < 1 ) { $top_arg_nth = 1; } if ( $top_arg_skip < 0 ) { $top_arg_skip = 0; } if ( $top_arg_skip == 0 ) { $top_total_nb_pages_listed = 0; } if ( $top_user_id == 0 ) { $top_user_id = $$.$^T; } $top_low_range = $top_arg_skip + 1; $top_high_range = $top_arg_nth + $top_arg_skip; # install handlers $SIG{__DIE__} = sub { topCleanup(); exit -1; }; $SIG{INT} = sub { topCleanup(); exit -2; }; $SIG{QUIT} = sub { topCleanup(); exit -3; }; # Here are some arguments which display # some information and exit the proggram. if ( $top_arg_help ) { topHelp(); # exit here } elsif ( $top_arg_showconf ) { topShowConfig(); # exit here } # check allowed args, # depends of the parano mode set and # of where the caller is from. my $args_str = ""; ##$top_args_allowed = 1; if ( !$top_args_allowed && ( $top_arg_nofilter || $top_arg_nocache || $top_create_cache_GLOBAL || $top_arg_regexp_is_set )) { $args_str = "You (@ $top_user_address) are not allowed to use the arg(s): " . ($top_arg_nofilter?"nofilter":"") . ($top_arg_nocache?"nocache":"") . ($top_create_cache_GLOBAL?"recache":"") . ($top_arg_regexp_is_set?"regexp":"") . ""; $top_arg_nofilter = 0; $top_arg_nocache = 0; $top_arg_recache = 0; $top_create_cache_GLOBAL = 0; $top_arg_regexp_value = ""; $top_arg_regexp_is_set = 0; } # check the cache for the given id unless ( $top_arg_nocache ) { topCheckCache( $top_user_id ); if ( $top_arg_enough ) { topCleanup(); topThanks(); # exit here } } # Everything checked now start the job... # Get the creation date of the log file topGetLogsDates(); # Print header my $range_str = $top_arg_skip? ($top_arg_nth>1?"
\n#$top_low_range -> #$top_high_range":"
\n#$top_low_range"):""; $top_currently_displaying_html = 1; # 2 '\n' !!!! print "Content-type: text/html" unless $top_silent; # force flush now print "\n"; if ( !$top_cookie_is_ok ) { # Cookie not get, set it $top_cookie_value = $top_user_id; print "Set-Cookie: $top_cookie_name=$top_cookie_value; expires=$top_cookie_expiration_date; path=$SCRIPT_PATH; domain=$LOCAL_DOMAIN;\n" unless $top_silent; } # Prepare some infos about arguments used my $fr_inf = ""; $fr_inf .= "
Results filtered with regexp \"$top_arg_regexp_value\"" if ( $top_arg_regexp_is_set ); $fr_inf .= "
No Filter" if ( $top_arg_nofilter ); $fr_inf .= "
No Cache" if ( $top_arg_nocache ); print << "EO_HEADER" unless $top_silent; $HTML_HEAD Top$top_arg_nth\@${WWW_SERVER_ADDRESS}${WWW_SERVER_PORT} $HTML_BODY

Top$top_arg_nth accessed files on server ${WWW_SERVER_URL}/${range_str}

From $top_logfile_creation_date to $top_date_now

$args_str $fr_inf

Top homepage


EO_HEADER # Get & Print infos about spiders unless ( $top_arg_nofilter ) { topGetAllRobots(); } # Print infos about predicted efficiency my $infos_eff = ""; if ( $top_use_cache_MAIN ) { $infos_eff = "Please wait a little"; } else { my $file_size = (stat( $TRANSFER_LOG))[7]; $infos_eff = "$file_size bytes to process. You'll have to wait..."; } # Id user with cookie or `by hand' ? my $top_id_cgi_arg = ""; if ( !$top_cookie_is_ok ) { $top_id_cgi_arg = "&id=$top_user_id"; } # Misc informations print << "EO_STUFF" unless $top_silent; [Id: ${top_user_id}]  ${infos_eff}  [Enough] ...


EO_STUFF # get the file list, sort it and store result # in: @top_nbhits_subtab = the sorted array of hist, # %top_ordered_files_hash = an hash of arrays of filesname # whose keys are nb hits. topGetFileList(); # Print the file list my $file_rank = $top_arg_skip; my $hits = 0; foreach $hits ( @top_nbhits_subtab ) { my @files = @{ $top_ordered_files_hash{ $hits }}; my $file = ""; my @file_list; my $i = 0; my $hinf = ""; if ( $hits == 1 ) { $hinf = " hit"; } else { $hinf = " hits"; } $file_rank++; if ( $top_arg_regexp_is_set ) { # filter out file not matching the regexp foreach $file ( @files ) { if ( $file =~ /$top_arg_regexp_value/ ) { $file_list[$i] = $file; $i++; } } } else { @file_list = @files; } if ( $#file_list > -1 ) { # we've got some files print "- #$file_rank", $file_rank==1?" <Winner>":"", " - $hits $hinf" unless $top_silent; if ( $#files > 0 ) { my $file; print "  [", $#files +1, " ex-aequo]\n

\n" unless $top_silent; foreach $file ( @file_list ) { topPrintFile( $file, $hits ) unless $top_silent; } } else { print "\n
" unless $top_silent; topPrintFile( $file_list[0], $hits ) unless $top_silent; } print "

\n" unless $top_silent; } else { # we've got nothing !!! if ( $top_arg_regexp_is_set ) { print "- #$file_rank - $hits $hinf. ", $#files +1, " item(s), none match for regexp \"$top_arg_regexp_value\"
\n" unless $top_silent; } else { print "- #$file_rank - $hits $hinf. ", $#files +1, " item(s)
\n" unless $top_silent; } } } # print infos about nb printed files and time my $diff_time = time() - $top_start_time; my $time_infos = $diff_time>1?"$diff_time seconds":"less than 1 second"; my $page_num_infos = $top_nb_pages_listed>1?"$top_nb_pages_listed pages":"one page"; print "\n\n

\n$page_num_infos in $time_infos
\n" unless $top_silent; # Ask user if he want to get some more if ( $top_nbhits_end_idx < $top_nb_ordered -1 ) { my $low_range = $top_high_range+1; my $hight_range = $top_arg_nth+$top_high_range; my $range_str = $low_range<$hight_range?"#$low_range -> #$hight_range":"#$low_range"; my $extra_args = ($top_arg_nofilter?"&nofilter":"") . ($top_arg_nocache?"&nocache":"") . ($top_arg_regexp_is_set?"&re=$top_arg_regexp_value":""); print "[Click here to get some more: $range_str] [Enough]\n" unless $top_silent; } else { my $page_num_infos = $top_total_nb_pages_listed>1?"Total: $top_total_nb_pages_listed pages":"Only one page total"; my $fr_infos = ""; if ( $top_arg_regexp_is_set ) { $fr_infos = "[Matching regexp `$top_arg_regexp_value']"; } print "You've got them all now
Grand Total: $top_total_nb_pages_listed pages $fr_infos
\n" unless $top_silent; # We've no more to display clean things topCleanup(); } # Print The Footer & cleanup topFooterHTML(); # When everything had been printed we can rebuild the global cache if ( $top_create_cache_GLOBAL ) { topDBG "Rebuilding global cache ..."; unlink( $top_cache_name_GLOBAL ); my $fh = new FileHandle "> " . $top_cache_name_GLOBAL; if ( !defined $fh ) { topError "Can't create GLOBAL cache file $top_cache_name_GLOBAL: $!"; } # lock flock( $fh, $LOCK_EX ); # write infos foreach $hits ( @top_nbhits_sorted_tab ) { # get ordered files my @files = @{ $top_ordered_files_hash{ $hits }}; my $file; foreach $file ( @files ) { # extract nb hits infos my @vals = @{ $top_all_files_hash{ $file }}; my $val; print $fh $file; foreach $val ( @vals ) { print $fh " ", $val; } print $fh "\n"; } } # unlock flock( $fh, $LOCK_UN ); $fh->close; topDBG "Rebuilding global cache ... done"; } # Now we could have time to clean unusued caches topCleanupCache(); topDBG "Bye!"; exit(0); # top ends here ---------------------------------------------------------