# ------------------------------------------------------------------------
# HtDig.pm: Perl Module for interface to Ht:/Dig databases & stuff.
# Ht:/Dig is a free Intranet search engine http://www.htdig.org/
#
# Version: 0.2b
# Author: heddy Boubaker <boubaker@cenatoulouse.dgac.fr>
# Revisions:
#  05 Feb 1999 BirthDate
#  25 Feb 1999 Integrating w/ 1.3.0 & Zlib + more OO stuff
#
# Description:
#  This perl module is an interface to some of the Ht://Dig search engine
#  internals, it,  actually, is able to read and parse the Ht://Dig config 
#  files, read records from all Ht://Dig generated databases. 
#  
# TODO:
#  > Too Many things ...;-)
#  > The common_url_parts stuff is not yet implemented 
#    so if you didn't EXPLICITELY set common_url_parts to ''
#    in your config files your docdb database should be 
#    inexploitable!!
#  > Only reading the databases is possible now should add writting
#    and modifying them too.
# ------------------------------------------------------------------------

#use diagnostics -verbose;
use strict; 
#no strict 'vars';
require 5.00502;

use Carp;
use FileHandle;



# ------------------------------------------------------------------------
# HtDig: Interface to htdig stuff
# ------------------------------------------------------------------------

package HtDig;
use vars qw(@ISA @EXPORT_OK @EXPORT $DBG $VERSION);
require Exporter;
@HtDig::ISA = qw(Exporter);
@HtDig::EXPORT_OK = qw(config doc_db word_db verbosity);

# Configurable vars, see HtDig installation docs: file htdig-x.x.x/CONFIG
# All there varibales are stored too in the htdig->config

use constant PREFIX              => '/www/Tools/Htdig';          #@prefix@
use constant EXEC_PREFIX         => '/www/Tools/Htdig';          #@exec_prefix@
use constant DEST                => PREFIX;                      #${prefix}
use constant BIN_DIR             => EXEC_PREFIX . '/bin';        #${exec_prefix}/bin
use constant CONFIG_DIR          => DEST . '/conf';              #${DEST}/conf
use constant COMMON_DIR          => DEST . '/common';            #${DEST}/common
use constant DATABASE_DIR        => DEST . '/db';                #${DEST}/db
use constant DEFAULT_CONFIG_FILE => CONFIG_DIR . '/htdig.conf';  #${CONFIG_DIR}/htdig.conf
use constant CGIBIN_DIR          => '/www/Tools/Htdig/cgi-bin';  #@CGIBIN_DIR@
use constant IMAGE_DIR           => '/www/Documents/images';     #@IMAGE_DIR@
use constant IMAGE_URL_PREFIX    => '/images';                   #@IMAGE_URL_PREFIX@
use constant SEARCH_DIR          => '/www/Tools/Htdig/html';     #@SEARCH_DIR@
use constant SEARCH_FORM         => 'search.html';               #@SEARCH_FORM@

use constant VERSION => '0.2b';

$HtDig::DBG = 0;

##### Methods #####

### Sub new() ###
# $htdig = new HtDig( [Config=>config_name, Verbose=>[>=0]] );
###
sub new {
    my $class   = shift;
    my %params  = @_;
    my $config_name  = $params{'Config'};
    my $verbose = $HtDig::DBG = $params{'Verbose'};
    my $config = new HtDig::Config ( 
                                    'Config'  => $config_name, 
                                    'Verbose' => $verbose );
    if ( !$config ) {
        &dbg( undef, 0, "Pb reading Config \"$config_name\" - $! -" );
        return undef;
    }
    my $self = {
        'config'  => $config,
        'verbose' => $verbose,
    };
    bless $self, $class;
    $self->_init_from_conf();
    return $self;
} # end new();


### Sub config() ###
# $htdigconf = $htdig->config( [new config name] );
###
sub config {
    my ( $self, $config_name ) = @_;
    if ( $config_name ) {
        # new config asked
        my $newconf = new HtDig::Config ( 'Config'=>$config_name );
        if ( $newconf ) {
            # replace old by new 
            $self->{'config'} = $newconf;
            &_init_from_conf( $newconf );
        } else {
            $self->dbg( 1, "Failure accessing new config $config_name, previous unchanged" );
        }
    }
    return $self->{'config'};
} # end config();


### Sub doc_db() ###
# $htdigdocdb = $htdig->doc_db( [doc db file name, config] );
###
sub doc_db {
    my ($self, $docdbfile, $config) = @_;
    $config    ||= $self->{'config'};
    $docdbfile ||= $config->{'doc_db'};
    return HtDig::DB::Doc->new( 
                               'File'   => $docdbfile, 
                               'Config' => $config );
} # end doc_db();


### Sub doc_index() ###
# $htdigdocindex = $htdig->doc_index( [doc index file name, config] );
###
sub doc_index {
    my ($self, $docindexfile, $config) = @_;
    $config    ||= $self->{'config'};
    $docindexfile ||= $config->{'doc_index'};
    return HtDig::DB::DocIndex->new( 
                                    'File'   => $docindexfile, 
                                    'Config' => $config );
} # end doc_index();


### Sub word_db() ###
# $htdigworddb = $htdig->word_db( [word db file name, config] );
###
sub word_db {
    my ($self, $worddbfile, $config) = @_;
    $config     ||= $self->{'config'};
    $worddbfile ||= $config->{'word_db'};
    return HtDig::DB::Word->new ( 
                                 'File'   => $worddbfile, 
                                 'Config' => $config );
} # end word_db();


### Sub verbosity() ###
# $v = $htdig->verbosity( [level] );
###
sub verbosity {
    my ($self, $vlevel) = @_;
    if ( !defined $vlevel ) {
        if ( ref( $self )) {
            $vlevel = $self->{'verbose'};
        } else {
            $vlevel = $DBG;
        }
    } else {
        if ( ref( $self )) {
            $self->{'verbose'} = $vlevel;
        } else {
            $DBG = $vlevel;
        }
    }
    return $vlevel;
} # end verbosity();


sub dbg {
    my $self  = shift;
    my $level = shift;
    my @args  = @_;
    my $dbg   = 0;
    if ( $self && $self->{'verbose'}) {
        $dbg = $self->{'verbose'};
    } else {
        $dbg = $HtDig::DBG;
    }
    if ( $level < ($dbg -1)) {
        &Carp::cluck( "HtDig [$level]: ", @args );
    } elsif ( $level <= $dbg ) {
        &Carp::carp( "HtDig [$level]: ", @args );
    }
} # end dbg();


##### Private #####

sub _init_from_conf {
    my $self  = shift;
    my $config = $self->{'config'};
    # Init some few things
    my $locale = $config->{'locale'};
    if ( defined $locale ) {
        use POSIX qw(locale_h);
        my $oldlocale = setlocale( LC_CTYPE );
        if ( $oldlocale ne $locale ) {
            if ( setlocale( LC_CTYPE, $locale )) {
                $self->dbg( 1, "locale changed from $oldlocale to $locale" );
            } else {
                $self->dbg( 1, "locale not changed to $locale ... still $oldlocale - $! -" );
            }
        }
    }
} # end _init_from_conf();

## end of HtDig package



# ------------------------------------------------------------------------
# HtDig::Config: Interface to the htdig config
# ------------------------------------------------------------------------

package HtDig::Config;
use vars qw(@ISA @EXPORT_OK @EXPORT $AUTOLOAD);
require Exporter;
@HtDig::Config::ISA = qw(Exporter);


##### Private vars #####

# The configuration stolen from htcommon/defaults.cc
my %default_config = 
    (
     'bad_word_list'          => '${common_dir}/bad_words', 
     'bin_dir'                => HtDig::BIN_DIR, 
     'cgibin_dir'             => HtDig::CGIBIN_DIR, 
     'common_dir'             => HtDig::COMMON_DIR, 
     'common_url_parts'       => 'http:// http://www. ftp:// ftp://ftp. /pub/ .html .gif .jpg .jpeg /index.html /index.htm .com/ .com mailto:', 
     'compression_level'      => 0, 
     'config_dir'             => HtDig::CONFIG_DIR, 
     'config_file'            => HtDig::DEFAULT_CONFIG_FILE,
     'create_url_list'        => 0, 
     'create_images_list'     => 0, 
     'database_dir'           => HtDig::DATABASE_DIR, 
     'database_base'          => '${database_dir}/db', 
     'doc_db'                 => '${database_base}.docdb',
     'doc_index'              => '${database_base}.docs.index',
     'doc_list'               => '${database_base}.docs',
     'endings_affix_file'     => '${common_dir}/english.aff', 
     'endings_dictionary'     => '${common_dir}/english.0',
     'endings_root2word_db'   => '${common_dir}/root2word.db',
     'endings_word2root_db'   => '${common_dir}/word2root.db',
     'excerpt_length'         => 300, 
     'excerpt_show_top'       => 0, 
     'image_dir'              => HtDig::IMAGE_DIR, 
     'image_list'             => '${database_base}.images',
     'image_url_prefix'       => HtDig::IMAGE_URL_PREFIX, 
     'iso_8601'               => 0, 
     'logging'                => 0, 
     'metaphone_db'           => '${database_base}.metaphone.db', 
     'minimum_word_length'    => 3, 
     'nothing_found_file'     => '${common_dir}/nomatch.html',
     'robotstxt_name'         => 'htdig', 
     'search_form'            => HtDig::SEARCH_DIR. '/' . HtDig::SEARCH_FORM, 
     'search_results_footer'  => '${common_dir}/footer.html', 
     'search_results_header'  => '${common_dir}/header.html', 
     'search_results_wrapper' => '', 
     'server_aliases'         => '', 
     'start_url'              => 'http://www/', 
     'sort'                   => 'score', 
     'sort_names'             => 'score Score time Time title Title revscore \'Reverse Score\' revtime \'Reverse Time\' revtitle \'Reverse Title\'', 
     'soundex_db'             => '${database_base}.soundex.db', 
     'star_blank'             => '${image_url_prefix}/star_blank.gif',
     'star_image'             => '${image_url_prefix}/star.gif',
     'star_patterns'          => '', 
     'substring_max_words'    => 25, 
     'synonym_dictionary'     => '${common_dir}/synonyms', 
     'synonym_db'             => '${common_dir}/synonyms.db', 
     'syntax_error_file'      => '${common_dir}/syntax.html', 
     'template_map'           => 'Long builtin-long builtin-long Short builtin-short builtin-short', 
     'template_name'          => 'builtin-long',
     'uncoded_db_compatible'  => 1, 
     'url_list'               => '${database_base}.urls', 
     'url_log'                => '${database_base}.log', 
     'url_part_aliases'       => '', 
     'use_meta_description'   => 0, 
     'user_agent'             => 'htdig',
     'valid_punctuation'      => '.-_/!#$%^&*\'', 
     'word_db'                => '${database_base}.words.db', 
     'word_list'              => '${database_base}.wordlist',
     );

my %expansion_done = undef;
my %included = undef;


##### Methods #####

### Sub new() ###
# $htdigconfig = $htdig->config( ... ); Use It or
# $htdigconfig = new HtDig::Config(Config=>name|File=>file)
###
sub new {
    my $class  = shift;
    my %params = @_;
    my $config_name = $params{'Config'};
    my $config_file = $params{'File'};
    if ( $config_file ) {
        # read from config file
        # Allow config file to end with something != .conf (for ex .conf.~1~ is ok)
        ($config_name) = $config_file =~ /\/([^\/]+)\.conf/;
    } elsif ( !$config_name ) {
        # use default
        ($config_name) = HtDig::DEFAULT_CONFIG_FILE =~ /\/([^\/]+)\.conf$/;
        $config_file   = HtDig::DEFAULT_CONFIG_FILE;
    } elsif ( $config_name =~ /[\/]/ ) {
        # invalid config name
        &HtDig::dbg( undef, 0, "Invalid config name \"$config_name\"" );
        return undef;
    } elsif ( $config_name eq '__DEFAULT__' ) {
        # invalid config name
        &HtDig::dbg( undef, 0, "Using default config name from HtDig.pm" );
        $config_file = '';
    } else {
        # get from config name
        $config_file = $default_config{'config_dir'} . '/' . $config_name . '.conf';
    }
    # Create the config object
    my $self = {
        %default_config, 
    };
    bless $self, $class;
    $self->{'config_name'} = $config_name;
    $self->{'config_file'} = $config_file;
    # read all the config files and store results in %the_config
    %expansion_done = %included = undef;
    if ( $self->_parse_configfile( $config_file ) == 0 ) {
        &HtDig::dbg( undef, 0, "Reading configuration \"$config_name\" failed" );
        return undef;
    }
    # we should now expand all unexpanded vars 
    &HtDig::dbg( undef, 3, "Config Expanding vars ..." );
    while ( my ($k, $v) = each %$self ) {
        $self->_expand_val( $k, $v );
    }
    return $self;
} # end new();


### Sub keys() ###
# @keys = $htdigconf->keys();
###
sub keys {
    my $self = shift;
    return keys %$self;
} # end keys();


### Sub AUTOLOAD() ###
# AUTOLOAD( );
# $val = $htdigconf->key
###
sub AUTOLOAD {
    my $self = shift;
    my $name = $AUTOLOAD =~ s/.*:://;
    return undef if $name eq 'DESTROY';
    return $self->{$name};
} # end AUTOLOAD();


##### Internals #####

sub _parse_configfile {
    my ($self, $configfile) = @_;
    &HtDig::dbg( undef, 2,  
                   "Config: Openning File \"$configfile\"");
    return 1 if ( !$configfile || $included{$configfile} );
    my $fh = new FileHandle $configfile, "r";
    if ( !defined $fh ) {
        &HtDig::dbg( undef, 0, "Config file $configfile unreadable - $! -" );
        return 0;
    }
    &HtDig::dbg( undef, 3, "Config: Parsing File \"$configfile\" ..." );
    my ($keyword, $val, $continued, $line, $linenum) = undef;
    while ($line = <$fh>) {
        $linenum++;
        # cleanup the line
        chomp $line;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next if $line =~ /^(#|$)/;
        &HtDig::dbg( undef, 4, "Config: Parsing Line \"$line\" ..." );
        my ($bs) = $line =~ /\s*(\\)$/;
        $line =~ s/\s*\\?$//;
        # the line had been cleaned up now use it
        if ( $continued ) {
            &HtDig::dbg( undef, 5, "\t\tnext... \"$line\" $bs" );
            if ( $bs eq '\\' ) {
                $val .= ' ' . $line;
                $continued = 1;
                next;
            } else {
                # value terminated
                $val .= ' ' . $line;
                $self->_parse_val( $keyword, $val, $configfile, $linenum );
                ($keyword, $val, $continued) = 0;
                next;
            }
        } else {
            my ( $k, $v ) = $line =~ /(\w+)\s*:\s*(.*)$/;
            $keyword = $k;
            $v =~ s/^\s+//;
            $v =~ s/\s+$//;
            &HtDig::dbg( undef, 5, "\t\tfirst... $k \"$v\" $bs" );
            $val = $v;
            if ( $bs eq '\\' ) {
                $continued = 1;
                next;
            } elsif ( $keyword ) {
                # value terminated
                $self->_parse_val( $keyword, $val, $configfile, $linenum );
                ($keyword, $val, $continued) = 0;
                next;
            } else {
                ($keyword, $val, $continued) = 0;
                next;
            }
        }
    }
    $included{$configfile} = 1;
    &HtDig::dbg( undef, 3, "Config: Parsing File \"$configfile\" ...done" );
    return 1;
} # end _parse_configfile();


sub _parse_val {
    my ( $self, $k, $v, $cf, $ln ) = @_;
    &HtDig::dbg( undef, 4, "Config: Parsing Val $k: \"$v\"" );
    $k = lc $k;
    $v =~ s/^\s+//;
    $v =~ s/\s+$//;
    # check for includes
    if ( $k eq 'include' ) {
        if ( $v !~ /^\// ) {
            # append config dir 
            $v = $self->{'config_dir'} . '/' . $v;
        }
        # val should be expanded now so that we can read the file
        $v = $self->_expand_val( undef, $v );
        if ( !$self->_parse_configfile( $v )) {
            &HtDig::dbg( undef, 0, "Config: $cf, line $ln. Unable to include file $v" );
        }
    } else {
        if ( $v =~ /^(yes|true)$/i ) {
            $v = 1;
        } elsif ( $v =~ /^(no|false)$/i ) {
            $v = 0;
        }
        $self->{ $k } = $v;
        &HtDig::dbg( undef, 4, "Config: Adding $k: \"$v\"" );
    }
} # end _parse_val();


sub _expand_val {
    my ( $self, $k, $v ) = @_;
    &HtDig::dbg( undef, 4, "Config: Expanding Val $k: \"$v\"" );
    if ( $k ) {
        return $self->{$k} if ( $expansion_done{$k} );
    }
    while ( $v =~ /\${(\w+)}/ ) {
        my $k2 = lc $1;
        &HtDig::dbg( undef, 5, "Config: Expanding $k2 in $k: \"$v\"" );
        my $v2 = $self->_expand_val( $k2, $self->{$k2} );
        $v  =~ s/(\${$k2})/$v2/;
    }
    if ( $k ) {
        $self->{$k} = $v;
        $expansion_done{$k} = 1;
    }
    return $v;
} # end _expand_val();

## end of HtDig::Config package



# ------------------------------------------------------------------------
# package HtDig::DBRecord: 
# ------------------------------------------------------------------------

package HtDig::DBRecord;
# Records of HtDigs DB
use vars qw(@ISA @EXPORT_OK @EXPORT);
require Exporter;
@HtDig::Doc::ISA = qw(Exporter);


### Sub new() ###
sub new {
    my $class      = shift;
    my %params     = @_;
    my $self       = {
        '_valid' => 0, 
        '_KEY'   => $params{'Key'}, 
    };
    return bless $self, $class;
} # end new();


### Sub key() ###
# key( );
###
sub key {
    my ($self, $key) = @_;
    if ( $key ) {
        return $self->{'_KEY'} = $key;
    } else {
        return $self->{'_KEY'};
    }
} # end key();


### Sub validity() ###
# validity( );
###
sub validity {
    my ($self, $validity) = @_;
    if ( $validity ) {
        return $self->{'_valid'} = $validity;
    } else {
        return $self->{'_valid'};
    }
} # end validity();


### Sub print() ###
# print( );
###
sub print {
    my ($self, $long) = @_;
    if ( !$self->{'_valid'} ) {
        print $self =~ /^([^=]+)/ , " invalid object, ";
        if ( !$long ) {
            print "\n"; 
            return;
        }
    }
    if ( defined $self->{'_KEY'} ) {
        printf ( "KEY: %-20s", $self->{'_KEY'} );
    } else {
        print "KEY: _undefined_";
    }
    print "\n";
    return unless $long;
    foreach (my ($key, $ref) = each %$self ) {
        next if $key =~ /^_KEY$/;
        print "\t$key\t=> ";
        $self->_print_any( $ref, "\t\t" );
        print "\n";
    }    
} # end print();


sub _print_any {
    my ($self, $ref, $tab) = @_;
    $tab .= "\t";
    if ( ref( $ref ) =~ /ARRAY/ ) {
        my $i = 0;
        print '/ARRAY/ #', scalar @$ref;
        foreach ( @$ref ) {
            print "\n$tab\[$i\] ";
            $self->_print_any( $_, $tab );
            $i++;
        }
    } elsif  ( ref( $ref ) =~ /HASH/ ) {
        print "/HASH/ ";
        foreach ( keys %$ref ) {
            print "\n$tab$_ => ";
            $self->_print_any( $$ref{$_}, $tab );
        }
    } elsif  ( ref( $ref ) =~ /SCALAR/ ) {
        print "/SCALAR/ $$ref", 
    } else {
        print $ref;
    }
} # end _print_any();

## end of HtDig::DBRecord package



# ------------------------------------------------------------------------
# package HtDig::DBRecord::Doc: 
# ------------------------------------------------------------------------

package HtDig::DBRecord::Doc;
# Records of Docs DB
use vars qw(@ISA @EXPORT_OK @EXPORT);
require Exporter;
@HtDig::DBRecord::Doc::ISA = qw(HtDig::DBRecord);


### Sub new() ###
sub new {
    my $class      = shift;
    my %params     = @_;
    my $self       = $class->SUPER::new( %params );
    return bless $self, $class;
} # end new();


### Sub id() ###
# id( );
###
sub id {
    my ($self, $id) = @_;
    if ( $id ) {
        return $self->{'Id'} = $id;
    } else {
        return $self->{'Id'};
    }
} # end id();


### Sub url() ###
# url( );
###
sub url {
    my ($self, $url) = @_;
    if ( defined $url ) {
        return $self->{'Url'} = $url;
    } else {
        return $self->{'Url'};
    }
} # end url();


### Sub title() ###
# title( );
###
sub title {
    my ($self, $title) = @_;
    if ( $title ) {
        return $self->{'Title'} = $title;
    } else {
        return $self->{'Title'};
    }
} # end title();


### Sub size() ###
# size( );
###
sub size {
    my ($self, $size) = @_;
    if ( $size ) {
        return $self->{'Size'} = $size;
    } else {
        return $self->{'Size'};
    }
} # end size();


### Sub description() ###
# description( );
###
sub description {
    my ($self) = @_;
    my $result = undef;
    # try first meta description
    $result = $self->{'METADSC'};
    if ( defined $result && length $result > 0 ) {
        $result =~ s/(\s+)/ /mg;
        return $result;
    }
    # Extract the top of the HEAD field (3 lines 80 chr is enought)
    my $head = $self->{'HEAD'};
    my $lh   = length $head;
    $lh = 240 if ( $lh > 240 );
    $result = substr $head, 0, $lh;
    if ( defined $result && length $result > 0 ) {
        $result =~ s/(\s+)/ /mg;
        return $result;
    }
    # then concat the descriptions field
    my $rdesc = $self->{'DESCRIPTIONS'};
    foreach ( @$rdesc ) {
        chop;
        $result .= $_ . ", ";
    }
    if ( defined $result && length $result > 0 ) {
        $result =~ s/(\s+)/ /mg;
        return $result;
    }
    # Title
    $result = $self->title();
    if ( defined $result && length $result > 0 ) {
        $result =~ s/(\s+)/ /mg;
        return $result;
    }
    # URL
    my ($path) = $self->url() =~ /:\/\/[^\/]+\/([^\?]*)/;
    foreach ( reverse split '\/', $path ) {
        $result .= $_ . ", ";
    }
    if ( defined $result && length $result > 0 ) {
        $result =~ s/(\s+)/ /mg;
        return $result;
    }
    # what to do else ???
    return $result;
} # end description();


### Sub print() ###
# print( );
###
sub print {
    my ($self, $long) = @_;
    if ( $self->validity ) {
        printf ( "%s> [%-20s]:\t(id %3i)\n", $self =~ /^([^=]+)/, $self->url, $self->id );
        return unless $long;
        my $key = undef;
        foreach $key ( keys %$self ) {
            my $ref = $self->{$key};
            next if $key =~ /^(Url|Id|_valid)$/;
            printf "\t%-20s => ", $key;
            $self->_print_any( $ref, "\t\t" );
            print "\n";
        }
    } elsif ( $long ) {
        $self->SUPER::print( $long );
    }
} # end print();

## end of HtDig::DBRecord::Doc package



# ------------------------------------------------------------------------
# package HtDig::DBRecord::Word: 
# ------------------------------------------------------------------------

package HtDig::DBRecord::Word;
# Records of Words DB
use vars qw(@ISA @EXPORT_OK @EXPORT);
require Exporter;
@HtDig::DBRecord::Word::ISA = qw(HtDig::DBRecord);


### Sub new() ###
#sub new {
#    my $class      = shift;
#    my %params     = @_;
#    my $self       = $class->SUPER::new( %params );
#    return bless $self, $class;
#} # end new();
### Sub new() ###


### Sub word() ###
# word( );
###
sub word {
    my ($self, $word) = @_;
    if ( $word ) {
        return $self->{'Word'} = $word;
    } else {
        return $self->{'Word'};
    }
} # end word();


### Sub ndocs() ###
# ndocs( );
###
sub ndocs {
    my ($self, $nb) = @_;
    if ( $nb ) {
        return $self->{'NDOCS'} = $nb;
    } else {
        return $self->{'NDOCS'};
    }
} # end ndocs();


### Sub docs_ids() ###
# @ids = $word->docs_ids( );
###
sub docs_ids {
    my ($self) = @_;
    my @ids;
    if ( $self->ndocs() > 0 ) {
        my $doc;
        foreach $doc ( @{$self->{'DOCS'}} ) {
            if ( $doc->{'ID'} ) {
                push @ids, $doc->{'ID'};
            }
        }
    } 
    return @ids;
} # end docs_ids();


### Sub print() ###
# $word->print( [long] );
###
sub print {
    my ($self, $long) = @_;
    if ( $self->validity ) {
        printf ( "%-20s:\t%3d time in %2i doc(s)/id(s): ", $self->word(), $self->{'TOTAL'}, $self->ndocs());
        foreach ( $self->docs_ids()) {
            print "$_, ";
        }
        print "\n";
        return unless $long;
        my $key = undef;
        foreach $key ( keys %$self ) {
            my $ref = $self->{$key};
            next if $key =~ /^(Word|NDOCS|TOTAL)$/;
            printf "\t%-20s => ", $key;
            $self->_print_any( $ref, "\t\t" );
            print "\n";
        }
    } elsif ( $long ) {
        $self->SUPER::print( $long );
    }
} # end print();


## end of HtDig::DBRecord::Word package



# ------------------------------------------------------------------------
# package HtDig::DBRecord::DocIndex: 
# ------------------------------------------------------------------------

package HtDig::DBRecord::DocIndex;
# Records of DocIndex DB
use vars qw(@ISA @EXPORT_OK @EXPORT);
require Exporter;
@HtDig::DBRecord::DocIndex::ISA = qw(HtDig::DBRecord);


### Sub new() ###
#sub new {
#    my $class      = shift;
#    my %params     = @_;
#    my $self       = $class->SUPER::new( %params );
#    return bless $self, $class;
#} # end new();
### Sub new() ###

### Sub url() ###
# url( );
###
sub url {
    my ($self, $url) = @_;
    if ( $url ) {
        return $self->{'Url'} = $url;
    } else {
        return $self->{'Url'};
    }
} # end url();


### Sub id() ###
# id( );
###
sub id {
    my ($self, $id) = @_;
    if ( $id ) {
        return $self->{'Id'} = $id;
    } else {
        return $self->{'Id'};
    }
} # end id();


### Sub print() ###
# print( );
###
sub print {
    my ($self, $long) = @_;
    if ( $self->validity ) {
        printf ( "%3d :\t%20s\n", $self->id(), $self->url(), );
        return unless $long;
        my $key = undef;
        foreach $key ( keys %$self ) {
            my $ref = $self->{$key};
            next if $key =~ /^(Url|Id)$/;
            printf "\t%-20s => ", $key;
            $self->_print_any( $ref, "\t\t" );
            print "\n";
        }
    } elsif ( $long ) {
        $self->SUPER::print( $long );
    }
} # end print();


## end of HtDig::DBRecord::DocIndex package



# ------------------------------------------------------------------------
# package HtDig::DB: Abstract class for the HtDig database
# ------------------------------------------------------------------------

package HtDig::DB;
# Pseudo Pure virtual class
# Sub Classes MUST override _(de)serialize_data method
require BerkeleyDB;
require Exporter;
use vars qw(@ISA @EXPORT_OK @EXPORT);
@HtDig::DB::ISA = qw(Exporter);
@HtDig::DB::EXPORT_OK = qw(the_db process);


### Sub new() ###
# $db = new( File=>dbfile, [Flags=>flags] );
# For use from sub classes only!!!
###
sub new {
    my $class        = shift;
    my %params       = @_;
    my $self         = {};
    my $file         = $self->{'File'} = $params{'File'};
    $self->{'Flags'} = $params{'Flags'} || $BerkeleyDB::DB_RDONLY;
    my %DB           = undef;
    tie( %DB, 
        'BerkeleyDB::Btree', 
        '-Filename' => $file, 
        '-Flags'    => $self->{'Flags'}, 
        ) || Carp::confess( "Error: ",  $file, " - $! -" );
    &HtDig::dbg( undef, 2, "DB: Tied ", $file, " to DB" );
    $self->{'DB'} = \%DB;
    return bless $self, $class;
} # end new();


### Sub the_db() ###
# $tiedhash = $db->the_db( );
###
sub the_db {
    my ($self) = @_;
    return $self->{'DB'};
} # end the_db();


### Sub writeable() ###
# $db->writeable();
###
sub writeable {
    my ($self) = @_;
    if ( $self->{'Flags'} == $BerkeleyDB::DB_RDONLY ) {
        return 0;
    } else {
        return 1;
    }
} # end writeable();


### Sub value() ###
# $record = $db->value( $key );
###
sub value {
    my ( $self, $key ) = @_;
    my $value = $$self->{'DB'}{$key};
    if ( defined $value ) {
        $value = $self->_deserialize_data( $key, $value );
    }
    return $value;
} # end value();


### Sub process() ###
# $db->process( {sub} );
# Replace foreach (($k, $v) = each %DB ){ sub{$v) }
###
sub process {
    my ( $self, $func ) = @_;
    my ( $key, $value, $obj_ref ) = undef;
    &HtDig::dbg( undef, 3, "DB: Processing File ",  $self->{'File'}, " ..." );
    while (( $key, $value ) = each %{$self->{'DB'}} ) {
        &HtDig::dbg( undef, 4, "DB: Processing Key \"$key\"..." );
        $obj_ref = $self->_deserialize_data( $key, $value );
        #print "Eval w/ $key\n";
        if ( $obj_ref && $func ) { eval &$func( $obj_ref )};
    }
    &HtDig::dbg( undef, 3, "DB: Processing File ",  $self->{'File'}, " ...done" );
} # end process();



##### Internals #####

### Sub _deserialize_data() ###
# $record = $db->_deserialize_data( $key, $rawdata );
# This made this class pure virtual, 
# only subclasses overriding this method should be instantiated!!
###
sub _deserialize_data {
    &Carp::confess( "This method shoud have been overridden in sub classes!!!" );
    return undef;
} # end _deserialize_data();


### Sub _serialize_data() ###
# $rawdata = $db->_serialize_data( $key, $record );
# This made this class pure virtual, 
# only subclasses overriding this method should be instantiated!!
###
sub _serialize_data {
    &Carp::confess( "This method shoud have been overridden in sub classes!!!" );
    return undef;
} # end _serialize_data();


use constant CHARSIZE_MARKER_BIT  => 64;
use constant SHORTSIZE_MARKER_BIT => 128;
my %types_map = 
    (
     'char'   => ["c", length pack( "c", undef )], 
     'uchar'  => ["C", length pack( "C", undef )], 
     'short'  => ["s", length pack( "s", undef )], 
     'ushort' => ["S", length pack( "S", undef )], 
     'int'    => ["i", length pack( "i", undef )], 
     'uint'   => ["I", length pack( "I", undef )], 
     'long'   => ["l", length pack( "l", undef )],
     'ulong'  => ["L", length pack( "L", undef )],
     );


sub _sizeof ( $ ) {
    return @{$types_map{@_[0]}}[1];
} # end _sizeof();


sub _getnum ( $$$ ) {
    my ( $type, $rv, $ctype )   = @_;
    my ( $templ, $len );
    if ( $type & CHARSIZE_MARKER_BIT ) {
        ( $templ, $len ) = @{$types_map{ 'uchar' }};
    } elsif ( $type & SHORTSIZE_MARKER_BIT ) {
        ( $templ, $len ) = @{$types_map{ 'ushort' }};
    } else {
        ( $templ, $len ) = @{$types_map{ $ctype }};
    }
    my $v = unpack( $templ, $$rv );
    &HtDig::dbg( undef, 6, "DB: Got $v - $templ, $len, $ctype - $type" );
    $$rv  = substr( $$rv, $len );
    return $v;
} # end _getnum();


sub _getstring ( $$$ ) {
    my ( $type, $rv, $length ) = @_;
    # get encoded string length
    $length ||= &_getnum( $type, $rv, 'int' );
    &HtDig::dbg( undef, 5, "DB: Got string $length - $type" );
    my $v      = unpack "A$length", $$rv;
    $$rv       = substr( $$rv, $length );
    return $v;
} # end _getstring();

## end of HtDig::DB::Doc package


sub _getlist ( $$ ) {
    my ( $type, $rv ) = @_;
    # ent encoded elements count
    my $count   = &_getnum( $type, $rv, 'int' );
    &HtDig::dbg( undef, 5, "DB: Got list $count - $type" );
    my (@result, $s, $i);
    if ( $type & (CHARSIZE_MARKER_BIT|SHORTSIZE_MARKER_BIT)) {
        my $c;
        for ( $i = 0; $i < $count; $i++ ) {
            $c = &_getnum( ~(CHARSIZE_MARKER_BIT|SHORTSIZE_MARKER_BIT), $rv, 'uchar' );
            if ( $c < ~1 ) {
                $s = &_getstring( ~(CHARSIZE_MARKER_BIT|SHORTSIZE_MARKER_BIT), $rv, $c );
            } else {
                $s = &_getstring( ~(CHARSIZE_MARKER_BIT|SHORTSIZE_MARKER_BIT), $rv );
            }
            $s =~ s/^\s+//;
            $s =~ s/\s+$//;
            next if (!$s || $s !~ /\w/);
            #$result[$idx++] = $s;
            push( @result, $s );
        }
    } else {
        for ( $i = 0; $i < $count; $i++ ) {
            $s = &_getstring( ~(CHARSIZE_MARKER_BIT|SHORTSIZE_MARKER_BIT), $rv );
            $s =~ s/^\s+//;
            $s =~ s/\s+$//;
            next if (!$s || $s !~ /\w/);
            #$result[$idx++] = $s;
            push( @result, $s );
        }
    }
    return \@result;
} # end _getlist();


## end of HtDig::DB package



# ------------------------------------------------------------------------
# package HtDig::DB::Doc: Interface to the docdb database
# ------------------------------------------------------------------------

package HtDig::DB::Doc;
# Contain HtDig::DBRecord::Doc records
use vars qw(@ISA @EXPORT);
require Exporter;
@HtDig::DB::Doc::ISA  = qw(HtDig::DB);
@HtDig::DB::Doc::EXPORT = qw();

##### Exported #####

### Sub new() ###
# new( );
###
sub new {
    my $class      = shift;
    # get parameters
    my %params     = @_;
    my $config     = $params{'Config'};
    if ( !defined $config ) {
        &HtDig::dbg( undef, 0, "DB::Doc: Parameter Config is mandatory" );
        return undef;
    }
    my $file = $params{'File'} || $config->{'doc_db'};
    $params{'File'} = $file;
    # create the self object
    my $self = $class->SUPER::new( %params );
    return undef unless ( defined $self );
    bless $self, $class;
    # inits
    $self->{'Config'} = $config;
    if ( !$self->_init( $config )) {
        &HtDig::dbg( undef, 1, "DB::Doc: Initializations failed" );
        return undef;
    }
    return $self;
} # end new();


##### Internals #####


sub _init {
    my ( $self, $config ) = @_;
    
    # Init compression scheme
    my $comp = $config->{'compression_level'};
    if ( $comp < 0 ) {
        $comp = $config->{'compression_level'} = 0;
    } elsif ( $comp > 9 ) {
        $comp = $config->{'compression_level'} = 9;
    }
    if ( $comp > 0 ) {
        &HtDig::dbg( undef, 2, "DB::Doc: Using `compressed' db (lev: $comp)" );
        # use compression with Zlib
        use Compress::Zlib;
        # Build decompression stuff
        my ($zi, $status) = &Compress::Zlib::inflateInit();
        if ( $status != Z_OK ) {
            &HtDig::dbg( undef, 0, "DB::Doc: Zlib::inflateInit Failure - $status -" );
            return undef;
        } else {
            &HtDig::dbg( undef, 2, "DB::Doc: Uncompressing with Zlib" );
            # build store the uncompressing function for further usage in
            # the deserialize stream
            $self->{'_uncompress'} = sub { 
                my $rv = shift;
                my $v  = "";
                # I really don't understand why this should be done
                # each time !!!!?
                my ($zi, $status) = &Compress::Zlib::inflateInit();
                if ( $status != Z_OK ) {
                    &HtDig::dbg( undef, 0, "DB::Doc: Zlib::inflateInit Failure - $status -" );
                    return undef;
                }
                do {
                    my ($out, $status) = $zi->inflate( $rv );
                    $v .= $out;
                    return $v if ( $status == Z_STREAM_END );
                    next if ( $status == Z_OK );
                    last;
                } while(1);
                &HtDig::dbg( undef, 1, "DB::Doc: Inflate failure - $status -" );
                return undef; }
        }
        # Build Compression stuff
        # TODO ...
        $self->{'_compress'} = sub{ return ${@_[0]}; };#ident - no compression
    } else {
        # No compression used
        &HtDig::dbg( undef, 2, "DB::Doc: Using `flat' db" );
        $self->{'_uncompress'} = $self->{'_compress'} = sub{ return ${@_[0]}; };#ident - no compression
    }
    
    # Init url parts stuff
    # From a string "pat1 repl1 pat2 repl2 ... patn repln" we 
    # build a hash of { repli => pati ...} to easily build
    # with that a compiled pattern matching/replace expression.
    my @url_part_aliases = split( /\s+/, $config->{'url_part_aliases'} );

    my %encode_parts     = @url_part_aliases;
    my %decode_parts     = reverse @url_part_aliases;
    my @common_url_parts = split( /\s+/, $config->{'common_url_parts'});
    # I should work with common_url_parts here but that's still unknown to me
    # TODO...
    if ( $#common_url_parts > 0 ) {
       &HtDig::dbg( undef, 0, 
                   "DB::Docs: Expansion of \`common_url_parts\' not implemented yet!!", 
                   " Some STRANGE RESULTS are predictible now !!!" );
    }
    my ( $i, $internal_encoding, $cup ) = 0;
    while ( $cup = @common_url_parts ) {
        $cup =~ s/^\s+//;
        $cup =~ s/\s+$//;
        next if ( $cup !~ /\w/ );
        # Build a replacement pattern here
        $internal_encoding = &_getencoding( $i++ );
        # TODO
    }

    # Build the decoder
    my ($decoder, $k, $v);
    foreach $k ( keys %decode_parts ) {
        $k =~ s/^\s+//;
        $k =~ s/\s+$//;
        next if ( $k !~ /\w/ );
        $v = $decode_parts{$k};
        $v =~ s/^\s+//;
        $v =~ s/\s+$//;
        $decoder .= '$v =~ s/';
        $decoder .= quotemeta( $k );
        $decoder .= '/';
        $decoder .= $v;
        $decoder .= '/go;';
    }
    $self->{'_decode'} = $decoder;
    &HtDig::dbg( undef, 4, "DB::Doc: Decoding urls /w: \"", $self->{'_decode'}, "\"" );
    
    # Build the encoder
    # TODO ...
    my $encoder;
    $self->{'_encode'} = $encoder;
    &HtDig::dbg( undef, 4, "DB::Doc: Encoding urls with \"$encoder\"" );
    
    return 1;
} # end _init();


use constant FIRST_INTERNAL_SINGLECHAR => 7;
use constant LAST_INTERNAL_SINGLECHAR  => 31;

sub _getencoding ( $$ ) {
    my ($i, $cup) = @_;
    # Should build the internal encoding as
    # in HtWordCodec.cc
    my $number_to_store = $i + FIRST_INTERNAL_SINGLECHAR;
    if ( $number_to_store <= LAST_INTERNAL_SINGLECHAR ) {
        return $number_to_store;
    } else {
        $number_to_store -= LAST_INTERNAL_SINGLECHAR;
        my $to_store = pack "C", $number_to_store;
        # TODO
        return $to_store;
    }
} # end _getencoding();


sub _deserialize_data {
    my ( $self, $key, $value ) = @_;
    return undef if ( $key =~ /^nextDocID/ );
    my ( $type, $what, $v );
    # reset record
    my $record = new HtDig::DBRecord::Doc( 'Key' => $key );
    while ( length( $value ) > 0 ) {
        # type = (unsigned char)*value++;
	$type  = unpack "C", $value;
        $value = substr $value, 1;
        # check for format
	$what  = $type & ~(HtDig::DB::CHARSIZE_MARKER_BIT|HtDig::DB::SHORTSIZE_MARKER_BIT);
        &HtDig::dbg( undef, 4, "DB::Doc: What $what / Type $type" );
        # see val of enums in htcommon/DocumentRef.cc
        # and see types in common/DocumentRef.h
	if ( $what == 0 )
	{
	    # ID - int
            $v = &HtDig::DB::_getnum( $type, \$value, 'int' );
	    $record->id( $v );
	}
	elsif ( $what == 1 )
	{
	    # TIME - time_t - long
	    $record->{'TIME'} = &HtDig::DB::_getnum( $type, \$value, 'long' );
	}
	elsif ( $what == 2 )
	{
	    # ACCESSED - time_t - long
	    $record->{'ACCESSED'} = &HtDig::DB::_getnum( $type, \$value, 'long' );
	}
	elsif ( $what == 3 )
	{
	    # STATE - ReferenceState - enum => int
            $v = &HtDig::DB::_getnum( $type, \$value, 'int' );
            # enum mapping from DocumentRef.h
            if ( $v == 0 ) {
                $v = 'Normal';
            } elsif ( $v == 1 ) {
                $v = 'NotFound';
            } elsif ( $v == 2 ) {
                $v = 'NoIndex';
            } else {
                $v = "Unknown:$v";
            }
	    $record->{'STATE'} = $v;
	}
	elsif ( $what == 4 )
	{
	    # SIZE - int 
	    $record->size( &HtDig::DB::_getnum( $type, \$value, 'int' ));
	}
	elsif ( $what == 5 )
	{
	    # LINKS - int
	    $record->{'LINKS'} = &HtDig::DB::_getnum( $type, \$value, 'int' );
	}
	elsif ( $what == 6 )
	{
	    # IMAGESIZE - int
	    $record->{'IMAGESIZE'} = &HtDig::DB::_getnum( $type, \$value, 'int' );
	}
	elsif ( $what == 7 )
	{
	    # HOPCOUNT - int
	    $record->{'HOPCOUNT'} = &HtDig::DB::_getnum( $type, \$value, 'int' );
	}
	elsif ( $what == 8 )
	{
	    # URL - String
            $v = &HtDig::DB::_getstring( $type, \$value );
            # _decode do stuff on $v, see _init function.
            #&HtDig::dbg( undef, 1, "DB::Doc: Decoding \"$v\" /w: \"", $self->{'_decode'}, "\"" );
            eval $self->{'_decode'};
	    $record->url( $v );
	}
	elsif ( $what == 9 )
	{
	    # HEAD - String
            $v = &HtDig::DB::_getstring( $type, \$value );
            # heads should be compressed, see _init function
	    $record->{"HEAD"} = &{$self->{'_uncompress'}}( \$v );
	}
	elsif ( $what == 10 )
	{
	    # TITLE - String
	    $record->title( &HtDig::DB::_getstring( $type, \$value ));
	}
	elsif ( $what == 11 )
	{
	    # DESCRIPTIONS - List
	    $record->{"DESCRIPTIONS"} =  &HtDig::DB::_getlist( $type, \$value );
	}
	elsif ( $what == 12 )
	{
	    # ANCHORS - List
	    $record->{"ANCHORS"} = &HtDig::DB::_getlist( $type, \$value );
	}
	elsif ( $what == 13 )
	{
	    # EMAIL - String
	    $record->{"EMAIL"} = &HtDig::DB::_getstring( $type, \$value );
	}
	elsif ( $what == 14 )
	{
	    # NOTIFICATION - String
	    $record->{"NOTIFICATION"} = &HtDig::DB::_getstring( $type, \$value );
	}
	elsif ( $what == 15 )
	{
	    # SUBJECT - String
	    $record->{"SUBJECT"} = &HtDig::DB::_getstring( $type, \$value );
	}
 	elsif ( $what == 16 )
 	{
 	    # STRING (ignore, but unpack anyway)
 	    $record->{"STRING"} = &HtDig::DB::_getstring( $type, \$value );
 	}
 	elsif ( $what == 17 )
 	{
 	    # METADSC - String
 	    $record->{"METADSC"} =  &HtDig::DB::_getstring( $type, \$value );
 	}
 	elsif ( $what == 18 )
 	{
 	    # BACKLINKS - int
 	    $record->{"BACKLINKS"} = &HtDig::DB::_getnum( $type, \$value, 'int' );
 	}
 	elsif ( $what == 19 )
 	{
 	    # SIGNATURE - long int
 	    $record->{"SIG"} = &HtDig::DB::_getnum( $type, \$value, 'long' );
 	} 
        else
        {
            &HtDig::dbg( undef, 1, "DB::Doc: UNKNOWN What $what / Type $type" );
            $record->validity( 0 );
            return $record;
        }
    }
    # Mark record as valid and exit
    $record->validity( 1 );
    return $record;
} # end _deserialize_data();




# ------------------------------------------------------------------------
# HtDig::DB::Word: Interface to the worddb database
# ------------------------------------------------------------------------

package HtDig::DB::Word;
# Contain HtDig::DBRecord::Word records
use vars qw(@ISA);
@HtDig::DB::Word::ISA = qw(HtDig::DB);

##### Exported #####

### Sub new() ###
# new( );
###
sub new {
    my $class      = shift;
    # get parameters
    my %params     = @_;
    my $config     = $params{'Config'};
    if ( !defined $config ) {
        &HtDig::dbg( undef, 0, "DB::Word: Parameter Config is mandatory" );
        return undef;
    }
    my $file = $params{'File'} || $config->{'word_db'};
    $params{'File'} = $file;
    # create the self object
    my $self = $class->SUPER::new( %params );
    return undef unless ( defined $self );
    bless $self, $class;
    # inits
    $self->{'Config'} = $config;
    return $self;
} # end new();


##### Internals #####

sub _deserialize_data {
    my ( $self, $key, $value ) = @_;
    my ( $length, $count, $id, $weight, $anchor, $location );
    # reset record
    my $record = new HtDig::DBRecord::Word( 'Key' => $key );
    $record->word( $key );
    # extracted from wordfreq.pl
    $length = length( $value ) / 20;
    $record->{'TOTAL'} = 0;
    $record->{'NDOCS'} = 0;
    $record->{'DOCS'}  = [];
    my $i = 0;
    foreach $i ( 0 .. $length - 1 ) {
	($count, $id, $weight, $anchor, $location ) =
	    unpack("i i i i i", substr( $value, $i * 20, 20 ));
        #print "$key - $i $count, $id, $weight, $anchor, $location\n";
        $record->{'TOTAL'} += $count;
        $record->{'NDOCS'}  = $i+1;
        my %doc;
        $doc{'ID'}       = $id;
        $doc{'WEIGHT'}   = $weight;
        $doc{'ANCHOR'}   = $anchor;
        $doc{'LOCATION'} = $location;
        $record->{'DOCS'}[$i] = \%doc;
    }
    # Mark record as valid and exit
    $record->validity( 1 );
    return $record;
} # end arse_value();

## end of HtDig::DB::Word package



# ------------------------------------------------------------------------
# HtDig::DB::DocIndex: Interface to the doc_index database
# ------------------------------------------------------------------------

package HtDig::DB::DocIndex;
# Contain HtDig::DBRecord::DocIndex records
use vars qw(@ISA);
@HtDig::DB::DocIndex::ISA = qw(HtDig::DB);

##### Exported #####

### Sub new() ###
# new( );
###
sub new {
    my $class      = shift;
    # get parameters
    my %params     = @_;
    my $config     = $params{'Config'};
    if ( !defined $config ) {
        &HtDig::dbg( undef, 0, "DB::DocIndex: Parameter Config is mandatory" );
        return undef;
    }
    my $file = $params{'File'} || $config->{'doc_index'};
    $params{'File'} = $file;
    # create the self object
    my $self = $class->SUPER::new( %params );
    return undef unless ( defined $self );
    bless $self, $class;
    # inits
    $self->{'Config'} = $config;
    return $self;
} # end new();


##### Internals #####

sub _deserialize_data {
    my ( $self, $key, $value ) = @_;
    # reset record
    my $record = new HtDig::DBRecord::DocIndex( 'Key' => $key );
    # extract fields
    $record->id( $key );
    $record->url( $value );
    # Mark record as valid and exit
    $record->validity( 1 );
    return $record;
} # end arse_value();

## end of HtDig::DB::DocIndex package

1;
# HtDig.pm ends here  ------------------------------------------------
