#!/usr/bin/perl

#    PubCrawler
#
#    see http://acer.gen.tcd.ie/pubcrawler
#    or POD-text at end of file
#
my $version_number=0.95;  # 25 February 1999
#
#    Ken Wolfe, Karsten Hokamp
#    Department of Genetics, Trinity College Dublin
#
#    pubcrawler@acer.gen.tcd.ie

# LOCATION OF MODULES:
# In case you have Perl modules installed 
# in one of your own directories,
# edit the following line by adding any paths
# in which PERL should look for modules...
BEGIN {
    unshift(@INC, "$ENV{HOME}/lib", "$ENV{HOME}/PubCrawler/lib", 
	          '/path/to/module',);
      # there are three examplary paths provided:
      # - the first one ("$ENV{HOME}/lib") points to the directory
      #   'lib' below your HOME-directory (~/lib or $HOME/lib);
      #   this works only if the environmental variable $HOME is set;
      # - the second one follows the same syntax as the first
      # - the third path ('/path/to/module') is an example for any
      #   kind of UNIX-path starting from the root ('/')

      # ADD /current_working_directory/lib to @INC
      # (activated with option -add_path)
    if (grep /\Q-add_path/, @ARGV) {
	$tmp_file = "/tmp/pubcrawler_pwd.$$";
	system "pwd > $tmp_file";
	open (IN, "$tmp_file");
	chomp($cwd = <IN>);
	close IN;
	unlink $tmp_file;
	push @INC, "$cwd/lib";
	print STDERR "\nAdded path $cwd/lib to \@INC\n";
    }

      # LIBRARY TEST:
      # (activated with option -lib_test)
    if (grep /\Q-lib_test/, @ARGV) {
	print STDERR "\n          ***** PubCrawler - library test *****\n";
	print STDERR "\nThe following directories will be searched for modules:\n\n";
	foreach (@INC) {
	    print STDERR "$_\n";}
	exit;
    }
}				

#### STANDARD MODULES ####
use Getopt::Long;    # to read in command line options
use File::Basename;  # to parse file- and path-names
use File::Copy;      # to move files
use Cwd;             # to get the current working directory
use strict 'subs';   # to look for barewords

#### ADDITIONAL MODULES ####
use LWP::Simple;     # to retrieve proxy autoconfig-file                 
use HTML::Parser;    # to parse HTML-expressions              
use LWP::UserAgent;  # for advanced internet connections

###############################################################
####################  PROGRAM VARIABLES #######################
###############################################################

$| = 1;  # print to STDOUT immediately

$warn_stat = $^W;  # store status of warning-switch

$cwd = cwd;        # get current working directory

# I am just guessing here, that the following operating systems
# will work alright with the 'system'-variable set to 'unix'
@unix_flav = qw( aix dec_os dec_osf dynix epix esix freebsd genix hpux 
		irix isc linux lynxos machten mips mpc mpeix 
		netbsd nwsos next openbsd powerux qnx sco solaris 
		stellar sunos svr4 ti1500 titanos ultrix umips 
		unicos unisys unix utek uts );
$unix_flav = join '|', @unix_flav;

%db_match = ('pubmed', 'm', 'genbank', 'n');  # matches database key
                                              # to query-option 
%word = ('pubmed', 'PubMed', 'genbank', 'GenBank'); #matches database key
                                                    #to the real word
$known_searchtypes = '(pubmed|genbank)';   # all databases that we allow

# list of variables for which the program expects values:
@expect_val = qw( 
		 fullmax 
		 getmax 
		 html_file 
		 include_config 
		 relpubdate 
		 viewdays 
		 search_URL 
		 );

@allowed_var = qw(
		  fullmax 
		  getmax 
		  html_file 
		  include_config 
		  relpubdate 
		  viewdays 
		  search_URL 
		  work_dir
		  check
		  prompt
		  verbose
	          mute
	          log_file
		  lynx
		  header
		  prefix
		  system
		  proxy
		  proxy_port
		  proxy_auth
		  proxy_pass
		  time_out
		  test_URL
		  no_test		  
);


$EXIT_SUCCESS = 0; # return value for successful exit
$EXIT_FAILURE = 1; # return value for unsuccessful exit

# parse program name
($prog_name,undef,$suffix) = fileparse("$0",'\..*');
$program = $prog_name.$suffix;

# help-message:
$USAGE = "
          ***** PubCrawler - help message *****

usage: $program [-add_path -c <config_file>] [-check] [-d <directory>]   
       [-db <database>] [-fullmax <max-docs in full>]   
       [-getmax <max-docs to get>] [-h] [-help] [-head <output-header>]    
       [-i] [-l <log_file>] [-lynx <alternative-browser>] [-mute] [-no_test]   
       [-os <operating_system>] [-out <output-file>] [-p <proxy_server>]     
       [-pp <proxy_port>] [-pauth <proxy_authorization>]    
       [-ppass <proxy_password>] [-pre <prefix>] [-q <query_URL>]  
       [-relpubdate <relative-publication-date>] [-s <search-term]
       [-t <timeout>] [-u <test_URL>] [-v <verbose>] [-viewdays <view-days>]

options:
-add_path adds the path /cwd/lib to \@INC (list of library directories)
          where cwd stands for the current working directory
-c       configuration file for pubcrawler
-check   checks if program and additional files are setup correctly
-d       pubcrawler directory (configuration,databases,and output)
-db      name of database file
-fullmax maximum number of full length reports shown (per search)
-getmax  maximum number of documents to retrieve (per search)
-h       this help message
-head    HTML-header for output file
-help    same as -h
-i       include configuration file in HTML-output
-l       name of file for log-information
-lynx    command for alternative browser
-mute    suppresses messages to STDERR
-no_test skips the proxy-test
-os      operating system (some badly configured versions of Perl need  
         this to be set explicitly -> 'MacOS', 'Win', and 'Unix')
-out     name of file for HTML-output
-p       proxy
-pp      proxy port
-pauth   proxy authorization (user name)
-ppass   proxy password
-pre     prefix used for default file names (config-file,database,log)
-q       query URL
-relpubdate maximum age (relative date of publication in days) 
         of a document to be retrieved
         other valid entries: '1 year','2 years','5 years','10 years','no limit'
-s       search-term ('database#alias#query#')
-t       timeout (in seconds, defaults to 180)
-u       test-URL (to test proxy configuration)
-v       verbose output
-viewdays number of days each document will be shown

for more information see POD-text at the end of the script
or http://acer.gen.tcd.ie/pubcrawler

    ***** This was the PubCrawler - help message *****

";

#arrays and list-variables:
%age = ();         # age of each uid in days
%aliases = ();     # queries ordered by alias
%db = ();          # entries from database file
%hits = ();        # number of hits for each alias
@query_order = (); # order in which queries are specified
@alias_order = (); # order in which aliases are specified

# more variables:
$tool = "PubCrawler$version_number";    # make sure all blanks are substituted with '+'
$tool =~ s/\s+/\+/g;
$base_URL = '';          # base URL of output file
$break = '20';           # seconds of sleep between each request
$check = '';             # starts check-mode if set
$cmd_db = '';            # database for command-line query
$config_dir = '';        # directory in which the config-
                         # file resides
$config_file = '';       # location of the configuration file
$database = '';          # name of database-file
$dateline = '';          # headline of ouput with date
$db = '';                # temporary holder of database key
$fullmax = '';           # maximum number of documents to be
                         # shown as full reports
$getmax = '';            # maximum number of uids to be retrieved
$header = '';            # name of header file
$help = '';              # help-output if set
$hits = '';              # number of hits per search
$html_file = '';         # name of output file
$include_config = '';    # appends config to output if set
$joiner = '';            # OS-specific path-joiner
$log_file = '';          # name of log-file
$lynx = '';              # location of lynx program
$mute = '';              # suppresses printing to STDERR
$no_test = '';           # no proxy-test if set
$old_dir = '';           # temporary holder of directory name
$orig_system = '';       # original name given for OS
$prefix = '';            # prefix used for naming of default files
$prompt = '';            # prompts Mac-users for command line
                         # options if set
$prompted = '';          # set if Mac-users have been prompted
$proxy = '';             # proxy server
$proxy_port = '';        # port of proxy server
$proxy_auth = '';        # username for proxy authorization
$proxy_pass = '';        # password for proxy authorization
$proxy_string = '';      # readily configured proxy
$cmd_query = '';         # query entered from the command-line
$cmd_alias = '';         # alias for query entered from the command-line
$query_string = '';      # temporary holder of query string
$relpubdate = '';        # maximum relative date for documents
                         # to be retrieved
$result = '';            # result from each request
$result_collection = ''; # output text
$search_URL = '';        # URL to which search request is directed
$summary = '';           # indexed links to query results
                         # (at the beginning of output-file)
$system = '';            # operating system
$test_URL = '';          # URL used for testing internet connections
$timestamp = '';         # holds time of creation of output
$time_out = '';          # time in seconds that we'll wait
                         # before shutting down internet connections
$trailer = '';           # appendix to output
$viewdays = '';          # number of days documents will be shown
$verbose = '';           # writes log messages to STDOUT if set to 1
                         #writes log messages to log-file if set to 0
$work_dir = '';          # working directory

$error = 0;    #needed for -check option
@error = ();   #needed for -check option
$warning = 0;  #needed for -check option
@warning = (); #needed for -check option

# ask for command line options if running under MacOS
#if( $MacPerl::Version =~ /Application$/ and $prompt eq '1') {
    # we're running from the app
if (($system =~ /macos/i or $^O =~ /macos/i) and $prompt eq '1') {
    # we're running Macintosh and want to be prompted
    my( $cmdLine, @args );
    $cmdLine = &MacPerl::Ask( "Enter command line options (-h for help):" );
    require "shellwords.pl";
    @args = &shellwords( $cmdLine );
    unshift( @ARGV, @args );
    # make clear that we have been explicitely prompted:
    $prompted = '1';
}

#### fetch command line options ####
GetOptions('add_path', undef,  # this option is dealt with in BEGIN{}
	   'h', \$help,
	   'help', \$help,
	   'head=s',\$header,
	   'c=s', \$config_file,
	   'check', \$check,
	   'd=s', \$work_dir,
	   'db=s', \$database,
	   'fullmax=s', \$fullmax,
	   'getmax=i', \$getmax,
	   'i', \$include_config,
	   'l=s', \$log_file,
	   'lynx=s', \$lynx,
	   'mute', \$mute,
	   'no_test', \$no_test,
	   'os=s', \$system,
	   'out=s', \$html_file,
	   'p=s', \$proxy,
	   'pp=i', \$proxy_port,
	   'pauth=s', \$proxy_auth,
	   'ppass=s', \$proxy_pass,
	   'pre=s', \$prefix,
	   'relpubdate=s', \$relpubdate,
	   'rpd=i', \$relpubdate,
	   'q=s', \$search_URL,
	   's=s', \$cmd_query,
	   't=i', \$time_out,
	   'u=s', \$test_URL,
	   'v', \$verbose,
	   'verbose', \$verbose,
	   'viewdays=i', \$viewdays
	   );


if ($help) {
    print STDERR "$USAGE";
    exit($EXIT_SUCCESS);
}


# resolve '.' (current working directory)
$work_dir = $cwd if ($work_dir eq '.');

# default prefix is program name up to last dot
$prefix = $prog_name unless ($prefix);

# default configuration file consists of prefix + '.config'
$config_file = "$prefix.config" unless ($config_file);

$config_read = '';  # indicates if configuration file 
                    # has been read

# READ IN CONFIGURATION FILE 
# first try any specified working directory
if ($work_dir) {
    $old_dir = $cwd;
    chdir $work_dir;
    if (-r $config_file) {
	&read_config;
    } else {
	chdir $old_dir;
    }
}

unless ($config_read) {
        # try the home-directory
    if ($ENV{'HOME'}) {
	$old_dir = $cwd;
	chdir $ENV{'HOME'};
	if (-r $config_file) {
	    &read_config;
	} else {
            # read from current working directory
	    chdir $old_dir;
	    &read_config;
	}
    } elsif (-r $config_file) {
	# try the current working directory    
	&read_config;
    } else {
	# configuration file cannot be read
	# die unless all mandatory variables are set
	if (&empty_vars(@expect_val)) {
	    die "$prog_name ERROR: Can not read configuration file \'$config_file\'" unless ($check);
	} else {
	    warn "$prog_name WARNING: Can not read configuration file \'$config_file\'" unless ($check);
	}
    }
}

print STDERR "\nStarting PubCrawler...\n" unless ($mute);

# all mandatory variables are set now
# searches are stored in hash %query
# hash %aliases contains searches ordered by alias


# ask Mac-users for command line options again 
# (only in case the configuration file 
# that was just read in demands so...)
# -> gives a chance to overwrite or set new values
#    for selected variables
if (($system =~ /macos/i or $^O =~ /macos/i) and $prompt eq '1' and $prompted ne '1') {
    # we're running Macintosh and want to be prompted
    my( $cmdLine, @args );
    $cmdLine = &MacPerl::Ask( "Enter command line options (-h for help):" );
    require "shellwords.pl";
    @args = &shellwords( $cmdLine );
    unshift( @ARGV, @args );


    #### fetch command line options again####
    GetOptions('h', \$help,
	       'help', \$help,
	       'head=s',\$header,
	       'c=s', \$config_file,
	       'check', \$check,
	       'd=s', \$work_dir,
	       'db=s', \$database,
	       'fullmax=s', \$fullmax,
	       'getmax=i', \$getmax,
	       'i', \$include_config,
	       'l=s', \$log_file,
	       'lynx=s', \$lynx,
	       'mute', \$mute,
	       'no_test', \$no_test,
	       'os=s', \$system,
	       'out=s', \$html_file,
	       'p=s', \$proxy,
	       'pp=i', \$proxy_port,
	       'pauth=s', \$proxy_auth,
	       'ppass=s', \$proxy_pass,
	       'pre=s', \$prefix,
	       'relpubdate=s', \$relpubdate,
	       'rpd=i', \$relpubdate,
	       'q=s', \$search_URL,
	       's=s', \$cmd_query,
	       't=i', \$time_out,
	       'u=s', \$test_URL,
	       'v', \$verbose,
	       'verbose', \$verbose,
	       'viewdays=i', \$viewdays
	       );

    if ($help) {
	print STDERR "$USAGE";
	exit($EXIT_SUCCESS);
    }
    
    # resolve '.' (current working directory)
    $work_dir = cwd if ($work_dir eq '.');
    
    # default prefix is program name up to last dot
    $prefix = $prog_name unless ($prefix);
}

# use cwd as working directory
# if $work_dir has no value
unless ($work_dir) {
    $work_dir=$cwd;
    printf STDERR "\n\n** $prog_name: using $work_dir as working directory! **\n\n" unless ($check or $mute);
}

# SET SYSTEM DEPENDENCIES
$system = $^O unless ($system);
$orig_system = $system;
$system = 'macos' if ($system =~ /^mac|macintosh$/i);
$system = 'unix' if (grep /$system/i, @unix_flav);
#(this program doesn't distinguish between the 
# different UNIX-flavours) 

# configure system-dependent joiner for path names
if ($system =~ /macos/i) {
    $joiner = ':';
}elsif ($system =~ /unix/i){
    $joiner = '/';
}elsif ($system =~ /win/i) {
    $joiner = '\\';
}

$work_dir .= $joiner unless ($work_dir =~ /$joiner$/);

# special treatment of variable 'relpubdate'
if ($relpubdate =~ /no limit/i) {
    $relpubdate = 100000;
    $entrez_relpubdate = 'no+limit';
} elsif ($relpubdate =~ /^\s*(\d+)\s*(\D+)/) {
    $relpubdate = $1;
    $date_type = $2;
    if ($date_type =~ /^y/i) {
	$entrez_relpubdate = $relpubdate.'+years';
	$relpubdate *= 365;
    } else {
	$entrez_relpubdate = $relpubdate.'+days';
    }
} else {
    $entrez_relpubdate = $relpubdate.'+days';
}

##################################
#### END OF VARIABLE SETTING #####
##################################

&check_setting if ($check);

#### check that all mandatory fields have values ####
####        and write them to the log file       ####
if (@_ = &empty_vars(@expect_val,'system')) {
    print STDERR "$prog_name ERROR: no value set for the following variable(s):\n";
    foreach (@_) {
	print STDERR "\t$_\n";
    }
    print STDERR "\nPlease check your configuration file or use command line options!\n";
    exit($EXIT_FAILURE);
}

# try to change to working directory
chdir "$work_dir" or 
    die "$prog_name ERROR: Can not change to working directory \'$work_dir\'";

#### open log file ####
unless ($verbose) {        
    $log_file = "${prefix}_log.html" unless ($log_file);
    open (LOGFILE,">$log_file") ||
	die "$prog_name ERROR:cannot open log file ($log_file):$!";
    select (LOGFILE);
    print "<HTML><title>PubCrawler log file</title>
          <h2>PubCrawler logfile</h2><pre>";
}

print "config file is $config_file in $work_dir\n\n";


# configure and test proxy settings
# if value for proxy is specified
&proxy_setting if ($proxy);

# overwrite queries from config-file
# if query was given on the command-line:
if ($cmd_query) {
    ($cmd_db, $cmd_alias, $cmd_query) = split /#/, $cmd_query;
    $cmd_query = $cmd_alias unless ($cmd_query);
    @query_order = ($cmd_query);
    @alias_order = ($cmd_alias);
    %aliases = ();
    %query = ();
    push @{ $aliases{$cmd_alias} }, $cmd_query;
    $query{$cmd_query}{'ALIAS'} = $cmd_alias;	    
    $query{$cmd_query}{'DB'} = $cmd_db;
}

# list all searches for log:
print "\n";
foreach (@allowed_var) {print " $_ : ${$_}\n";}
print "\n searches:\n";
foreach (@query_order) { print "\t$_ at $query{$_}{'DB'}\n";}
print "\n changing dir to $work_dir \n\n";


#### prepare output HTML file with header and trailer ####
if (-e $html_file) {unlink $html_file;}

#begin writing to output file:
open (OUT,">$html_file") ||
    die "$prog_name ERROR: could not open output HTML file $html_file\n";

#header and trailer: copy header info if a header file exists:
$header = "$prefix.header" unless ($header);
if (-e $header) {
    open(HEADER,"$header") || 
	die "$prog_name ERROR: $header exists but cannot be opened.\n";
    while (<HEADER>) {print OUT $_;}
    close HEADER;
    print "\nwriting header data from $header to $html_file\n";
}else{
    print OUT "<HTML><HEAD><TITLE>$orig_system PubCrawler</TITLE></HEAD><BODY BGCOLOR=\"#f0f0f0\">\n";
    print "\nno header file; writing default header to $html_file\n";
}
($dateline, $timestamp) = &timestamp;
$trailer = &trailer; #make trailer (bottom of HTML output page) 
                     #with config file appended if requested

print OUT "$dateline\n";


######################################
#########  read in database  #########
######################################
&read_db;


##################################################
#### make first visit to NCBI to get all UIDs ####
##################################################
foreach $query (@query_order) {

    %age = (); # values of age will be filled in sub first_visit
    $db = $db_match{$query{$query}{'DB'}};

    # first visit to NCBI for each query to get UIDs
    ($hits,
     $err_msg,
     @uid_list) = &first_visit($db,$query);

        # space requests at $break seconds interval...
    unless ($query eq $query_order[-1]) {
        print "sleeping for $break seconds...\n";
        sleep($break);
    }
    
    # add results to hash %query
    $query{$query}{'HITS'} = $hits;
    $query{$query}{'ERR'} = $err_msg;
    @{ $query{$query}{'UIDS'} } = @uid_list;
}


######## create base URL for internal links ########
$base_URL = $work_dir.($work_dir =~ /$joiner$/ ? '':"$joiner").$html_file;
if ($joiner eq ':') {
    $base_URL =~ s#\:#\/#g;
} elsif ($joiner eq '\\') {
    $base_URL =~ s#\\#\/#g;
}
$base_URL = 'file:'.($base_URL =~ /^\//?'':'/').$base_URL;


###################################################
############ make second visit to NCBI ############
##### to get full reports of interesting UIDs #####
###################################################
$warn_text = "<BR>\n-&gt;&nbsp;Increase value of <I>getmax</I> (now $getmax) via command-line option or in configuration file for more results.<BR>\n";    

foreach $alias (@alias_order) {
    $getmax_warning = '';
    @uid_list = ();
    $summary = '<UL>';
    $db = '';
    $word = '';
    $query = '';

    # combine retrieved uids from each query
    # that has the same alias:
    foreach $query ( @{ $aliases{$alias} } ) {
	($query_string) = split /#/, $query;
	if ($query{$query}{'ERR'}) {
	    $summary .= "<LI>No hits for $query_string\n<BR><B>Message from NCBI:</B> ".$query{$query}{'ERR'}."</LI>\n";
	    $word = $word{$query{$query}{'DB'}} unless ($word);
	} else {
	    if ($query{$query}{'HITS'} >= $getmax) {
		$getmax_warning .= "<BR><B>Warning:</B> retrieved max number of docs for this query!\n";
	        $summary .= "<LI>$query{$query}{'HITS'} hit".($query{$query}{'HITS'}==1?'':'s')." after <B>first</B> visit for $query_string$getmax_warning$warn_text</LI>\n";
	    } else {
	        $summary .= "<LI>$query{$query}{'HITS'} hit".($query{$query}{'HITS'}==1?'':'s')." after <B>first</B> visit for $query_string</LI>\n";
	    }
	    push @uid_list, @{ $query{$query}{'UIDS'} };
	    $db = $db_match{$query{$query}{'DB'}} unless ($db);
	    $word = $word{$query{$query}{'DB'}} unless ($word);
	}
    }
    if ($getmax_warning) {
	$getmax_warning .= $warn_text;
    }
    $summary .= "</UL>\n";
#    $result = "<HR><H1><A NAME=\"$alias\">Results for \'$alias\' at $word</A></H1>\n$summary<BR>\n";
    $result = "<HR><H1><A NAME=\"$alias\">Results for \'$alias\' at $word</A></H1>\n";

    if (@uid_list) {	
	   # extract all suitable uids
	@uid_list = &list_crunch($alias,@uid_list);
	   # @uid_list could be empty now
	   # (if all entries are older than viewdays + 3)
	   # but &second_visit will deal with this...
	print "\n=====\nmaking second HTTP connection to retrieve complete records.\n";
	   # retrieve full records and
           # space requests at $break seconds interval...
        print "sleeping for $break seconds...\n";
        sleep($break);
	$result_tmp = &second_visit($alias,$db,$word,@uid_list);
	if ($hits{$alias} > 0) {
	        # print out a summary of results from first visit
	        # only if some first-day-hits were retrieved
	    if ($result_tmp =~ m#(<b>No new records for .* today</b><br>)#) {
		$result_tmp =~ s#(<b>No new records for .* today</b><br>)#$1$getmax_warning#;
	    }
	    $result .= "${summary}<BR>\n$result_tmp";
	} else {
	    if ($result_tmp =~ m#(<b>No new records for .* today</b><br>)#) {
		$result_tmp =~ s#(<b>No new records for .* today</b><br>)#$1$getmax_warning#;
		$result .= "$result_tmp";
	    } else {
		$result .= $getmax_warning.$result_tmp;
	    }
	}
    } else {
	if ($getmax_warning) {
	    $result .= "No new records retrieved for \'$alias\'.$getmax_warning";
	} elsif ($query{@{ $aliases{$alias} }[0]}{'ERR'} =~ /No Documents Found/) {
	    $entrez_date = $entrez_relpubdate;
	    $entrez_date =~ s/\+/ /g;
	    $result .= "No documents found that were published in the last <B>$entrez_date</B>.<BR>";
	} else {
	    $result .= "No new records retrieved for \'$alias\'. ".'<B>Possible network failure.</B>';
	}
	$result .= "Database for this entry not updated.<BR>\n";
    }
    ### write search results and error messages to output file:
    $result_collection .= "$result<BR>\n<A HREF=\"$base_URL#TOP\">Back to top</A>\n";
	# base URL is not suitable for access of result via WWW
#    $result_collection .= "$result<BR>\n<A HREF=\"#TOP\">Back to top</A>\n";
}

# print index:
print OUT "<H1>Index of PubCrawler results:</H1>\n<UL>";
$^W = 0; # disable warning
#( I know that 'no' isn't numeric)
foreach $alias (@alias_order) {
    $hits = $hits{$alias};
    $hits = 'no' if ($hits == 0);
    if ($hits == 1) {
	$hits .= " new hit";
    } else {
	$hits .= " new hits";
    }
#    print OUT "<LI><A HREF=\"$base_URL#$alias\">$alias</A>: $hits today</LI>\n";
	# base URL is not suitable for access of result via WWW
    print OUT "<LI><A HREF=\"#$alias\">$alias</A>: $hits today</LI>\n";
}
$^W = $warn_stat;  # set back warning status
print OUT "</UL>\n";

	#base href info is provided by machine jupiter 
        #but not machine callisto:
print OUT "\n".'<BASE HREF="http://www.ncbi.nlm.nih.gov">';  


#print rest of document:
print OUT $result_collection.$trailer;

# set permissions for output file
if ($system =~ /macos/i) {
  MacPerl::SetFileInfo('MOSS','TEXT',"$log_file","$html_file");
}

if ($system =~ /unix/i){
    # make it readable (in case umask is messed up)
    chmod (0644,"$html_file");
}

######################################
##########  save database  ###########
######################################
&save_db;

print "Finished.\n";
print STDERR "Finished.\n" unless ($verbose or $mute);

close OUT;
close LOGFILE;

exit($EXIT_SUCCESS);

##########################################################################
################################# END OF MAIN ############################
##########################################################################


##########################################################################
################################# SUBROUTINES ############################
##########################################################################

#  FIRST_VISIT
sub first_visit{ 
#first visit to NCBI to get list of UIDs matching query
# called from MAIN
# (this sub is called once for each query)
# requires $getmax(global), $relpubdate(global), database, query-string
#initialise:
    local $search_type = shift; # database for query
    local $query = shift;       # query-string
    local @uids=();  # list that holds the uids
    local @tmp = (); # temporary array
    local $query_error_msg = '';
    local $uid_result = '';
    local $docstring = '';  
    local $hits = '';
    
    #remove comments from query
    ($query) = split /#/, $query;
    # assemble docstring:
    $docstring = join '&', ("db=$search_type",
			    'form=4',
			    'Dopt=u',
			    'title=no',
			    'dispmax='.$getmax,
			    "relentrezdate=$entrez_relpubdate",
			    "tool=$tool",
			    "term=$query");
    
    print "\n===\nmaking HTTP connection to: $search_URL\n" ;
    print "with this query: $docstring\n";
    
    # get the results of the HTTP-request
    # requires: $search_URL(global), $docstring   
    # produces: $uid_result
    $uid_result = &make_HTTP($docstring);  
    
    # remove HTML tags from retrieved list of UIDs 
    # and split it on white space
    $uid_result =~ s/<.*?>//g;
    print "result of query is: $uid_result";
    
    #check for text (error message) instead of UID numbers in result:    
    if ($uid_result =~ /[a-zA-Z]/ or $uid_result !~ /\d/){
	$hits = 0;
	$query_error_msg = "$uid_result from search: ".$query."<br>";
	print "got TEXT instead of UIDs from the query.\n\n";
    } else {
	@tmp=split(/\s+/,$uid_result);   # @tmp is the list of 
	                          # matching UIDs for this query
	foreach (@tmp) {
	    # drop everything that doesn't contain a digit
	    if (/\d/) {push @uids, $_;}
	}
	$hits = $#uids + 1; 
	print "UIDs from this query are: @uids \n\n";
    }
    return($hits,$query_error_msg,@uids);
}#return from sub first_visit
    
    
    
##########################################################################
#  LIST_CRUNCH
#subroutine to compare the UIDs (returned from a first search) 
#to the databases and retain only the new-ish ones
#called from MAIN
sub list_crunch{
    local $alias = shift;
    local @uid_list = @_;
    local @list_shrunk=();
    local $prev;
    local $lifespan;
    local $forget;
    local $now;
    local @goget=();
    
#strip extra characters from UIDlist
    foreach (@uid_list) { s/\s//g; s/\n//g; }
    
#cumbersome way to remove duplicates; 
#list_shrunk has no duplicate entries
    $prev="";
    foreach (reverse sort(@uid_list)) {
	unless ($_ eq $prev) {push(@list_shrunk,$_);}
	$prev=$_;
    }
    @uid_list=@list_shrunk;
    undef @list_shrunk;
    
# Log file contains UIDs and the time (Unix, Win, or Mac time stamp) 
# they were first seen.
# Can forget old log entries when they are older than $relpubdate.
# The factor +3 is to guard against any time/date differences
# as compared to NCBI's RELPUBDATE clock
# (need to keep entries in the database for a little longer 
# than RELPUBDATE days)
    
    $lifespan=$viewdays +3;  #maximum age (in days) for a record 
                                      #to be displayed
    $forget=$relpubdate +3;  #age above which a record can be 
                                      #deleted from the database
#    $now=time/86400;      #unix and Macintosh time baselines are different 
                          #but both work in seconds
    $now = sprintf "%.2f",(time/86400); # round value ...
    print "now: $now; max age allowed (lifespan): $lifespan; ";
    print "age to forget: $forget \n";
    
# compare the UIDs in the search-results list to those in the database
# go-get them / ignore them / delete from database / 
# add to database as appropriate
# note: time_first_seen{} is local to this subroutine; age{} is global
    foreach $uid (@uid_list){
	next unless ($uid =~ /\d/);
	unless (${ $db{$alias} }{$uid}) { ${ $db{$alias} }{$uid} = $now; }
	$age{$uid} = $now - ${ $db{$alias} }{$uid};
	print "uid:$uid time_first_seen:${ $db{$alias} }{$uid} age:$age{$uid} ";
	if ($age{$uid} <= $lifespan) {
	    push(@goget,$uid);  #goget is the list of UIDs to get 
	                        #on the 2nd visits
	    print "\n";
        }else{
	    print "is too old.\n";
        }
	if ($age{$uid} > $forget) {
	    print "forgetting it.\n";
	    delete ${ $db{$alias} }{$uid}; #avoid huge database files
	}
    }

    $^W = 0; # disable warning
    @goget = sort by_time_first_seen_and_uid @goget;
    $^W = $warn_stat;  # set back warning status
    
    return(@goget);
} ## return from sub list_crunch



################################################################################
# SECOND_VISIT
# subroutine to make second visit to PubMed or Genbank
# requires: $alias $search_type database-name @query_list $fullmax(global) 
#           $viewdays(global) %age (from &list_crunch) $search_URL(global
# returns: $second_visit_result
sub second_visit {
	    
    local $alias = shift;
    local $search_type = shift;
    local $word = shift;
    local @query_list = @_;
    local $docstring;
    local $number_of_1day_records;
    local @query_list_1day;
    local $day;
    local $uidstring;
    local @query_big;
    local @query_tmp;
    local $counter;
    local $second_visit_result = '';
    
#retrieve UIDs in $query_list in single-day age groups, 
#between age 0 and age $viewdays:
    for ($day=0; $day <= $viewdays; $day++){
        $button = '';
	@query_big = ();
	$counter = $fullmax + 1;
	foreach (@query_list){
	    if ($day == int(($age{$_} + 0.5))) { #0.5 term is to prevent age of
		                               #0.99 days being rounded down,etc
		push(@query_big,$_);
	    }
	}
	# shorten query list if too long...
	@query_list_1day = splice(@query_big, 0, $fullmax);

	$uidstring=join(',', @query_list_1day);
	
	# assemble docstring:
	$docstring = join '&', ("db=$search_type",
				'form=6',
				'Dopt=d',
				'title=no',
				'dispmax='.$fullmax,
				"tool=$tool",
				"uidstring=$uidstring");

	print "$day days old:  $uidstring\n";

	if ($day == 0){
	    #zero-day-old records: retrieve them
	    $hits{$alias} = $#query_big + $#query_list_1day + 2;
	    unless ($#query_list_1day == -1){
		# get the results of the HTTP-request
		# requires: $search_URL(global) $docstring   
		# produces: $uid_result
		$second_visit_result .= "<h2>Today\'s new results ($hits{$alias} citation".($hits{$alias} > 1 ? 's' : '')." in total):</h2>\n";
		print "retrieving full reports from NCBI...\n";
		$second_visit_result .= &make_HTTP($docstring); 
	    }else{
		$second_visit_result .= "<b>No new records for \'$alias\' today</b><br>\n";
	    }
	}else{
	    #older records: make a hypertext link to them, if they exist
	    unless ($#query_list_1day == -1){
		$number_of_1day_records = $#query_list_1day + 1;
		$button="<br>MORE: $day-day-old records for \'$alias\'  <A HREF=\"$search_URL?$docstring\">(".($#query_big >= 0?"1 - ":"")."$number_of_1day_records)</A> ";
		$button =~ s/ /\&nbsp;/g;
		$button =~ s/<A\&nbsp;HREF/<A HREF/g;
		$second_visit_result .= $button;
	    }
	}
	# create buttons for excessive entries
	while (@query_big) {
	    @query_tmp = splice(@query_big, 0, $fullmax);
	    $uidstring=join(',', @query_tmp);
	    $docstring = join '&', ("db=$search_type",
				    'form=6',
				    'Dopt=d',
				    'title=no',
				    'dispmax='.$fullmax,
				    "tool=$tool",
				    "uidstring=$uidstring");
	    $number_of_1day_records = $counter + $#query_tmp;
	        # create a link for excessive reports...
	    if ($button) {
	        $button = "  <A HREF=\"$search_URL?$docstring\">($counter - $number_of_1day_records)</A>";
	    } else {
	        $button = "<br>MORE: $day-day-old records for \'$alias\'  <A HREF=\"$search_URL?$docstring\">($counter - $number_of_1day_records)</A>";
	    }
	    $button =~ s/ /\&nbsp;/g;
	    $button =~ s/<A\&nbsp;HREF/<A HREF/g;
	    $second_visit_result .= $button;
	    $counter = $number_of_1day_records + 1;
	}	
    }
    return ($second_visit_result);
}#end sub second_visit


################################################################################
# SORT_BY_TIME_FIRST_SEEN_AND_UID
# subroutine to sort UIDs into increasing age order in output:
# sort by (1) recent time_first_seen and then (2) high UID number
sub by_time_first_seen_and_uid {
    if ( ${ $db{$query} }{$a} > ${ $db{$query} }{$b} ) {
        return -1;
    }elsif ( ${ $db{$query} }{$a} < ${ $db{$query} }{$b} ) {
        return 1;
    }elsif ( ${ $db{$query} }{$a} == ${ $db{$query} }{$b} ) {
	if ($a < $b) {
	    return 1;
	}elsif ($a==$b) {
	    0;
	}elsif ($a > $b) {
	    return -1;
	}
    }
}#return from sub by_time_first_seen_and_uid
    

################################################################################
# TRAILER
# subroutine to make the trailer for the HTML file
# with a copy the config file (if include_config was YES in the config file)
# called by MAIN
# requires: $timestamp $include_config $config_file $prog_name 
# returns: $trailer
sub trailer {
    local $trailer="<hr><small>generated by <a
href=\"http://acer.gen.tcd.ie/pubcrawler\">PubCrawler version
$version_number</a> on ";
    $trailer .= $timestamp."</small>\n";
    if ($include_config =~ /^y|1/i ) {
	    # append configuration file:
	$trailer .= "<HR><PRE>PubCrawler configuration file (<B>$config_file</B>) reads:<BR>\n";
	open (CONFIG,"$config_file") ||
	    die "$prog_name ERROR: cannot open configuration file $config_file";
	while (<CONFIG>){
	    $trailer .= "$_<br>";
	}
	close (CONFIG);
    }
    $trailer .= "</BODY></HTML>";
    return ($trailer);
}#return from sub trailer


################################################################################
# TIMESTAMP
#subroutine to make timestamp from localtime function
#called by MAIN
#requires: nothing   returns: $dateline $timestamp  
# macchange: is any of this Mac specific?
sub timestamp{
    local ($sec,$min,$hour,$mday,$mon,$year,$wday)=localtime(time);
    foreach ($hour, $min, $sec){
	if (length($_) == 1) {
	    $_ = "0".$_;
	}
    }
          # match number of day to explicit name
    local $dayname = 
	('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday')[$wday];
          # match number of month to explicit name
    local $monthname =
	('Jan','Feb','Mar','Apr','May','June','July','Aug','Sept','Oct','Nov','Dec')[$mon];
        # change year to four-digit style
    $year+=1900;
        # return dateline and timestamp
    return("<H1><CENTER><A id=\"TOP\">$dayname $mday $monthname $year</A></CENTER></H1>", 
	   "$dayname $mday $monthname $year at $hour:$min:$sec");
}#return from sub timestamp
    
    
################################################################################
# MAKE_HTTP
#subroutine to make HTTP connections
#called by &first_visit and &second_visit
#requires: $search_URL $docstring   produces: $result
sub make_HTTP {
#requires: $search_URL $docstring   produces: $result
#print "http-- search_URL: $search_URL \n";
#print "http-- docstring: $docstring \n";
    local $request_string = shift;
    local $rescontent;

    if ($lynx) {
	# make internet-connection through 
	# alternative command-line browser:
	unless ($system =~ /unix/i) {
	    die "$prog_name ERROR: system call for \'Lynx\' only works for unix";
	} else {
	    $request_string = $search_URL.'?'.$request_string;
	    $rescontent = `$lynx -source \'$request_string\'`;
	    return $rescontent;
	}	
    }
        # initialize user agent
    local $ua = new LWP::UserAgent;
        # set time out
    $ua->timeout($time_out);
        # set proxy
    if ($proxy_string) {
	$ua->proxy(http  => "$proxy_string");
    }

        # initialize request
    local $req = new HTTP::Request 'POST',"$search_URL";
    $req->content_type('application/x-www-form-urlencoded');
    $req->proxy_authorization_basic("$proxy_auth", "$proxy_pass") 
	if ($proxy_pass and $proxy_auth);
    $req->content($request_string);
   
        # get result of request
    local $res = $ua->request($req);
    if ($res->is_success) {
	#print "\n res content \n";
	#print $res->content;
    } else {

	print "No reply to HTTP-request - bad luck this time\n";
    }

    $rescontent = $res->content;
    #macchange CRLF
    $rescontent =~ s/[\015\012]/\015/g if ($system =~ /macos/i);

    return($rescontent);
}#end of sub make_HTTP
    
#############################################################
    
sub connection_test{
	# tests if test_URL can be reached through proxy...
    local $proxy = shift;
    local $url = shift;
    local $ua = new LWP::UserAgent;
    
    $ua->timeout($time_out);
    $ua->proxy(http  => "$proxy") unless ($proxy eq 'no_proxy');
    
    local $req = new HTTP::Request 'GET',"$url";
    
    $req->proxy_authorization_basic("$proxy_auth", "$proxy_pass") 
	if ($proxy_pass and $proxy_auth and ($proxy ne 'no_proxy'));
    local $res = $ua->request($req);
    if ($res->is_success) {
	return 1;
    } else {
	return 0;
    }
}
sub proxy_setting{    
	# configure proxy and check if internet access provided...
    local $check_mode = shift;
    local @proxy_tried = ();  #list of unsuccessful proxy configs
    local $proxy_config = ''; #address for proxy auto configuration
    local @proxy_config = (); #return from pac-address

    return if ($lynx);

    unless ($no_test) {
	if ($check_mode) {
	    print STDERR "\n\t - checking internet access through proxy...";
	} else {
	    print STDERR "\nTesting internet access through proxy...\n" unless ($mute);
	}
    }
    # if the address of a proxy-configuration file is given
    # retrieve content to configure proxy settings...
    # (this is detected by a slash somewhere BEFORE the end of the string)
    if ($proxy =~ /\/\w*/) {
	@proxy_tried = ();
	$proxy_conf = ($proxy =~ /^http:\/\//)?'':'http://' ;
	$proxy_conf .= $proxy;
	if ($proxy_port) {
	    $proxy_conf .= ($proxy_port =~ /:(\d+)/)?$proxy_port:":$1";
	}
            # retrieve configuration data
	@proxy_config = split /\n/, (get "$proxy_conf"); 
	# for more info on proxy auto configuration (Netscape) look up
	# http://developer.netscape.com/docs/manuals/proxy/adminux/autoconf.htm

	foreach (@proxy_config) {
	    local @tmp;
	    # extract proxy autoconfig information
	    next unless (/PROXY/);
	    (undef,@tmp) = split /PROXY/;
	        # if keyword 'PROXY' found...
	    foreach (@tmp) {
		local $server_test;
		local $port_test;
		    # ... check the next word for a pattern
		    # that looks like server (+port) specification
		if (/((\w+\.)+\w+)(:\d+)?/) {
		    $server_test = $1;
		    $port_test = $3;
		    # test connection to proxy...
		    $proxy_string = 'http://'.$server_test.$port_test.'/';
		    if (&connection_test($proxy_string,$test_URL)
			or $test_URL eq '') {
			last; # found a working proxy -> exiting test
			      # (or can't test it)
		    } else {
			push @proxy_tried, $proxy_string;
			$proxy_string = '';
		    }
		}
		last if ($proxy_string);
	    }
	    last if ($proxy_string);
	}
	return if ($no_test);
	if ($proxy_string) {
	    if ($check_mode) {
		unless ($test_URL) {
		    print STDERR "no test-URL available!\n";
		} else {
		    print STDERR "alright\n";
		}
	    } elsif (! $mute) {
		unless ($test_URL) {
		    print STDERR "No test-URL available!\n";
		    print STDERR "Proxy setting: $proxy_string\n";
		} else {
		    print STDERR "Successfully received the test URL!\n";
		    print STDERR "Internet access through proxy ($proxy_string) seems o.k.\n";
		}
		    print STDERR "Continuing with program...\n";
	    }
	} else {
	    if ($check_mode) {
		$error++;
		print STDERR "\n\t   ERROR $error: Can\'t configure proxy!\n";
		push @error, "Problems encountered when trying to access the test-URL.\n\tPlease check the test-URL:\n\t\'$test_URL\'\n\tand your proxy settings (command line or ".(&line_number($config_file,'^proxy\b')).')'.($proxy?":\n\t\'$proxy\'".($proxy_port?", \'$proxy_port\'":"")." evaluated to \'$proxy_string\'":"");
	    } else {
		print STDERR "\n\nERROR:\n";
		print STDERR "Could not configure proxy from \'$proxy\'\n";
		foreach (@proxy_tried) {
		    print STDERR "Tried proxy $_ without success\n";
		}
		print STDERR "Please check your proxy configuration or the test URL\n($test_URL)\n";
		exit $EXIT_FAILURE;
	    }
	}
    } else {
	# test if given proxy server (and port) is working...
	$proxy_string = 'http://' unless ($proxy =~ /^http:\/\//);
	$proxy_string .= $proxy;
	if ($proxy_port) {
	    $proxy_string .= ($proxy_port =~ /:(\d+)/)?$proxy_port:":$proxy_port";
	}
	$proxy_string .= '/' unless ($proxy_string =~ /\/$/);
	return if ($no_test);
	if ($test_URL eq '') {
	    if ($check_mode) {
		print STDERR "no test-URL available!\n";
	    } elsif (! $mute) {
		print STDERR "No test-URL available!\n";
		print STDERR "Proxy setting: $proxy_string\n";
		print STDERR "Continuing with program...\n";
	    }
	} elsif (&connection_test($proxy_string,$test_URL)) {
	    if ($check_mode) {
		print STDERR "alright\n";
	    } elsif (! $mute) {
		print STDERR "Successfully received the test URL!\n";
		print STDERR "Internet access through proxy ($proxy_string) seems o.k.\n";
		print STDERR "Continuing with program...\n";
	    }
	} else {
	    if ($check_mode) {
		$error++;
		print STDERR "\n\t   ERROR $error: Can\'t reach test-URL!\n";
		push @error, "Problems encountered when trying to access the test-URL.\n\tPlease check the test-URL:\n\t\'$test_URL\'\n\tand your proxy settings (command line or ".(&line_number($config_file,'^proxy\b')).")".($proxy?":\n\t\'$proxy\'".($proxy_port?", \'$proxy_port\'":"")." evaluated to \'$proxy_string\'":"");
	    } else {
		print STDERR "\n\nERROR:\n";
		print STDERR "Problems with proxy ($proxy_string) encountered:\n";
		print STDERR "Please check your proxy entries or the test URL\n($test_URL)\n";
		exit $EXIT_FAILURE;
	    }
	}
    }
}

################################################################################
#  CHECK_SETTING
sub check_setting {
    print "
$prog_name will check the setting of variables and the configuration 
of additional files that are important for the execution of the program.
Any errors or warnings encountered are marked as such.
At the end recommondations are given to solve any problems.

Please press <return> to continue...
";

    <>;

    local $db_file;
    local $orig_dir;
    local $field;
    local $pause = 1;
    local $dir_ok = '';
    local $val;
    local $mk_dir_rec = "The working directory you specified ('$work_dir') does not exist.\n\tPlease create the directory\n\t(on Unix: mkdir $work_dir)\n\tor specify a different directory either through the command line option \'-d\'\n\te.g. $program -d <dir>\n\tor ".(&line_number($config_file,'^\s*#*\s*work_dir\b'))."\n\te.g. work_dir /home/user/$prog_name";
    local $write_dir_rec = "Your working directory '$work_dir' is not writeable.\n\tPlease change the permissions\n\t(on Unix: chmod +w $work_dir)\n\tor specify a different directory either through the command line option \'-d\'\n\te.g. $program -d <dir>\n\tor ".(&line_number($config_file,'^\s*#*\s*work_dir\b'))."\n\te.g. work_dir /home/user/$prog_name";
    local $read_dir_rec = "Your working directory '$work_dir' is not readable.\n\tPlease change the permissions\n\t(on Unix: chmod +r $work_dir)\n\tor specify a different directory either through the command line option \'-d\'\n\te.g. $program -d <dir>\n\tor ".(&line_number($config_file,'^\s*#*\s*work_dir\b'))."\n\te.g. work_dir /home/user/$prog_name";
    local $system_rec = "A bad value for your \$system-variable has been detected.\n\tPlease use the command line option '-os' to specify one of 'macos','win','unix'\n\t(whatever comes closest to your operating system)\n\te.g. $program -os \'mac\'\n\tor set the value manually ".(&line_number($config_file,'^\s*#*\s*system\b'))."\n\te.g. system mac";
    local $header_rec = "The header-file you specified (\'$header\') could not be found or read.\n\tPlease make sure that the path is specified correctly\n\t- either via command line option\n\t  e.g. $program -head \'/home/user/header.file\'\n\t- or ".(&line_number($config_file,'^\s*#*\s*header\b'))."\n\t  e.g. header /home/user/header.file\n\tand that the file is readable\n\t(under Unix: chmod +r $header)";
    local $read_config_rec = "Your configuration file '$config_file' is not readable.\n\tPlease change the permissions\n\t(on Unix: chmod +r $config_file)\n\tor specify a different file either through the command line option \'-c\'\n\te.g. $program -c <config_file>\n\tor ".(&line_number($config_file,'^\s*#*\s*config_file\b'))."\n\te.g. config_file /home/user/$prog_name.config";
    print "Start checking...\n";
	
    # OPERATING SYSTEM:
    if ($system =~ /win|macos|$unix_flav/i) {
	print " - operating system defined as \'$orig_system\', alright\n";
    } else {
	$error++;
	print "   ERROR $error: bad value for variable \$system: \'$orig_system\'!\n";
	push (@error, $system_rec);
    }

    # WORKING DIRECTORY:
    unless ($work_dir) {
	print " - no working directory set, using the current directory, alright\n";
	$work_dir=$cwd;
    }
    print " - checking your working directory: \'$work_dir\'...\n";
    sleep($pause);
    if (! -e $work_dir) {
	$error++;
	print "\t   ERROR $error: Working directory \'$work_dir\' does not exist!\n";
	push (@error, $mk_dir_rec);	
    } elsif (!-w $work_dir) {
	$error++;
	print "\t   ERROR $error: Can not write to working directory!\n";
	push (@error, $write_dir_rec);	    
    } elsif (!-r $work_dir) {
	$error++;
	print "\t   ERROR $error: Can not read from working directory!\n";
	push (@error, $read_dir_rec);	    	    
    } else {
	print "\t - \'$work_dir\' is fully accessible, alright\n";
	$dir_ok = '1';
    }

    # CONFIGURATION FILE:
    print " - checking your configuration file \'$config_file\'...\n";
    sleep($pause);
    if ($config_dir) {
	print "\t - configuration file resides in $config_dir and is readable, alright\n";
    } else {
        # first look in the working directory
	$orig_dir = $cwd;
	chdir($work_dir);
	if (-e "$config_file") {
	    if (-r "$config_file") {
		print "\t - configuration file accessible from your working directory, alright\n";
	    } else {
		if (@_ = &empty_vars(@expect_val)) {
		    $error++;
		    print "\t   ERROR $error: Can not read your configuration file \'$config_file\' in $work_dir!\n";
		    push (@error, $read_config_rec);	
		} else {
		    print "\t   WARNING: mandatory variables are set but no configuration file could be read in $work_dir!\n";
		    $warning++;
		}   
	    } 			    
	} else {
	    chdir($orig_dir);
	    if (-e "$config_file") {
		if (-r "$config_file") {
		    print "\t - configuration file accessible from your current working directory, alright\n";
		} else {
		    if (@_ = &empty_vars(@expect_val)) {
			$error++;
			print "\t   ERROR $error: Can not read your configuration file \'$config_file\' in $orig_dir!\n";
			push (@error, $read_config_rec);	
		    } else {
			print "\t   WARNING: mandatory variables are set but no configuration file could be read in $orig_dir!\n";
			$warning++;
		    }    		
		}
	    } else {
		if (@_ = &empty_vars(@expect_val)) {
		    $error++;
		    print "\t   ERROR $error: Can not find your configuration file \'$config_file\'!\n";
		    push (@error, "No configuration file for $prog_name could be found.\n\tPlease make sure that a file called \'${prefix}.config\'\n\tis located in your current directory ($cwd)\n\tor in your working directory (\'$work_dir\')\n\tor in your home directory (\'$ENV{HOME}\')\n\tor specify a file on the command line\n\te.g. \'$program -c <config.file>\'");	    
		} else {
		    print "\t   WARNING: mandatory variables are set but no configuration file could be found!\n";
		    $warning++;
		}
		chdir($orig_dir);
		$orig_dir = '';
	    }
	}
    }

    if ($config_read) {
	# check that all mandatory fields have values
	$error_now = $error;
	unless ($html_file) {
	    $error++;
	    print "\t   ERROR $error: no file name specified for output HTML!\n";
	    push (@error, "Please specify a file name for output HTML\n\tthrough a statement like \'html_file ${prog_name}_result.html\' in your configuration file");
	} 

	unless ($relpubdate) {
	    $error++;
	    print "\t   ERROR $error: no maximum age for database entries specified !\n";
	    push (@error, "Please specify a maximum age for database entries (in days)\n\tby including a line like \'relpubdate 100\' in your configuration file.\n\tOther valid entries are '1 year','2 years','5 years','10 years','no limit'");
	}
    
	unless ($getmax) {
	    $error++;
	    print "\t   ERROR $error: no maximum number of entries specified !\n";
	    push (@error, "Please specify a maximum number for database entries (in days)\n\tby including a line like \'getmax 20\' in your configuration file.");
	}	    
	unless ($viewdays) {
	    $error++;
	    print "\t   ERROR $error: no value for viewdays specified !\n";
	    push (@error, "Please specify a value for number of days that an entry will be shown\n\tby including a line like \'viewdays 5\' in your configuration file.");
	}

	unless ($include_config) {
	    $error++;
	    print "\t   ERROR $error: handling of configuration file is not specified !\n";
	    push (@error, "Please specify if your configuration file should be appended to your output file\n\tby including a line like \'include_config Y\' in your configuration file.");
	}
	foreach (@warning) {
	    print "\t   WARNING: $_";
	}
	if ($error == $error_now) {
	    if (@warning) {
		print "\t - inconsistencies found in configuration file\n";
	    } else {
		print "\t - configuration file looks fine\n";
	    }
	}
    }
    sleep($pause);

    # DATABASE FILES
    if ($dir_ok) {
	print " - checking database..." ;
	sleep($pause);
    }
    $db_file = ($database or "${prefix}.db");
    if ($work_dir ne $cwd) {
	chdir $work_dir;
    }
    if (-e "$db_file" and $dir_ok) {
	unless (-w "$db_file") {
	    $error++;
	    print "\n\t   ERROR $error: Can\'t write to database \'$db_file\'!\n";
	    push (@error, "Please make your database writeable (chmod +w $db_file)\n");
	} else {
	    print "alright\n";
	}
    } else {
	print "\n\t - no database file found (\'$db_file\')\n\t - WARNING: initialization might take up a lot of space!\n" if ($dir_ok);
	$warning++;
    }
    # reverse changes to cwd and config_file
    if ($orig_dir) {
	chdir($orig_dir);
	$orig_dir = '';
	$config_file = '';
    }	    

    # HEADER:
    print " - checking header...\n";
    sleep($pause);
    if (-e $work_dir.$joiner.$header and $header) {
	$check_header = $work_dir.$joiner.$header;
    } else {
	$check_header = $header;
    }
    if ($check_header) {
	print "\t - trying to read file \'$header\'...";
	if (-r $check_header) {
	    print "alright\n";
	} else {
	    $error++;
	    print "\n\t   ERROR $error: Can not read file \'$check_header\'!\n";
	    push (@error, $header_rec);
	}
    } else {
	print "\t - using automatically generated header for output file, alright\n";
    }    
	
    # INTERNET CONNECTION
    print STDERR " - checking the internet connection...";
    unless ($test_URL) {
	print STDERR "\n\t - WARNING: no test URL available, cannot carry out test!\n";
	$warning++;
    } else {
	if ($lynx) {
	    $rescontent = `$lynx -source \'$test_URL\'`;
	    if ($rescontent =~ /\s*^\w+\: Can't access startfile/ or $rescontent eq '') {
		$error++;
		print "\n\t - Error $error: Can\'t reach test-URL!\n";
		push @error, "Problems encountered when trying to access the test-URL.\n\tPlease check the test-URL:\n\t\'$test_URL\'\n\tor the configuration of \'$lynx\'";
	    } else {
		print "alright\n";
	    }
	} elsif ($proxy) {
	    if ($no_test) {
		print STDERR "\n\t - disabeling \'no_test\'-setting for check...";
		$no_test = 0;
	    }
	    &proxy_setting('check');
	} else {
	    if (&connection_test('no_proxy',$test_URL)) {
		print "alright\n";
	    } else {
		$error++;
		print "\n\t - Error $error: Can\'t reach test-URL!\n";
                $proxy_err = "Problems encountered when trying to access the test-URL.\n\tPlease check the test-URL:\n\t\'$test_URL\'\n\t";
                if ($proxy) {
                    $proxy_err .= "and your proxy settings (command line or line ".(&line_number($config_file,'^\s*#*\s*proxy\b'))." of this script)".($proxy?":\n\t\'$proxy\'".($proxy_port?", \'$proxy_port\'":"")." evaluated to \'$proxy_string\'":"");
                } else {
                    $proxy_err .= "and consider using a proxy-server (using command line option \'-p\' \n\tor setting value for \'proxy\' ".(&line_number($config_file,'^\s*#*\s*proxy\b')).".";
                }
		push @error, $proxy_err; 
	    }
	}
    }

    # GIVE RECOMMENDATIONS:
    if ($error > 0) {
	print "\n$error ";
	print ($error > 1?'errors have ':'error has ');
	print "been detected!\nSome suggestions will be made next on how to solve any problems.\n\nPlease press <return> to continue...";
	<>;
	$tip = 1;
	foreach (@error) {
	    print "\nTIP $tip: $_\n";
	    unless ($tip >= $error) {
		print "\nPress <return> to see next tip...";
		<>;
	    }
	    $tip++;
	}
	print "\nPlease run \'$program -check\' again after any changes made!\n\nEnd of check!\n\n";
    } else {
	print "\nEnd of check, no error detected.";
	if ($warning > 0) {
	    print " $warning warning".($warning > 1?"s.":".");
	}
	print "\nWith this setup the program should run without problems.\n\n";
    }
    exit($EXIT_SUCCESS);
}

################################################################################

sub line_number {
        # find a pattern in a file and print out 
        # the line-number where it occurs the first time
    local $file = shift;
    local $pattern = shift;
    local $line = 0;
    
    open (IN, "$file") or return 'in your configuration file';;
    while (<IN>) {
	$line++;
	if (/$pattern/) {
	    return "at line $line of your configuration file";
	}
    }
    return 'in your configuration file';
}
	

################################################################################

sub read_db {
        # read in a database file which holds
        # previous uids and the time they were retrieved
        # ordered by aliases
    local $db_file = ($database or "${prefix}.db");
    local $alias = '';
    local ($uid, $age);

    return unless (-e "$db_file");

    open (DB, "$db_file") 
	or die "$prog_name ERROR: cannot open database \'$db_file\' ";
    $^W = 0; # switch off warning
    while (chomp($_ = <DB>)) {
	  # identify alias by percent sign at beginning of line
	if (/^%(.*)/) {	    
	    $alias = $1;
	} elsif (/\d/) {
	    ($uid,$age) = split;
	    ${ $db{$alias} }{$uid} = $age;
        }
    }
    $^W = $warn_stat; # set back warning status
    close DB;
}

################################################################################

sub save_db {
#save the updated database:
#(12 Oct 98: saving to a different filename (.temp) and then re-naming 
#to avoid losing the whole database if there's a crash)
    local $db_file = ($database or "${prefix}.db");
    local $alias;

    return if ($cmd_query);
    open(LOG,">$db_file.temp") ||
	 die "$prog_name ERROR: cannot write to database file $db_file.temp";
    
    foreach $alias (keys %aliases) {
	print LOG '%'."$alias\n";
	foreach $uid (keys %{ $db{$alias} }) {
	    print LOG "$uid\t${ $db{$alias} }{$uid}\n";
	}
    }
    close(LOG);

    if ($system =~ /win/i) {
	    # under Windows, the former database has to be
	    # (re)moved before another file can take its name...
	move("$db_file", "$db_file.bak") or 
	    warn "Can not move $db_file to $db_file.bak\n";
    }
    rename("$db_file.temp","$db_file") ||
	warn "$prog_name ERROR: cannot rename temp database file";
}

################################################################################

sub read_config {
        # read in configuration file to set values of variables
        # and to get search criterias, databases and according aliases
    local $field;
    local $val;
    local $alias;
    local $searchtype;
    local $query;
    local $line = 0;
    local %found = ();
    local $open_result;
	
    return if ($config_read);

    $config_dir = cwd;

    $open_result = open (CONFIG,"$config_file");
    unless ($open_result) {
	if ($check) {
	    $config_dir = '';
	    return;
	} else {
	    die "$prog_name ERROR: cannot open configuration file $config_file in directory $config_dir";
	}
    }

  WHILE:while (<CONFIG>){
	
        $line++;
	($_) = split (/\#/);             # remove comments
	($_, undef) = split (/\</, $_, 2);             # remove HTML-tags
        s/\s*$//;                        # clean end of line from white-space
	$^W = 0;                         # disable warnings
	next unless (/\w/);              # skip empty lines
	$^W = $warn_stat;                # set back warning status       
	s/\s+/ /g;                       # reduce multiple whitespaces 
	                                 # to single spaces
	unless (/^\s*$known_searchtypes\s+(.*)/) {      #load general setup data	    
	    ($field,$val)=split(/\s+/, $_, 2);
		    # strip any leading or ending quotes
		$field =~ s/^'|"//;
                $field =~ s/"|'$//;

	        # check if user is allowed to change
                # value of this variable
	    unless (grep /\Q$field/, @allowed_var) {
		print STDERR "$prog_name WARNING: Invalid variable name: $field at line $line of config-file, skipping!\n" unless ($mute);
		$warning++;
		next;
	    }

	        # skip if value has been set
	        # by command line option already:
	    next if (${$field} or ${$field} eq '0');         
	                                 
	    if ($val) {
		    # strip any leading or ending quotes
		$val =~ s/^'|"//;
                $val =~ s/"|'$//;
                    # convert leading tilde to HOME-directory
                $val =~ s/^~/$ENV{'HOME'}/ if ($ENV{'HOME'});
                    # set value
		${$field} = $val;
            }
	}else{                        
	    # extract database, query and alias:
	    $searchtype=$1; #($known_searchtypes is in brackets)
	    $_ = $2;        #load string following search type
	    if (/^\'/) {    #look for alias
		if (s/'/'/g > 2) {
		    if ($check) {
			push @warning, "Too many aliases declared for $_\n";
			$warning++;
		    } else {
			print STDERR "$prog_name WARNING: Too many aliases declared, dismissing $_!\n" unless ($mute);
		    }
		} else {
		    (undef,$alias,$_) = split /\'/;
		}
	    } else {        # use query for alias if none specified
		$alias = $_;
	    }
	        # standard format for queries:
	    s/^\s*//;      # delete leading white space
	    s/\s+/\+/g;             #put in plusses
	    $query = uc;                #convert to all uppercase
	        # check if query exists already
	    if ($query{$query}) {
		if (($query{$query}{'ALIAS'} eq $alias) and
		    ($query{$query}{'DB'} eq $searchtype)) {
		    if ($check) {
			push @warning, "Double entrance for $query\n"; 
			$warning++;
		    } else {
			print STDERR "$prog_name WARNING: Double entrance for $query, dismissing one\n" unless ($mute);
		    }
		} else {
		    $query .= '#2';
		}
	    }
	        # store query:
	    push @query_order, $query;
	    $query{$query}{'ALIAS'} = $alias;	    
	    $query{$query}{'DB'} = $searchtype;
	        # group queries according to their alias:
	    foreach $item (@{ $aliases{$alias} }) {
		   # make sure they all query the same database
		if ($query{$item}{'DB'} ne $searchtype) {
		    if ($check) {
			push @warning, "Inconsistency in databases for alias \'$alias\', dismissing alias for query $query.\n"; 
			$warning++;
		    } else {
			print STDERR "$prog_name WARNING: Database of query $query differs from other queries with same alias, dismissing alias.\n" unless ($mute);
		    }
		    push @{ $aliases{$query} }, $query;
		    $query{$query}{'ALIAS'} = $query;
		    next WHILE;
		}
	    }
	       # if we got here, all databases for queries sharing this alias
	       # are the same and we can safely add this query...
            unless ($found{$alias}) {
	        push @alias_order, $alias;
		$found{$alias} = '1';
	    }
	    push @{ $aliases{$alias} }, $query;
	}
    }
    $^W = $warn_stat;      # set back warning status       
    close (CONFIG);

    $config_read = 1;
}

################################################################################

sub empty_vars {
      # checks if all submitted variables have values set
      # returns the name of variables without value
    local @expect_val = @_;
    local @no_val = ();

    foreach (@expect_val) {
	push (@no_val, $_) unless (${$_});
    }
    return @no_val;
}


__END__


####------------------------END OF PROGRAM--------------------------####
#                           ==============                             #
####----------------------------------------------------------------####


####--------------------POD-text starts here: ----------------------####
#                       =====================                          #
#  (You can try to read it as it is or convert it into a nicer format  #
#  with one of the programs pod2html, pod2man or pod2latex, that are   #
#  normally part of a Perl-distribution.)                              #

=head1 NAME

PubCrawler - Automated Retrieval of PubMed and GenBank Reports

=head1 SYNOPSIS

    usage: PubCrawler [-add_path -c <config_file>] [-check] [-d <directory>]   
	   [-db <database>] [-fullmax <max-docs in full>]   
	   [-getmax <max-docs to get>] [-h] [-help] [-head <output-header>]    
	   [-i] [-l <log_file>] [-lynx <alternative-browser>] [-mute] [-no_test]   
	   [-os <operating_system>] [-out <output-file>] [-p <proxy_server>]     
	   [-pp <proxy_port>] [-pauth <proxy_authorization>]    
	   [-ppass <proxy_password>] [-pre <prefix>] [-q <query_URL>]  
	   [-relpubdate <relative-publication-date>] [-s <search-term]
	   [-t <timeout>] [-u <test_URL>] [-v <verbose>] [-viewdays <view-days>]

    options:
    -add_path adds the path /cwd/lib to @INC (list of library directories)
              where cwd stands for the current working directory
    -c       configuration file for pubcrawler
    -check   checks if program and additional files are setup correctly
    -d       pubcrawler working directory (config,databas,and output)
    -db      name of database file
    -fullmax maximum number of full length reports shown (per search)
    -getmax  maximum number of documents to retrieve (per search)
    -h       this help message
    -head    HTML-header for output file
    -help    same as -h
    -i       include configuration file in HTML-output
    -l       name of file for log-information
    -lynx    command for alternative browser
    -mute    suppresses messages to STDERR
    -no_test skips the proxy-test
    -os      operating system (some badly configured versions of Perl need  
	     this to be set explicitly -> 'MacOS', 'Win', and 'Unix')
    -out     name of file for HTML-output
    -p       proxy
    -pp      proxy port
    -pauth   proxy authorization (user name)
    -ppass   proxy password
    -pre     prefix used for default file names (config-file,database,log)
    -q       query URL
    -relpubdate maximum age (relative date of publication in days) 
	     of a document to be retrieved
             other valid entries: '1 year','2 years','5 years','10 years','no limit'
    -s       search-term ('database#alias#query#')
    -t       timeout (in seconds, defaults to 180)
    -u       test-URL (to test proxy configuration)
    -v       verbose output
    -viewdays number of days each document will be shown

Some command-line options can also be set in the configuration file.
If both are set and they conflict, the command-line setting takes priority.

=head1 DESCRIPTION

PubCrawler automates requests for user-specific searches in the PubMed-
and the GenBank-database at NCBI at http://www.ncbi.nlm.nih.gov/ .

=head1 USAGE

=head2 Testing

To test if everything is setup correctly, run the program in B<check-mode> first. At your command prompt, enter the name of the script together with any command line options that might be necessary and the B<-check> option. 

Mac users can do this either by setting the B<check>-variable in their configuration file to '1', or by entering B<-check> at the command line (when the B<prompt>-variable in the configuration file is set to '1', the user will be prompted for command-line options after the start of the program).

Windows users can start the Perl executable in a DOS-box followed by the name of the script, followed by any necessary parameters, including the B<-check> option. 

The program will perform a quick check on all the settings and report any errors that it encountered.

B<Recommended when using the program for the first time!>

=head2 Customization

PubCrawler allows two forms of customization:

=over 4

=item command-line options

For a temporary change of parameters the command-line options as listed in L<SYNOPSIS> can be used.

=item configuration file

Permanent customization of settings can be achieved by editing the PubCrawler configuration file. 

It is divided into three parts: L<Mandatory Variables>, L<Optional Variables> and L<Search Terms>.

The value of any variable can be set by writing the variable name and its value separated by a blank on a single line in the configuration file. The value can consist of several words (eg. Mac-directory-name). Any leading or trailing quotes will be stripped, when the data is read in.

Each search-specification has to be written on one line.
The first word must specify the database (genbank or pubmed).
Any following words enclosed in single quotes (') will be used
as an alias for this query, otherwise they will be considered
Entrez-search-terms, as will the rest of the line.
You must not enclose anything but the alias in single quotes!

=back

=head2 Automated Execution

PubCrawler makes most sense if you have it started automatically, for example overnight. This would assure that your database is always up to date.

The automation depends on the operating system you are using. In the following a few hints are given how to set up schedules:


=over 4

=item UNIX/LINUX

Unix and Unix-like operating systems normally have the cron-daemon running, which is responsible for the execution of scheduled tasks.

The program B<crontab> lets you maintain your crontab files. To run PubCrawler every night at 1:00 a.m. execute the following command:

    crontab -e

and insert a line like the following:

    0 1 * * * $HOME/bin/PubCrawler.pl

presuming your executable resides in $HOME/bin/PubCrawler.pl. Any command line options can be added as desired.

For more information about crontab type C<man crontab>.

=item WINDOWS 95/98

Windows 98 and the Plus!-pack for Windows 95 come with the ability to schedule tasks.
To set up a schedule for PubCrawler, proceed as follows:

Start the I<Task Scheduler> by clicking on

    Start->Programs->Accessories->System Tools->Scheduled Tasks

Follow the instructions to add or create a new task.
Select the Perl-executable (I<perl.exe>), type in a name for the task and choose 
time and day of start.
Open the advanced properties and edit the I<Run:>-line by adding the name of the script and any command line options. It might look something like the following:

Run: C<C:\perl\bin\perl.exe D:\pubcrawler\PubCrawler.exe -d D:\pubcrawler>

You might also consider entering the working directory for PubCrawler in the I<Start in:>-line.

=item MACINTOSH

Unfortunately the Mac operating system does not have a built in scheduler, but there is a shareware program available called B<Cron for Macintosh>.
It costs $10 and has a web-site at http://gargravarr.cc.utexas.edu/cron/cron.html

=back


=head1 MISCELLANEOUS

=head2 Location of the Configuration File

By default PubCrawler looks for a configuration file named PubCrawler.config (or more precisely, B<I<prefix>.config> if the prefix has been changed by using the C<-pre> command-line option).
The first part of the name (I<prefix>) defaults to the main part of the program name (the basename 
up to the last dot, normally I<PubCrawler>).
Other prefixes can be specified via the command line option C<-pre> followed by a prefix.
Other names for the configuration file can be specified via the command line 
option C<-c> followed by the file name.
Using different prefix names allows multiple uses of the program with multiple output files.

The program will first look for a configuration file in the PubCrawler working directory,
if specified via command line option. The second place to look for it is the
home directory as set in the environmental variable 'HOME'. Last place is
the current working directory. If no configuration file could be found and
not all of the L<Mandatory Variables> are specified via command line options
(see L<SYNOPSIS>) then the program exits with an error message.

=head2 Mandatory Variables

There is a number of variables for which PubCrawler expects values given by the user. These are:

=over 4

=item html_file

name of the ouput-file

=item viewdays

number of days each document will be shown

=item relpubdate

maximum age (in days) of database entries to be reported


=item getmax 

maximum number of documents to be retrieved for each search carried out

=item fullmax 

the maximum number of documents for which a full report is being presented

=item include_config 

whether or not to append config-file to output

=item search_URL 

URL from which documents are being requested


=back


The values for these variables can be specified either in the PubCrawler configuration file or as command-line options (see L<SYNOPSIS>).

=head2 Optional Variables

For some variables an optional value can be set in the configuration file. These are:

=over 4

=item work_dir

working directory for PubCrawler

=item check

PubCrawler runs in check-mode if set to '1' (see L<Testing>)

=item prompt

PubCrawler prompts Mac-users for command-line options if set to '1'

=item verbose

PubCrawler prints log messages to screen if set to '1'

=item lynx

PubCrawler uses specified command to evoke command-line browser for HTTP-requests
(see also L<Perl-Modules and Alternative Choice>)

=item header

location of header (in HTML-format) that will be used for the output file

=item prefix

alternative prefix for standard files (configuration, database, log)

=item system

explicit assignment of operating system ('MacOS','Win','Unix', or 'Linux')

=item proxy

proxy server for internet connection

=item proxy_port

port of the proxy server, defaults to 80

=item proxy_auth

user name for proxy authorization

=item proxy_pass

pass-word for proxy authorization

B<CAUTION:> Storing passwords in a file represents a possible security risk!

=item time_out

time in seconds to wait for internet responses, defaults to 180

=item test_URL

URL for test of proxy-configuration

=item no_test

disables test of proxy-configuration

=back

=head2 Search Terms

The definition of the search terms is the same as given by I<Entrez Search System>. Please look up their site for more information ( http://www.ncbi.nlm.nih.gov/Entrez/entrezhelp.html#ComplexExpression )

A search-term entered at the command-line (e.g. C<pubcrawler.pl -s 'pubmed#Test-search#Gilbert [AUTH]'>) hides all other queries specified in the configuration script.

=head2 Perl-Modules and Alternative Choice

The HTTP-requests are handled by
Perl-modules, namely B<LWP::Simple>, B<LWP::UserAgent>, and B<HTML::Parser>. These are included in the latest distributions
of Perl for Windows and MacPerl 
(check out http://www.perl.com/pace/pub/perldocs/latest.html ). 
They are also freely available
from the CPAN-archive at http://www.perl.com/CPAN/.

In case you would like to run PubCrawler without these modules you
have to provide a command-line browser as an alternative (like B<lynx>,
available at http://lynx.browser.org ).

To disable the use of the modules comment out the three lines following 
I<#### ADDITIONAL MODULES ####> at the beginning of the file by putting a 'I<#>'
in front of them.
You also have to comment out the line 

    @proxy_config = split /\n/, (get "$proxy_conf");

in the subroutine I<proxy_setting>, somewhere near the middle of the program script (approx. line 1142).

=head2 Setup of Queries

Only queries to the same databases are allowed to share the same alias.
If queries with the same alias but different databases are detected,
their alias will be changed to the query-name. That means that the results
for this entry will appear in their own section.

=head1 REPORTING PROBLEMS

If you have difficulty installing the program or if it doesn't produce
the expected results, please read this documentation carefully and also
check out PubCrawlers web-page at http://acer.gen.tcd.ie/pubcrawler

If none of this advice helps you should send an e-mail with a detailed 
description of the problem to pubcrawler@acer.gen.tcd.ie

=head1 DOCUMENTATION

=head1 AUTHOR

Original author:  Dr. Ken H. Wolfe, khwolfe@tcd.ie

Second author: Karsten Hokamp, khokamp@tcd.ie

Both from the Department of Genetics, Trinity College, Dublin

If you have problems, corrections, or questions, please see
L<"REPORTING PROBLEMS"> above.

=head1 REDISTRIBUTION

Please do not distribute any modified version of this program!

This program has a homepage at at http://acer.gen.tcd.ie/pubcrawler .

=head1 LAST MODIFIED

$Id: pubcrawler.pl,v 0.95 1999/02/25 11:39:13 pubcrawl Exp pubcrawl $

