#!/usr/bin/perl -w

use strict;
use Getopt::Std;
use LWP::Simple;

# declare the file-scoped variables shared by the rest of the script
# loop counters ($i, $j), the downloaded page ($doc), per-author scratch
# values, the journal string and the usage text
my ($i, $j, $doc, $author, $authorformatted, @authorbits, $bit, $journal, $usage);
# the requested entry, the page split into lines, the extracted fields
# (title, authors, comments, year) and a scratch capture ($substring)
my ($arxiventry, @lines, $title, @authors, $comments, @commentlist, $year, $substring);

# Replace the HTML entities in a string by plain/LaTeX text.
#
# Argument: the string to clean (the caller's variable is not modified).
# Returns:  the cleaned string.
#
# All entities are decoded in ONE left-to-right pass, so text produced by a
# replacement is never re-read as (part of) another entity: "&amp;lt;" gives
# "&lt;" and "AT&amp;T; x" gives "AT&T; x".
#
#   &quot;              -> removed
#   &amp;               -> &
#   &lt; / &gt;         -> $<$ / $>$
#   &tilde;             -> \~
#   &nbsp;              -> a space
#   &#xHH...; / &#DD...; -> the character with that hex / decimal code
#   any other &name;    -> removed
sub cleanhtmlchars {
    my $inline = shift(@_);

    # exactly one of the three captures is defined for every match
    $inline =~ s/&(?:\#x([0-9a-fA-F]+)|\#([0-9]+)|([a-zA-Z0-9]+));/_entitytext($1, $2, $3)/ge;

    return($inline);
}

# Private helper of cleanhtmlchars: replacement text for a single entity.
# Arguments: the hex digits, the decimal digits and the entity name; only the
# one that matched is defined.
sub _entitytext {
    my ($hexcode, $deccode, $name) = @_;

    # numeric character references
    return(chr(hex($hexcode))) if (defined($hexcode));
    return(chr($deccode))      if (defined($deccode));

    # known named entities; everything else is dropped
    my %known = (
        'quot'  => '',
        'amp'   => '&',
        'lt'    => '$<$',
        'gt'    => '$>$',
        'tilde' => '\\~',
        'nbsp'  => ' ',
    );
    return(exists($known{$name}) ? $known{$name} : '');
}

# define usage: shown on -h and whenever the command line is malformed
$usage = "Fetches a preprint entry from arxiv.org and converts it to Bibtex

USAGE:
  $0 [options...] <arxiv.org entry>

OPTIONS:
  -h           : display this help text

EXAMPLE:
  $0 astro-ph/0603001
  $0 arXiv:0704.2601\n";

# get options; an unknown switch is an error
my %Options;
if (!getopts('h', \%Options)) {
    die($usage);
}

# -h is a request, not an error: print the help to stdout and exit with
# status 0, whether or not an entry was given as well
if ($Options{h}) {
    print($usage);
    exit(0);
}

# exactly one argument (the arxiv.org entry) must remain after the options
if (scalar(@ARGV) != 1) {
    die($usage);
}

# store the arxiventry in a variable; an entry that starts with a digit
# gets the "arXiv:" prefix
$arxiventry = $ARGV[0];
if ($arxiventry =~ /^[0-9]/) {
    $arxiventry = "arXiv:".$arxiventry;
}

# download the abstract page; get() returns undef when the request fails, in
# which case there is nothing to convert
$doc = get("http://www.arxiv.org/abs/$arxiventry");
if (!defined($doc)) {
    die("ERROR: could not download http://www.arxiv.org/abs/$arxiventry\n");
}

# split on newlines
@lines = split(/\n/, $doc);

# zero out variables that may never be set
$journal = "";
$comments = "";
$title = "";

# loop through lines of HTML file and extract info.  $i is also advanced
# inside the body whenever a field spans several lines; every such inner loop
# stops at the last line, so a truncated or unexpected page cannot hang the
# script.
for ($i=0; $i<=$#lines; $i++) {
    # warn on Journal-ref
    if ($lines[$i] =~ /<div class="tablecell jref">/) {
        warn("WARNING: This paper has a journal reference.\n");
        warn("WARNING: You should probably download the proper Bibtex entry from ADS.\n");
    }

    # title: collected from the lines AFTER the <h1 class="title"> line, up to
    # and including the line that ends with </h1>
    if ($lines[$i] =~ /^<h1 class="title">/) {
        $title = "";
        while ($lines[$i] !~ /<\/h1>$/ && $i < $#lines) {
            $i++;
            if ($lines[$i] =~ /^[\s]*(.+)<\/h1>/ || $lines[$i] =~ /^[\s]*(.+)$/) {
                my $piece = $1;
                $title = ($title eq "") ? $piece : $title." ".$piece;
            }
        }
        $title = cleanhtmlchars($title);
        $title =~ s/[\s]+/ /g;
    }

    # authors: one per line, on the consecutive lines containing "au:" that
    # follow the "Authors:" line
    if ($lines[$i] =~ /Authors:/) {
        $i++;
        while ($i <= $#lines && $lines[$i] =~ /au:/) {
            # only use the capture when the link text was actually found
            if ($lines[$i] =~ /\">(.+)<\/a>/) {
                $author = cleanhtmlchars($1);
                # make sure initials like "J.R." become separate words
                $author =~ s/\./\. /g;
                # split on runs of whitespace, ignoring leading whitespace
                @authorbits = split(' ', $author);
                if (@authorbits) {
                    # the last word is the surname, protected by braces
                    $authorformatted = "\{$authorbits[$#authorbits]\}";
                    if ($#authorbits >= 1) {
                        my @given;
                        for ($j=0; $j<$#authorbits; $j++) {
                            $bit = $authorbits[$j];
                            # abbreviate spelled-out names to an initial
                            if ($bit !~ /\./) {
                                $bit =~ s/([A-Z]).+/$1\./g;
                            }
                            push(@given, $bit);
                        }
                        $authorformatted = $authorformatted.", ".join("~", @given);
                    }
                    push(@authors, $authorformatted);
                }
            }
            $i++;
        }
    }

    # comments & (hopefully) journal
    if ($i <= $#lines && $lines[$i] =~ /Comments:/) {
        # look for the table cell holding the comments
        while ($i <= $#lines && $lines[$i] !~ /^<td class="tablecell comments">/) {
            $i++;
        }
        # the page ended before the cell showed up: nothing left to parse
        last if ($i > $#lines);

        if ($lines[$i] =~ /^<td class="tablecell comments">(.*)<\/td>/) {
            # whole cell on one line (possibly empty)
            $comments = $1;
        } else {
            # cell spans several lines: start with the rest of this line, if any
            $comments = ($lines[$i] =~ /^<td class="tablecell comments">(.+)/) ? $1 : "";
            while ($lines[$i] !~ /<\/td>$/ && $i < $#lines) {
                $i++;
                if ($lines[$i] =~ /^[\s]*(.+)<\/td>/ || $lines[$i] =~ /^[\s]*(.+)$/) {
                    my $piece = $1;
                    $comments = ($comments eq "") ? $piece : $comments." ".$piece;
                }
            }
        }

        # process comments
        $comments = cleanhtmlchars($comments);
        $comments =~ s/[\s]+/ /g;

        # guess the journal from the first comment fragment that mentions a
        # submission status
        @commentlist = split(/[,\.;]/, $comments);
        for ($j=0; $j<=$#commentlist; $j++) {
            my $piece = $commentlist[$j];
            if ($j > 0
                && $piece =~ /^[\s]*(?:[Ss]ubmitted|[Aa]ccepted|[Tt]o[\s]+[Aa]ppear|[Ii]n[\s]+[Pp]ress)[\s]*$/) {
                # the fragment is only the status word(s): the journal name is
                # taken from the fragment before it
                $journal = $commentlist[$j-1].",".$piece;
            } elsif ($piece =~ /[Ss]ubmitted|[Aa]ccepted|[Aa]ppear|[Pp]ress/) {
                # the status is part of a longer fragment: keep the fragment
                $journal = $piece;
            } else {
                next;
            }
            $journal =~ s/^[\s]+//;
            $journal =~ s/[\s]+$//;
            # trailing space separates the text from what is appended later
            $journal = $journal." ";
            last;
        }
    }
}

# append arxiv info to journal entry
$journal = $journal."($arxiventry)";

# extract year: the first two digits of the first run of three or more digits
# in the entry are taken as the two-digit year.  Without such a run there is
# no year to print, so stop instead of emitting a bogus one.
if ($arxiventry =~ /([0-9][0-9])[0-9]+/) {
    $year = $1;
} else {
    die("ERROR: cannot find a year in the arXiv entry '$arxiventry'\n");
}

# two-digit years from 80 upwards are read as 19xx, all others as 20xx
$year = ($year >= 80 ? "19" : "20").$year;

# finally, print Bibtex entry to stdout
print("%%% Bibtex entry created with $0\n");
print("\@ARTICLE{$arxiventry,\n");
# join() always closes the author field, even when no author was found
print("   author = {".join(" and ", @authors)."},\n");
# the title may be undefined when the page contained no title heading
print("    title = \"{".(defined($title) ? $title : "")."}\",\n");
print("  journal = {$journal},\n");
print("     year = $year,\n");
print(" comments = {$comments}\n");
print("}\n");
