#!/usr/bin/perl -w
#
# Fetches the abstract page of an arXiv preprint and prints a BibTeX
# @ARTICLE entry for it on stdout.  Warnings go to stderr.

use strict;
use warnings;
use Getopt::Std;
use LWP::Simple;

# Markup patterns used to locate fields in the downloaded abstract page.
# NOTE(review): the HTML tags inside the original patterns were lost from the
# source of this script (only empty/newline-only patterns remained).  The
# three patterns below are tolerant reconstructions based on the surrounding
# logic (the title is closed by </h1>, the comments by </td>) and on the
# presumed class names of the arXiv abstract page -- confirm against a live
# page before relying on them.
my $JOURNAL_REF_RE   = qr/<td[^>]*\bjref\b/;
my $TITLE_OPEN_RE    = qr/^\s*<h1[^>]*\btitle\b/;
my $COMMENTS_CELL_RE = qr/^\s*<td[^>]*\bcomments\b[^>]*>/;

# Converts HTML entities in a string to text suitable for a BibTeX field.
#   $inline : raw text taken from the HTML page (undef is treated as "")
# Returns the cleaned string.
sub cleanhtmlchars {
    my $inline = shift(@_);
    return "" unless defined $inline;

    # known entities: quotes are dropped, < and > become math-mode symbols
    $inline =~ s/&quot;//g;
    $inline =~ s/&amp;/&/g;
    $inline =~ s/&lt;/\$<\$/g;
    $inline =~ s/&gt;/\$>\$/g;
    $inline =~ s/&tilde;/\\~/g;
    $inline =~ s/&nbsp;/ /g;

    # hexadecimal character references (&#x..;) become the character itself
    $inline =~ s/&\#x([0-9a-fA-F]+);/chr(hex($1))/ge;

    # any remaining named entity is unknown: remove it
    $inline =~ s/&[a-zA-Z0-9]+;//g;

    return $inline;
}

# Accumulates text from the lines following $$idx_ref until a line ending in
# the closing tag </$tag> has been consumed (or the input runs out, which
# prevents running past the end of a truncated page).
#   $lines   : reference to the array of page lines
#   $idx_ref : reference to the current line index; advanced in place
#   $tag     : tag name whose closing tag terminates the field
#   $text    : text collected so far (may be "")
# Returns the collected text, pieces joined by single spaces.
sub collect_until_close {
    my ($lines, $idx_ref, $tag, $text) = @_;

    while ($$idx_ref < $#{$lines} && $lines->[$$idx_ref] !~ /<\/$tag>$/) {
        $$idx_ref++;
        my $line = $lines->[$$idx_ref];
        if ($line =~ /^\s*(.+)<\/$tag>/ || $line =~ /^\s*(.+)$/) {
            $text = ($text eq "") ? $1 : $text . " " . $1;
        }
    }
    return $text;
}

# Formats one author name as "{Surname}, I.~J." for BibTeX.
#   $name : cleaned author name as displayed on the page
# Returns the formatted name, or nothing when the name is empty.
sub format_author {
    my ($name) = @_;

    # make sure initials such as "A.B." are split into separate words
    $name =~ s/\./\. /g;
    $name =~ s/\s+/ /g;
    $name =~ s/^ //;
    $name =~ s/ $//;

    my @bits = split(/ /, $name);
    return unless @bits;

    # the last word is taken as the surname; the rest become initials
    my $surname   = pop(@bits);
    my $formatted = "{" . $surname . "}";
    if (@bits) {
        for my $bit (@bits) {
            # abbreviate full given names; words with a dot are kept as-is
            $bit =~ s/([A-Z]).+/$1\./g if $bit !~ /\./;
        }
        $formatted .= ", " . join("~", @bits);
    }
    return $formatted;
}

# Guesses a journal/status string from the free-text comments field.
#   $comments : cleaned comments text
# Returns the first fragment that mentions submission/acceptance, followed by
# a single space, or "" when no fragment matches.
sub find_journal {
    my ($comments) = @_;

    my @items = split(/[,\.;]/, $comments);
    for my $j (0 .. $#items) {
        my $journal;
        if ($j > 0
            && $items[$j] =~ /^(\s*(?:[Ss]ubmitted|[Aa]ccepted|[Tt]o\s+[Aa]ppear|[Ii]n\s+[Pp]ress)\s*)$/)
        {
            # a bare status word: the journal name is the preceding fragment
            $journal = $items[$j - 1] . "," . $1;
        }
        elsif ($items[$j] =~ /(.*(?:[Ss]ubmitted|[Aa]ccepted|[Aa]ppear|[Pp]ress).*)/) {
            $journal = $1;
        }
        else {
            next;
        }
        $journal =~ s/^\s*//;
        $journal =~ s/\s*$//;
        return $journal . " ";
    }
    return "";
}

# define usage
my $usage = "Fetches a preprint entry from arxiv.org and converts it to Bibtex\n"
    . "\n"
    . "USAGE:\n"
    . "  $0 [options...]\n"
    . "\n"
    . "OPTIONS:\n"
    . "  -h : display this help text\n"
    . "\n"
    . "EXAMPLE:\n"
    . "  $0 astro-ph/0603001\n"
    . "  $0 arXiv:0704.2601\n";

# get options
my %Options;
if (!getopts('h', \%Options)) {
    die("$usage");
}

# exactly one argument (the arXiv identifier) is required
if ($#ARGV != 0) {
    die("$usage");
}

# print help if necessary
if ($Options{h}) {
    die("$usage");
}

# store the arxiventry in a variable; new-style numeric ids get a prefix
my $arxiventry = $ARGV[0];
if ($arxiventry =~ /^[0-9]/) {
    $arxiventry = "arXiv:" . $arxiventry;
}

# extract year from the first two digits of the identifier's number
my $year;
if ($arxiventry =~ /([0-9][0-9])[0-9]+/) {
    $year = $1;
}
else {
    die("ERROR: cannot extract a year from '$arxiventry'\n\n$usage");
}
if ($year >= 80) {
    $year = "19" . $year;
}
else {
    $year = "20" . $year;
}

# download and split on newlines; get() returns undef on failure
my $url = "http://www.arxiv.org/abs/$arxiventry";
my $doc = get($url);
if (!defined $doc) {
    die("ERROR: could not download $url\n");
}
my @lines = split(/\n/, $doc);

# fields that may never be set by the page
my $title    = "";
my $journal  = "";
my $comments = "";
my @authors;

# loop through lines of HTML file and extract info
for (my $i = 0; $i <= $#lines; $i++) {

    # warn on Journal-ref
    if ($lines[$i] =~ $JOURNAL_REF_RE) {
        warn("WARNING: This paper has a journal reference.\n");
        warn("WARNING: You should probably download the proper Bibtex entry from ADS.\n");
    }

    # title: text on the lines after the opening tag, up to </h1>
    if ($lines[$i] =~ $TITLE_OPEN_RE) {
        $title = collect_until_close(\@lines, \$i, "h1", "");
        $title = cleanhtmlchars($title);
        $title =~ s/\s+/ /g;
    }

    # authors: one link per line after the "Authors:" label
    if ($lines[$i] =~ /Authors:/) {
        $i++;
        while ($i <= $#lines && $lines[$i] =~ /au:/) {
            if ($lines[$i] =~ /\">(.+)<\/a>/) {
                my $formatted = format_author(cleanhtmlchars($1));
                push(@authors, $formatted) if defined $formatted;
            }
            $i++;
        }
        last if $i > $#lines;
    }

    # comments & (hopefully) journal
    if ($lines[$i] =~ /Comments:/) {

        # advance to the table cell holding the comments text
        while ($i < $#lines && $lines[$i] !~ $COMMENTS_CELL_RE) {
            $i++;
        }

        if ($lines[$i] =~ /$COMMENTS_CELL_RE(.+)<\/td>/) {
            # whole cell on one line
            $comments = $1;
        }
        else {
            # cell spans several lines
            $comments = ($lines[$i] =~ /$COMMENTS_CELL_RE(.+)/) ? $1 : "";
            $comments = collect_until_close(\@lines, \$i, "td", $comments);
        }

        # process comments
        $comments = cleanhtmlchars($comments);
        $comments =~ s/\s+/ /g;
        if ($journal eq "") {
            $journal = find_journal($comments);
        }
    }
}

# append arxiv info to journal entry
$journal = $journal . "($arxiventry)";

# finally, print Bibtex entry to stdout
print("%%% Bibtex entry created with $0\n");
print("\@ARTICLE{$arxiventry,\n");
print(" author = {" . join(" and ", @authors) . "},\n");
print(" title = \"{$title}\",\n");
print(" journal = {$journal},\n");
print(" year = $year,\n");
print(" comments = {$comments}\n");
print("}\n");