#!/usr/bin/python # # Creative Commons Attribution License # http://creativecommons.org/licenses/by/2.5/ # # Trevor Strohman # First release: 20 June 2006 # """ acm_fetch Fetches BibTeX entries for papers stored in the ACM Digital Library. It uses Google to find the ACM citation entry, then scrapes the ACM page to get the BibTeX data. The BibTeX entry can also be added to a BibTeX file of your choice. Requires one of these: Google (note that Google is no longer giving out SOAP keys) pygoogle (pygoogle.sf.net) A valid Google API license key, in ~/.googlekey Yahoo: The Yahoo python development library (part of the SDK) A valid Yahoo API license key, in ~/.yahookey Note that Yahoo's coverage of the ACM digital library isn't very good right now. Example usage: % python acm_fetch.py Learning Relational Probability Trees @inproceedings{956830, author = {Jennifer Neville and David Jensen and Lisa Friedland and Michael Hay}, title = {Learning relational probability trees}, booktitle = {KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining}, year = {2003}, isbn = {1-58113-737-0}, pages = {625--630}, location = {Washington, D.C.}, doi = {http://doi.acm.org/10.1145/956750.956830}, publisher = {ACM Press}, address = {New York, NY, USA}, abstract = {Classification trees are widely used...} local-url = {/Users/trevor/...}, md5 = {...} } % To automatically add the paper to your BibTeX file, make a file called ~/.acmrc with contents like: % cat < ~/.acmrc [acm] bibtex-file = /Users/trevor/Documents/School/Research/tex/biblio.bib paper-dir = /Users/trevor/Documents/School/Papers EOF """ import sys, urllib import md5 import os import ConfigParser # # fetch_page # def fetch_page( url ): # fetch the main page main_handle = urllib.urlopen( url ) main_lines = main_handle.readlines() main_handle.close() return main_lines # # find_line_number # def find_line_number( lines, startNo, data ): for i in range(startNo,len(lines)): if data in lines[i]: return i return -1 # # fetch_bibtex # def fetch_bibtex( main_lines ): bibtexLine = find_line_number( main_lines, 0, "popBibTex" ) urlbib = main_lines[bibtexLine].split("\'")[1] urlbib = "http://portal.acm.org/" + urlbib bibdata = urllib.urlopen( urlbib ).readlines() start = find_line_number( bibdata, 0, " 0 and head[-1] == c: head = head[:-1].strip() else: break return head + ",\n %s = {%s}\n}" % (key, value) # # fetch_abstract # def fetch_abstract( main_lines ): abstract = "" abstractLine = find_line_number( main_lines, 0, "ABSTRACT" ) if abstractLine > 0: endAbstractLine = find_line_number( main_lines, abstractLine, "" ) abstractLines = main_lines[abstractLine+1:endAbstractLine] abstract = "".join( abstractLines ) while abstract.find( '<' ) >= 0: start = abstract.find( '<' ) end = abstract.find( '>', start+1 ) abstract = abstract[:start] + abstract[end+1:] abstract = abstract.strip() return abstract else: return "" # # construct_bibtex # def construct_bibtex( main_lines, hash, localUrl ): abstract = fetch_abstract( main_lines ) bibtex = fetch_bibtex( main_lines ) bibtex = append_bibtex( bibtex, "abstract", abstract ) bibtex = append_bibtex( bibtex, "md5", hash ) bibtex = append_bibtex( bibtex, "local-url", localUrl ) return bibtex # # store_bibtex # def store_bibtex( filename, bibtex ): f = file( filename, "a" ) print >> f, bibtex print >> f f.close() # # fetch_pdf # def fetch_pdf( filename, main_lines ): fulltextLine = find_line_number( main_lines, 0, "FullText" ) fulltexturl = main_lines[fulltextLine].split("HREF=\"")[1].split("\"")[0] fulltexturl = "http://portal.acm.org/" + fulltexturl pdf_handle = urllib.urlopen( fulltexturl ) data = pdf_handle.read() pdf_handle.close() f = file(filename, "w") f.write(data) f.close() return (md5.new(data).hexdigest(), filename) # # do_config # def do_config( ): result = {} config = ConfigParser.SafeConfigParser() config.read( os.path.expanduser('~/.acmrc') ) try: result["paper-dir"] = config.get( "acm", "paper-dir" ) except: pass try: result["bibtex-file"] = config.get( "acm", "bibtex-file" ) except: pass return result # # process_result # def process_result( url ): config = do_config() lines = fetch_page( url ) hash = None filename = None id = url.split( '=' )[1].strip() if "paper-dir" in config: print "Fetching paper" (hash, filename) = fetch_pdf( config['paper-dir'] + "/acm-%s.pdf" % id, lines ) bibtex = construct_bibtex( lines, hash, filename ) print bibtex if "bibtex-file" in config: store_bibtex( config["bibtex-file"], bibtex ) # # paper_disambiguate # def paper_disambiguate( possible ): if len(possible) == 0: return None elif len(possible) == 1: return possible[0][1] if len(possible) > 1: for i in range(0, len(possible)): print u"%d: %s" % (i, filter(lambda x: ord(x)<128, possible[i][0])) index = int(raw_input( "Which one? " )) return possible[index][1] # # find_paper_google # def find_paper_google( query ): data = google.doGoogleSearch( query + " site:portal.acm.org" ) results = data.results possible = [] for res in results: if "portal.acm.org" in res.URL and "citation.cfm" in res.URL: possible.append( (res.title, res.URL) ) return paper_disambiguate( possible ) # # find_paper_yahoo # def find_paper_yahoo( query ): yahoo_id = file(os.path.expanduser('~/.yahookey')).read().strip() query += " site:portal.acm.org" print query search = create_search("Web", yahoo_id, query=query, results=50) dom = search.get_results() results = search.parse_results(dom) possible = [] for res in results: if "portal.acm.org" in res.Url and "citation.cfm" in res.Url: possible.append( (res.Title, res.Url) ) return paper_disambiguate( possible ) # # main function # if __name__ == "__main__": quote = '"' query = quote + (quote + " " + quote).join(sys.argv[1:]) + quote url = None if os.path.exists( os.path.expanduser('~/.googlekey') ): import google url = find_paper_google( query ) elif os.path.exists( os.path.expanduser('~/.yahookey') ): from yahoo.search.webservices import create_search url = find_paper_yahoo( query ) else: print "No search API key found!" process_result(url)