This is code to extract article metadata and PDF download links for every bioRxiv article listed in a file named biorxiv_dois.txt.
Here’s a sample article page: http://biorxiv.org/content/early/2016/07/22/059899
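The script assumes biorxiv_dois.txt is tab-separated, with the article's DOI in the first column and the path to its Atom feed in the fourth; that layout is inferred from the parsing in the main loop of the script below, and the sample line here is invented, so treat this as a sketch of the expected input rather than a documented format. Each line gets turned into an .article-info URL like so:

# Sketch: how one hypothetical line of biorxiv_dois.txt becomes an article-info URL.
# The column layout is inferred from the main loop below; the field values are made up.
line = "doi.org/10.1101/059899\t-\t-\t/biorxiv/early/2016/07/22/059899.atom"
fields = line.split("\t")
bxDOI = fields[0].replace("doi.org/", "")                           # "10.1101/059899"
atomPath = fields[3].replace(".atom", "").replace("/biorxiv/", "")  # "early/2016/07/22/059899"
infoURL = "http://biorxiv.org/content/" + atomPath + ".article-info"
print(infoURL)  # http://biorxiv.org/content/early/2016/07/22/059899.article-info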
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Script to open, download, and parse every article page on bioRxiv
# specified in the file biorxiv_dois.txt (this should be all of them),
# extract relevant information, write it to a file, and download
# the earliest and latest uploaded preprint PDFs of the paper.

import codecs  # Helps with character encodings
from selenium import webdriver  # Web browser automation tools
from selenium.webdriver.common.keys import Keys  # ditto
from bs4 import BeautifulSoup as bs  # HTML parser
from slugify import slugify  # Turns strings into nice filenames
import pickle  # Used to save data (anything!) to a file and get it back later
import time  # Includes the sleep() command, which pauses for X seconds
import os  # Useful for file system operations
import sys  # Random system functions
import urllib  # Another library for downloading stuff from the web
import operator  # Useful for sorting data in ascending or descending order

# Make sure we're using UTF-8
reload(sys)
sys.setdefaultencoding("utf-8")

# Set to True to re-download all PDFs
downloadPapers = False

# Open some files for output, logging, etc.
allfile = codecs.open("biorxiv_all_dois.txt", 'w', 'utf-8')
outfile = codecs.open("biorxiv_published_dois.txt", 'w', 'utf-8')
smallout = codecs.open("biorxiv_pub_dois.txt", 'w', 'utf-8')
logfile = codecs.open("scrape_log.txt", 'w', 'utf-8')

journalCounts = {}


def cacheQuery(query, forceUncache=False):
    # Function to open, download and cache (pickle) a specified website.
    # If the site is already cached, just return it.
    # If forceUncache is True, re-download the page even if it's already cached.
    print "querying " + query
    queryFile = 'articlePages/' + slugify(query)
    if ((not forceUncache) and os.path.isfile(queryFile)):
        data = pickle.load(open(queryFile, 'rb'))
    else:
        # Play nice
        time.sleep(1)
        # Start the mind-controlled browser
        chromedriver = "/usr/bin/chromedriver"
        os.environ["webdriver.chrome.driver"] = chromedriver
        browser = webdriver.Chrome(chromedriver)
        # Open the page in the browser
        browser.get(query)
        # Get the contents of the page
        data = browser.page_source
        # Save the contents to a file
        pickle.dump(data, open(queryFile, 'wb'))
        # Close the mind-controlled browser
        browser.quit()
    return data


def processArticlePage(page, query, bxDOI):
    # Function to parse an article page, e.g.,
    # http://biorxiv.org/content/early/2016/07/22/059899
    # given the contents of the page
    global outfile, newVersions, publishedDOIs, journalCounts

    # Load the page HTML into the BeautifulSoup parser
    html = bs(page, "lxml")
    # Get the page metadata
    meta = html.find('head')

    # Find the download link for the latest PDF
    downloadLink = "NONE"
    dLink = meta.find('meta', attrs={'name': 'citation_pdf_url'})
    if (dLink is not None):
        downloadLink = dLink.get('content')

    # Find the article abstract text in the page metadata
    abstract = "NONE"
    absElt = meta.find('meta', attrs={'name': 'DC.Description'})
    if (absElt is not None):
        abstract = absElt.get('content').replace("\n", "").strip()

    # Find the article's publication date (if known) in the metadata
    pubDate = "NONE"
    pubElt = meta.find('meta', attrs={'name': 'DC.Date'})
    if (pubElt is not None):
        pubDate = pubElt.get('content')

    content = html.find('section', attrs={'id': 'section-content'})

    # If we can't find the download link in the metadata, search for it in the
    # content section of the page
    if (downloadLink == "NONE"):
        contentPanes = content.find_all('div', attrs={'class': 'pane-content'})
        for pane in contentPanes:
            pLink = pane.find('a')
            if ((pLink is not None) and
                    (pLink.get('href').find('.full.pdf') >= 0)):
                downloadLink = pLink.get('href')

    # Find the publication journal and DOI
    jnlStr = "NONE"
    doiStr = "NONE"
    pub_info = content.find('div', attrs={'class': 'pub_jnl'})
    if (pub_info is not None):
        pubStr = pub_info.text.replace("\n", "").strip()
        if (pubStr.find('doi') > 0):
            publishedDOIs += 1
            pubA = pubStr.split(' doi: ')
            doiStr = pubA[-1]
            jnlStr = pubA[0].replace("Now published in ", "")

    # Parse the version upload history for the paper
    versionsStr = "NONE"
    earlierVersions = []
    versionsPanel = content.find('div', attrs={'class': 'pane-highwire-versions'})
    if (versionsPanel is not None):
        versions = versionsPanel.find_all('a', attrs={'class': 'hw-version-previous-link'})
        newVersions += 1
        for version in versions:
            versionLink = version.get('href')
            earlierVersions.append(versionLink)
        versionsStr = "|".join(earlierVersions)

    # Write all of the parsed data about the article to various output files
    alldata = [bxDOI, pubDate, jnlStr, doiStr, versionsStr]
    allfile.write("\t".join(alldata) + "\n")
    if (jnlStr != "NONE"):
        outdata = [bxDOI, query, downloadLink, pubDate, jnlStr, doiStr, versionsStr, abstract]
        smalldata = [bxDOI, jnlStr, doiStr]
        outfile.write("\t".join(outdata) + "\n")
        smallout.write("\t".join(smalldata) + "\n")
        if (jnlStr in journalCounts):
            journalCounts[jnlStr] += 1
        else:
            journalCounts[jnlStr] = 1

    # Download most recent version as PDF
    if (downloadPapers and (downloadLink != "NONE")):
        latestVersionURL = downloadLink
        localDest = "biorxiv_latest/" + bxDOI.replace('/', '_') + ".pdf"
        if (not os.path.isfile(localDest)):
            try:
                print "Downloading " + latestVersionURL + " into " + localDest
                urllib.urlretrieve(latestVersionURL, localDest)
                time.sleep(10)
            except (urllib.ContentTooShortError, IOError), e:
                print "Error downloading " + latestVersionURL + " " + str(sys.exc_info()) + " - " + repr(e)

    # Download earliest version as PDF
    if (downloadPapers and (versionsStr != "NONE")):
        earliestVersionURL = "http://biorxiv.org" + earlierVersions[0] + ".full.pdf"
        localDest = "biorxiv_earliest/" + bxDOI.replace('/', '_') + ".v1.pdf"
        if (not os.path.isfile(localDest)):
            try:
                print "Downloading " + earliestVersionURL + " into " + localDest
                urllib.urlretrieve(earliestVersionURL, localDest)
                time.sleep(10)
            except (urllib.ContentTooShortError, IOError), e:
                print "Error downloading " + earliestVersionURL + " " + str(sys.exc_info()) + " - " + repr(e)


# THIS IS WHERE THE SCRIPT REALLY BEGINS (I PROMISE)
newVersions = 0
publishedDOIs = 0

with open('biorxiv_dois.txt', 'r') as infile:
    for line in infile:
        # Reconstruct the link to each article page and process it
        lineA = line.split("\t")
        bxDOI = lineA[0].replace('doi.org/', '')
        atomPath = lineA[3].replace('.atom', '').replace('/biorxiv/', '')
        infoURL = 'http://biorxiv.org/content/' + atomPath + '.article-info'
        infoPage = cacheQuery(infoURL)
        processArticlePage(infoPage, infoURL, bxDOI)

# Write summary statistics to the log file
logfile.write(str(newVersions) + " articles have more than one version\n")
logfile.write(str(publishedDOIs) + " articles have publication DOIs\n")

# Sort journals by how many bioRxiv preprints ended up published in each
sortedJournals = sorted(journalCounts.items(), key=operator.itemgetter(1))
sortedJournals.reverse()
logfile.write("JOURNAL COUNTS\n")
for item in sortedJournals:
    logfile.write(str(item[0]) + " " + str(item[1]) + "\n")

logfile.close()
outfile.close()
smallout.close()
allfile.close()
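Note that the script above is Python 2 only (reload(sys)/setdefaultencoding, print statements, urllib.urlretrieve, and the except ..., e syntax). If you want the same page caching under Python 3, a rough substitute for cacheQuery using requests instead of Selenium might look like the sketch below. This is an untested assumption on my part: a plain HTTP fetch only works if the article-info pages render their metadata without JavaScript, which is exactly why the original drives a real browser.

# Sketch of a Python 3 cacheQuery() substitute using requests instead of Selenium.
# Assumption: the article-info pages don't need JavaScript to expose their metadata.
import os
import pickle
import time

import requests              # third-party: pip install requests
from slugify import slugify  # same library the original script uses

def cache_query(query, force_uncache=False):
    """Download `query`, or return the pickled copy cached under articlePages/."""
    os.makedirs("articlePages", exist_ok=True)
    query_file = os.path.join("articlePages", slugify(query))
    if not force_uncache and os.path.isfile(query_file):
        with open(query_file, "rb") as f:
            return pickle.load(f)
    time.sleep(1)  # play nice, as in the original
    response = requests.get(query, timeout=30)
    response.raise_for_status()
    data = response.text
    with open(query_file, "wb") as f:
        pickle.dump(data, f)
    return data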