This is code to extract article metadata and PDF download links for the bioRxiv articles listed in a file named biorxiv_dois.txt.

Here’s a sample article page: http://biorxiv.org/content/early/2016/07/22/059899
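
The script expects each line of biorxiv_dois.txt to be tab-separated, with the article's DOI in the first field and its .atom path in the fourth (the fields in between aren't used). A hypothetical line, inferred from the parsing loop at the bottom of the script and from the sample article above, might look like:

doi.org/10.1101/059899 <tab> … <tab> … <tab> /biorxiv/early/2016/07/22/059899.atom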

#!/usr/bin/python
# -*- coding: utf-8 -*-

# Script to open, download, and parse every article page on bioRxiv
# specified in the file biorxiv_dois.txt (this should be all of them),
# extract relevant information, write it to a file, and download
# the earliest and latest uploaded preprint PDFs of the paper.

import codecs # Helps with character encodings
from selenium import webdriver # Web browser automation tools
from selenium.webdriver.common.keys import Keys # ditto
from bs4 import BeautifulSoup as bs # HTML parser
from slugify import slugify # Turns strings into nice filenames
import pickle # Used to save data (anything!) to a file and get it back later
import time # Includes the sleep() command, which pauses for X seconds
import os # Useful for file system operations
import sys # Random system functions
import urllib # Another library for downloading stuff from the web
import operator # Useful for sorting data in ascending or descending order

# Make sure we're using UTF-8 by default (Python 2 workaround)
reload(sys)
sys.setdefaultencoding("utf-8")

# Set to True to download the PDFs (files already on disk are skipped)
downloadPapers = False

# Open some files for output, logging, etc.
allfile = codecs.open("biorxiv_all_dois.txt", 'w', 'utf-8')
outfile = codecs.open("biorxiv_published_dois.txt", 'w', 'utf-8')
smallout = codecs.open("biorxiv_pub_dois.txt", 'w', 'utf-8')
logfile = codecs.open("scrape_log.txt", 'w', 'utf-8')

journalCounts = {}

def cacheQuery(query, forceUncache=False):
  """Open, download, and cache (pickle) a specified web page.

  If the page is already cached, just return the cached copy.
  If forceUncache is True, re-download the page even if it's already cached.
  """

  print "querying " + query
  queryFile = 'articlePages/' + slugify(query)
  if ((not forceUncache) and os.path.isfile(queryFile)):
    data = pickle.load(open(queryFile, 'rb'))
  else:
    # Play nice
    time.sleep(1)
    # Start the mind-controlled browser
    chromedriver = "/usr/bin/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    browser = webdriver.Chrome(chromedriver)
    # Open the page in the browser
    browser.get(query)
    # Get the contents of the page
    data = browser.page_source
    # Save the contents to a file
    pickle.dump(data, open(queryFile, 'wb'))
    # Close the mind-controlled browser
    browser.quit()
  return data
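
# Example use of cacheQuery (illustrative only, not executed here; the URL
# pattern matches the sample article page linked above):
#   page = cacheQuery('http://biorxiv.org/content/early/2016/07/22/059899.article-info')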

def processArticlePage(page, query, bxDOI):
  """Parse an article page, given the contents of the page, e.g.,
  http://biorxiv.org/content/early/2016/07/22/059899
  """

  global outfile, newVersions, publishedDOIs, journalCounts
 
  # Load the page HTML into the BeautifulSoup parser
  html = bs(page, "lxml")

  # Get the page metadata
  meta = html.find('head')

  # Find the download link for the latest PDF
  downloadLink = "NONE"
  dLink = meta.find('meta', attrs={'name': 'citation_pdf_url'})
  if (dLink is not None):
    downloadLink = dLink.get('content')

  # Find the article abstract text in the page metadata
  abstract = "NONE"
  absElt = meta.find('meta', attrs={'name': 'DC.Description'})
  if (absElt is not None):
    abstract = absElt.get('content').replace("\n", "").strip()

  # Find the article's publication date (if known) in the metadata
  pubDate = "NONE"
  pubElt = meta.find('meta', attrs={'name': 'DC.Date'})
  if (pubElt is not None):
    pubDate = pubElt.get('content')

  content = html.find('section', attrs={'id': 'section-content'})

  # If we can't find the download link in the metadata, search for it in the
  # content section of the page
  if (downloadLink == "NONE"):
    contentPanes = content.find_all('div', attrs={'class': 'pane-content'})
    for pane in contentPanes:
      pLink = pane.find('a')
      if ((pLink is not None) and (pLink.get('href').find('.full.pdf') >= 0)):
        downloadLink = pLink.get('href')

  # Find the publication journal and DOI
  jnlStr = "NONE"
  doiStr = "NONE"
  pub_info = content.find('div', attrs={'class': 'pub_jnl'})
  if (pub_info is not None):
    pubStr = pub_info.text.replace("\n", "").strip()
    if (pubStr.find('doi') > 0):
      publishedDOIs += 1
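      # The publication line is assumed to look roughly like
      # "Now published in <journal name> doi: <DOI>" (illustrative format,
      # inferred from the split and replace calls below).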
      pubA = pubStr.split(' doi: ')
      doiStr = pubA[-1]
      jnlStr = pubA[0].replace("Now published in ", "")

  # Parse the version upload history for the paper
  versionsStr = "NONE"
  earlierVersions = []
  versionsPanel = content.find('div', attrs={'class': 'pane-highwire-versions'})
  if (versionsPanel is not None):
    versions = versionsPanel.find_all('a', attrs={'class': 'hw-version-previous-link'}) 
    newVersions += 1
    for version in versions:
      versionLink = version.get('href')
      earlierVersions.append(versionLink)
    versionsStr = "|".join(earlierVersions)

  # Write all of the parsed data about the article to various output files
  alldata = [bxDOI, pubDate, jnlStr, doiStr, versionsStr]
  allfile.write("\t".join(alldata) + "\n")
  if (jnlStr != "NONE"):
    outdata = [bxDOI, query, downloadLink, pubDate, jnlStr, doiStr, versionsStr, abstract]
    smalldata = [bxDOI, jnlStr, doiStr]
    outfile.write("\t".join(outdata) + "\n")
    smallout.write("\t".join(smalldata) + "\n")

    if (jnlStr in journalCounts):
      journalCounts[jnlStr] += 1
    else:
      journalCounts[jnlStr] = 1

  # Download most recent version as PDF
  if (downloadPapers and (downloadLink != "NONE")):
    latestVersionURL = downloadLink
    localDest = "biorxiv_latest/" + bxDOI.replace('/', '_') + ".pdf"
    if (not os.path.isfile(localDest)):
      try:
        print "Downloading " + latestVersionURL + " into " + localDest
        urllib.urlretrieve(latestVersionURL, localDest)
        time.sleep(10)
      except (urllib.ContentTooShortError, IOError), e:
        print "Error downloading " + latestVersionURL + " " + str(sys.exc_info()) + " - " + repr(e)

  # Download earliest version as PDF
  if (downloadPapers and (versionsStr != "NONE")):
    earliestVersionURL = "http://biorxiv.org" + earlierVersions[0] + ".full.pdf"
    localDest = "biorxiv_earliest/" + bxDOI.replace('/', '_') + ".v1.pdf"
    if (not os.path.isfile(localDest)):
      try:
        print "Downloading " + earliestVersionURL + " into " + localDest
        urllib.urlretrieve(earliestVersionURL, localDest)
        time.sleep(10)
      except (urllib.ContentTooShortError, IOError), e:
        print "Error downloading " + earliestVersionURL + " " + str(sys.exc_info()) + " - " + repr(e)

# THIS IS WHERE THE SCRIPT REALLY BEGINS (I PROMISE)

newVersions = 0
publishedDOIs = 0

with open('biorxiv_dois.txt', 'r') as infile:
  for line in infile:
    # Reconstruct the link to each article page and process it
    lineA = line.split("\t")
    bxDOI = lineA[0].replace('doi.org/', '')
    atomPath = lineA[3].replace('.atom', '').replace('/biorxiv/', '')
    infoURL = 'http://biorxiv.org/content/' + atomPath + '.article-info'
    infoPage = cacheQuery(infoURL)
    processArticlePage(infoPage, infoURL, bxDOI)

logfile.write(str(newVersions) + " articles have more than one version\n")
logfile.write(str(publishedDOIs) + " articles have publication DOIs\n")

sortedJournals = sorted(journalCounts.items(), key=operator.itemgetter(1))
sortedJournals.reverse()

logfile.write("JOURNAL COUNTS\n")
for item in sortedJournals:
  logfile.write(str(item[0]) + " " + str(item[1]) + "\n")

logfile.close()
outfile.close()
smallout.close()
allfile.close()
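
The script assumes the directories it writes into already exist. A minimal, separate sketch (not part of the script above) to create them beforehand, using the same directory names that appear in the code:

# Create the output directories the scraper expects, if they're missing.
import os

for d in ('articlePages', 'biorxiv_latest', 'biorxiv_earliest'):
  if not os.path.isdir(d):
    os.makedirs(d)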