Scrapy Tutorial
Install Scrapy
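If Scrapy is not installed yet, it is available from PyPI and can be installed with pip:
pip install scrapy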
Setting up the project
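A new Scrapy project can be created with the startproject command; the project name used below is just an example:
scrapy startproject ucla
cd ucla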
Creating the Spider
Run this snippet in your command line/terminal to create a spider called “ucla_scraper”:
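Assuming the project was set up with scrapy startproject as above, the usual generator command is (the ucla.edu argument simply pre-fills allowed_domains and can be changed later):
scrapy genspider ucla_scraper ucla.edu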
Next, open the file that was created in the “spiders” folder; it should be called “ucla_scraper.py”.
Feel free to copy and paste the following code, based on a data-blogger tutorial (https://www.data-blogger.com/2016/08/18/scraping-a-website-with-python-scrapy/):
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider


class ScraperItem(scrapy.Item):
    # The source URL
    url_from = scrapy.Field()
    # The destination URL
    url_to = scrapy.Field()


class UclaScraperSpider(CrawlSpider):
    name = "ucla_scraper"
    allowed_domains = ["ucla.edu"]
    start_urls = ['http://www.ucla.edu/students/current-students']
    custom_settings = {
        'DEPTH_LIMIT': 2,
    }

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_items method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_items"
        )
    ]
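
    # Note: CrawlSpider applies the rule above to every page it crawls, so the
    # spider keeps following links until the DEPTH_LIMIT set in custom_settings
    # is reached.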

    # Method which starts the requests by visiting all URLs specified in start_urls
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    # Method for parsing items found on a crawled page
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:
            # Check whether the link's URL falls inside one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = ScraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)
        # Return all the found items
        return items
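A quick note on the domain check above: allowed_domain in link.url is a loose substring match, so any URL that merely contains the text “ucla.edu” somewhere in it would pass. If you want a stricter check, one option is to compare hostnames instead; the helper below is only a sketch (it is not part of the tutorial code) and uses Python's standard urllib:

from urllib.parse import urlparse

def in_allowed_domains(url, allowed_domains):
    # Compare the hostname itself rather than the raw URL string
    host = urlparse(url).netloc.lower()
    return any(host == d or host.endswith("." + d) for d in allowed_domains)

You could then set is_allowed = in_allowed_domains(link.url, self.allowed_domains) in parse_items instead of the inner loop.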
Execute the Spider!
Run the spider from the command line in the parent directory of the spiders folder as follows:
scrapy crawl ucla_scraper -o links.csv -t csv
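If you would rather launch the crawl from a Python script than from the scrapy command, the sketch below shows one way to do it. It assumes Scrapy 2.1 or newer (for the FEEDS setting) and a project package named ucla; adjust the import path to your own layout:

# run_crawl.py -- a minimal sketch, not part of the original tutorial
from scrapy.crawler import CrawlerProcess

from ucla.spiders.ucla_scraper import UclaScraperSpider  # hypothetical module path

process = CrawlerProcess(settings={
    # Rough equivalent of "-o links.csv -t csv": export scraped items as a CSV feed
    'FEEDS': {'links.csv': {'format': 'csv'}},
})
process.crawl(UclaScraperSpider)
process.start()  # blocks until the crawl finishes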
Done!