Eric Rochester, Scholars’ Lab
bit.ly/erochest-screen-scraping
Read the fine print.
Wikipedia has a nice overview in its Web scraping article.
Looking at you, Flash
And you, PDF
def get(url, params=None):
    """Fetch a web page and return it as a parsed HTML document.

    `url` is requested with `requests.get`, with `params` passed through
    unchanged as its `params` argument. The text of the response is handed
    to `lxml.html.fromstring`, and the document it produces is returned.
    """
    response = requests.get(url, params=params)
    return lxml.html.fromstring(response.text)
def get_countries(base_url, doc):
    """Yield a (text, value) pair for each option of the COUNTRY select.

    `doc` is searched for `select` elements inside a `form`; only those
    whose `name` attribute equals COUNTRY are used. For each `option`
    below such a select, the option's text and its `value` attribute are
    yielded as a 2-tuple.

    `base_url` is accepted but not used by this function.
    """
    for dropdown in doc.cssselect('form select'):
        # Ignore every select whose name is not COUNTRY.
        if dropdown.get('name') != COUNTRY:
            continue
        for choice in dropdown.cssselect('option'):
            label = choice.text
            value = choice.get('value')
            yield (label, value)
def get_country_data(url, country_code, page=0):
    """Yield every table row for one country, walking the pages in order.

    Starting at `page`, each page is fetched with
    `get(url, {PAGE: page, COUNTRY: country_code})`. Every
    `table tbody tr` row found on it is yielded as a tuple holding the
    `.text` of each of the row's `td` cells. Paging stops after the first
    page that contains no such rows.

    The pages are walked with a loop rather than one recursive call per
    page, so the number of pages is not capped by the interpreter's
    recursion limit and rows are not re-yielded through a chain of nested
    generators.
    """
    while True:
        doc = get(url, {PAGE: page, COUNTRY: country_code})

        # Emit the rows of the current page, counting them as we go.
        n = 0
        for table_row in doc.cssselect('table tbody tr'):
            n += 1
            # NOTE(review): `td.text` may be None for a cell whose content
            # starts with a child element -- confirm the cells are plain text.
            yield tuple(td.text for td in table_row.cssselect('td'))

        # A page without rows marks the end of the data.
        if n == 0:
            return
        page += 1
See the full source.