Notes: Python web scraping
http://nbviewer.ipython.org/github/cs109/content/blob/master/lec_04_scraping.ipynb
GET request
import requests
from pattern import web
from bs4 import BeautifulSoup
# IMDB title search: feature films from 1950-2012, sorted by vote count (descending).
url = 'http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2012'
r = requests.get(url)
# Show the URL of the response; the call form of print works on Python 2 and 3.
print(r.url)
GET request with a custom header (Twitch)
# Extra HTTP header sent with the request (an Accept header naming a Twitch v2 JSON media type).
headers = {'Accept': 'application/vnd.twitchtv.v2+json'}
# Streams endpoint filtered by game; the game name is URL-encoded
# ("Hearthstone: Heroes of Warcraft", with %3A for ':' and %20 for spaces).
hs_url = 'https://api.twitch.tv/kraken/streams?game=Hearthstone%3A%20Heroes%20of%20Warcraft'
# NOTE: this rebinds r, replacing any earlier response stored under the same name.
r = requests.get(hs_url, headers = headers)
Get the soup (download the HTML with urllib2 and parse it)
from contextlib import closing

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3 location of the same function

# Download the page and close the connection as soon as the body has been
# read, instead of leaving the response object open.
with closing(urlopen(url)) as response:
    html = response.read()
# Parse the downloaded markup into a BeautifulSoup tree.
soup = BeautifulSoup(html)
Using bs4 (method)
# Parse the IMDB results page that was fetched with requests.
# NOTE: r must be the response for `url`; the Twitch example rebinds r, so
# re-run r = requests.get(url) first if that snippet was executed in between.
bs = BeautifulSoup(r.text)
# A string as the second argument matches on CSS class: <td class="title">.
for movie in bs.find_all('td', 'title'):
    # The first child of the first link in the cell is the movie title.
    title = movie.find('a').contents[0]
    # One link per genre inside <span class="genre">.
    genres = [g.contents[0] for g in movie.find('span', 'genre').find_all('a')]
    runtime = movie.find('span', 'runtime').contents[0]
    rating = movie.find('span', 'value').contents[0]
    # A single pre-formatted argument prints the same on Python 2 and 3.
    print('%s %s %s %s' % (title, genres, runtime, rating))
soup.find_all
find_all can match on tag name, attributes, and text/string content
import re

# Each call returns a list of matching tags from the parsed document `soup`.
soup.find_all('b')                # every <b> tag
soup.find_all(re.compile("^b"))   # tags whose name starts with "b" (e.g. <b>, <body>)
soup.find_all("title")            # every <title> tag
soup.find_all("p", "title")       # <p> tags with CSS class "title"
soup.find_all(id="jlj")           # tags whose id attribute is "jlj"
Find all URLs
# Print the href attribute of every <a> tag (None when a tag has no href).
for link in soup.find_all('a'):
    print(link.get('href'))
All text
# Print the text content of the parsed document `soup`, without the markup.
print(soup.get_text())
JSON
import json
# Request the Twitch streams endpoint again, using the hs_url and headers defined earlier.
r = requests.get(hs_url, headers = headers)
# r.json() decodes the JSON response body; keep only the value stored under 'streams'.
rjson = r.json()['streams']
Published
02 January 2014
Modified
3 August 2014