Marcin Benke
20 marca 2015
virtualenv dir
cd dir
source bin/activate
pip install beautifulsoup beautifulsoup4 django django-debug-toolbar
import urllib
# Fetch the page; the try/finally guarantees the connection is released even
# when read() fails half-way through.
response = urllib.urlopen('http://example.com')
try:
    pageText = response.read()
finally:
    response.close()
# Parenthesised single-argument form prints exactly the same under Python 2.
print(pageText)
>>> urllib.urlencode((('k1','v1'),('k2','v2')))
'k1=v1&k2=v2'
>>> urllib.quote_plus('<ala ma kota&psa>')
'%3Cala+ma+kota%26psa%3E'
Nagłówki, uwierzytelnianie, ciasteczka, ...
def urlopen(url):
    """Open *url* through urllib2 with a fixed set of request headers.

    The request identifies itself with ``USER_AGENT``, advertises gzip
    support (so the body may come back compressed and the caller has to
    check ``Content-Encoding``), and asks the server to close the
    connection after the response.

    Returns the response object produced by ``urllib2.urlopen``; reading
    and closing it is left to the caller.  Exceptions raised by urllib2
    propagate unchanged.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', USER_AGENT)
    request.add_header('Accept-encoding', 'gzip')
    request.add_header('Connection', 'close')
    response = urllib2.urlopen(request)
    return response
# Fragment of a page-fetching routine: the enclosing `def` is not part of this
# excerpt and the original indentation was lost, so the statements are kept
# exactly as they appear.
# Flow: open the URL, decompress the body when the response declares
# Content-Encoding: gzip, close the response and return the page text.
try:
# NOTE(review): stoperFetch is defined elsewhere; presumably a timer - confirm.
stoperFetch.start()
response = urlopen(url)
# Only a response explicitly marked as gzip is decompressed; anything else is
# read verbatim.
if response.headers.get('content-encoding', ) == 'gzip':
compressed = response.read()
# GzipFile needs a file-like object, hence the StringIO wrapper around the bytes.
pageText = gzip.GzipFile(fileobj=StringIO(compressed)).read()
response.close()
else:
pageText = response.read()
response.close()
return pageText
except urllib2.HTTPError, e:
# A 503 is logged together with the response headers, its body is printed,
# and it is re-raised as HTTPForbidden.
if e.getcode() == 503:
warn('HTTPError: 503')
warn(unicode(e.info()))
print e.read()
raise HTTPForbidden
else:
# NOTE(review): every other HTTP error is only logged; nothing is re-raised
# in the visible lines.
warn('HTTPError')
Beautiful Soup: We called him Tortoise because he taught us.
"You didn't write that awful page. You're just trying to get some data out of it. Beautiful Soup is here to help. Since 2004, it's been saving programmers hours or days of work on quick-turnaround screen scraping projects."
# Sample document parsed by the Beautiful Soup examples. Note that the
# <body> and <html> elements are never closed in this literal.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were
three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to
# be installed and warns that no parser was specified.
soup = BeautifulSoup(html, 'html.parser')
soup.title
# <title>The Dormouse's story</title>
soup.title.name
# u'title'
soup.title.string
# u'The Dormouse's story'
soup.title.parent.name
# u'head'
soup.p
# <p class="title"><b>The Dormouse's story</b></p>
soup.p['class']
# [u'title']  (class is a multi-valued attribute, so bs4 returns a list)
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# For every element with class "col5", look up its first descendant with
# class "link1" (the closing parenthesis of find() was missing).
[a.find(class_="link1") for a in soup.find_all(class_='col5')]
# Raw string so that \d reaches the regex engine instead of being parsed as an
# (invalid) string escape. Matches text that starts with "pa" followed by at
# least one digit.
RE_PA = re.compile(r'^pa\d+')
# Explicit parser keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, 'html.parser')
# find_all is the bs4 spelling (findAll is only kept as a legacy alias);
# passing a compiled pattern selects <a> tags whose id matches RE_PA.
nodes = soup.find_all('a', id=RE_PA)
# Sort each child node into the `ad` record: bare text nodes are collected
# under 'lines', and a <span> tag supplies the 'displayurl' entry. Tags other
# than <span> are not handled by the lines visible here.
for node in contents:
    if isinstance(node, NavigableString):
        ad['lines'].append(node)
    elif isinstance(node, Tag):
        if node.name == 'span':
            ad['displayurl'] = concatText(node.contents)