virtualenv

# Create an isolated Python environment in the directory "dir".
virtualenv dir
cd dir
# Activate the environment for the current shell session.
source bin/activate
# Install the packages into the active environment.
pip install beautifulsoup beautifulsoup4 django django-debug-toolbar

urllib

import urllib

response = urllib.urlopen('http://example.com')
pageText = response.read()
response.close()
print pageText

urlencode, quote

>>> urllib.urlencode((('k1','v1'),('k2','v2')))
'k1=v1&k2=v2'
>>> urllib.quote_plus('<ala ma kota&psa>')
'%3Cala+ma+kota%26psa%3E'

urllib2

Nagłówki, uwierzytelnianie, ciasteczka, ...

def urlopen(url):
    """Open ``url`` through urllib2 with a fixed set of request headers.

    The request carries the module-level USER_AGENT, an ``Accept-encoding:
    gzip`` header and ``Connection: close``.  The object returned by
    ``urllib2.urlopen`` is passed back to the caller unchanged.
    """
    req = urllib2.Request(url)
    for header_name, header_value in (
        ('User-Agent', USER_AGENT),
        ('Accept-encoding', 'gzip'),
        ('Connection', 'close'),
    ):
        req.add_header(header_name, header_value)
    return urllib2.urlopen(req)

fetcher

    try:
      stoperFetch.start()
      response = urlopen(url)
      if response.headers.get('content-encoding', ) == 'gzip':
          compressed = response.read()
          pageText = gzip.GzipFile(fileobj=StringIO(compressed)).read()
          response.close()
      else:
          pageText = response.read()
          response.close()
      return pageText
    except urllib2.HTTPError, e:
        if e.getcode() == 503:
            warn('HTTPError: 503')
            warn(unicode(e.info()))
            print e.read()
            raise HTTPForbidden
        else:
            warn('HTTPError')

Ćwiczenia

Ściągnąć z PKW stronę z listą gmin http://prezydent2010.pkw.gov.pl/PZT/PL/WYN/W/index.htm
Dla chętnych: ściągnąć z Google stronę wyszukiwania tak, żeby dało się odczytać reklamy (używając kompresji gzip)

Beautifulsoup

Beautiful Soup: We called him Tortoise because he taught us.

``You didn't write that awful page. You're just trying to get some data out of it. Beautiful Soup is here to help. Since 2004, it's been saving programmers hours or days of work on quick-turnaround screen scraping projects.''

Alice

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were
three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
# Name the parser explicitly so the resulting tree does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
soup = BeautifulSoup(html, 'html.parser')

Nawigacja

soup.title
# <title>The Dormouse's story</title>

soup.title.name
# u'title'

soup.title.string
# u'The Dormouse's story'

soup.title.parent.name
# u'head'

soup.p
# <p class="title"><b>The Dormouse's story</b></p>

soup.p['class']
# [u'title']   (class is a multi-valued attribute, so bs4 returns a list)

soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

Wyszukiwanie

soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

Jeszcze przykłady

# For every element with class "col5", look up its first descendant with
# class "link1" (None when there is no such descendant).
[a.find(class_="link1") for a in soup.find_all(class_='col5')]

  # A raw string passes the backslash in \d to the regex engine untouched
  # instead of relying on Python leaving an unknown escape alone.
  RE_PA = re.compile(r'^pa\d+')
  soup = BeautifulSoup(html)
  # Collect every <a> element whose id attribute matches RE_PA.
  nodes = soup.findAll('a',id=RE_PA)

  # Sort the child nodes of an ad: plain text nodes are appended to the ad's
  # lines, and a <span> element provides the display URL.
  for child in contents:
    if isinstance(child, NavigableString):
      ad['lines'].append(child)
      continue
    if isinstance(child, Tag) and child.name == 'span':
      ad['displayurl'] = concatText(child.contents)

Ćwiczenia

W ściągniętej stronie PKW znaleźć linki do województw
w województwie do powiatów
w powiecie do gmin
Dla chętnych: w stronie wyszukiwania z Google znaleźć reklamy
dla każdej reklamy wypisać: tytuł, treść i URL celu

Aplikacje WWW

virtualenv

urllib

urlencode, quote

urllib2

fetcher

Ćwiczenia

Beautifulsoup

Alice

Nawigacja

Wyszukiwanie

Jeszcze przykłady

Ćwiczenia