If you are using Python 2.6 or higher, you should get to know the multiprocessing module as soon as possible. It works around the GIL to give true multiprocessing capabilities to Python. Here is an example that shows how to spider sites with several worker processes. The logging module is indispensable for debugging these multiprocess programs.
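
Before the full spider, here is a minimal sketch of the Pool pattern it relies on, using nothing beyond the standard library: apply_async hands a call to a worker process and returns an AsyncResult right away, and get() blocks until that worker has produced its value. The square function and the pool size of four are placeholders for illustration only.

import logging
import sys
from multiprocessing import Pool

def square(n):
    # Runs in a worker process; the process id in the log output shows which one.
    logging.debug('Squaring %d' % n)
    return n * n

def main():
    # Configure logging before creating the Pool; on platforms that fork,
    # the worker processes inherit this configuration.
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG,
                        format='%(levelname)s %(process)d %(message)s')
    pool = Pool(4)  # four worker processes
    # apply_async returns an AsyncResult immediately; get() blocks until the
    # worker has produced the value.
    results = [pool.apply_async(square, (n,)) for n in range(10)]
    print [result.get() for result in results]
    pool.close()
    pool.join()

if __name__ == '__main__':
    main()

The full spider below applies the same apply_async/get pattern, only with URLs instead of integers.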

#!/usr/bin/env python

"""
Spider steam boycott group and tally who followed through and who didn't.
"""

import logging
import urllib2
from cStringIO import StringIO
from multiprocessing import Pool
from lxml.html import parse

def glean_games(url):
    logging.debug('Getting %s' % url)
    doc = parse(urllib2.urlopen(url)).getroot()
    game_elements = doc.cssselect('#mainContents h4')
    return [e.text_content() for e in game_elements]

def glean_users(url=None, html=None):
    if html is None:
        logging.debug('Getting %s' % url)
        doc = parse(urllib2.urlopen(url)).getroot()
    else:
        doc = parse(StringIO(html)).getroot()
    user_links = doc.cssselect(
    'a.linkFriend_offline, a.linkFriend_online, a.linkFriend_in-game')
    return [(link.text_content(), link.attrib['href']) for link in user_links]

def spider(url, pool_size=20):
    logging.debug('Getting %s' % url)
    response = urllib2.urlopen(url)
    html = response.read()  # Necessary for multiprocessing; arguments passed to workers must be picklable
    group_page = parse(StringIO(html)).getroot()
    page_links = group_page.cssselect('.pageLinks a')
    # The next-to-last pagination link points at the last page; its p= value is the page count.
    page_count = page_links[-2].attrib['href'].split('=')[-1]
    urls = ['%s?p=%d' % (url, page) for page in xrange(2, int(page_count) + 1)]

    pool = Pool(pool_size)
    results = []
    # Page 1 is already downloaded, so pass its HTML to a worker directly.
    results.append(pool.apply_async(glean_users, (), {'html': html}))
    results.extend([pool.apply_async(glean_users, (url,)) for url in urls])

    users = []
    for result in results:
        users.extend(result.get())

    logging.info('Found %d users!' % len(users))

    game_results = []
    for username, url in users:
        game_results.append((username, pool.apply_async(glean_games, (url + '/games',))))

    for username, result in game_results:
        yield username, result.get()

def main():
    import sys
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    game = 'Call of Duty: Modern Warfare 2'
    has = []
    has_not = []
    for username, games in spider(
        'http://steamcommunity.com/groups/BOYCOTTMW2/members'):
        if game in games:
            logging.info('%s has %s' % (username, game))
            has.append(username)
        else:
            logging.info('%s has not' % (username))
            has_not.append(username)
    print '%d users have %s and %d do not.' % (len(has), game, len(has_not))

if __name__ == '__main__':
    main()
Posted by Tyler Lesmann on November 12, 2009 at 18:46
Tagged as: lxml multiprocessing python screen_scraping