I found a blog post today that gleans the names and messages from Twitter search results. As an exercise, I decided to rewrite it using mechanize and lxml. My code writes to standard output instead of a file; the user can redirect the output for the same effect. Note: I am aware that Twitter offers JSON and several APIs, and that using those would be easier than this. This is an exercise.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
import getopt                     
import sys                        
from mechanize import Browser, _mechanize
from lxml.html import parse              

# Twitter search page URL (English results); search_twitter() appends the
# '+'-joined search terms to it as the value of the q= parameter.
baseurl = "http://search.twitter.com/search?lang=en&q="

def search_twitter(terms, pages=1):
    """
    Scrape the Twitter search results page for the given terms.

    terms = a list of search terms
    pages(optional) = maximum number of result pages to retrieve

    returns a list of dictionaries, one per message, each with the
    keys 'name' and 'text'
    """
    # quote_plus lives in urllib on Python 2 and urllib.parse on Python 3.
    try:
        from urllib import quote_plus
    except ImportError:
        from urllib.parse import quote_plus

    results = []
    if pages <= 0:
        # Nothing was requested, so do not hit the network at all.
        return results

    # Percent-encode every term so characters such as '#', '&' or spaces
    # cannot truncate or alter the query string.
    encoded_terms = []
    for term in terms:
        if isinstance(term, type(u'')):
            # Text is encoded to UTF-8 bytes first; quote_plus on Python 2
            # cannot handle non-ASCII unicode objects directly.
            term = term.encode('utf-8')
        encoded_terms.append(quote_plus(term))

    br = Browser()
    br.set_handle_robots(False)
    response = br.open(baseurl + "+".join(encoded_terms))
    while True:
        doc = parse(response).getroot()
        for msg in doc.cssselect('div.msg'):
            # The first link in a message holds the author name and the
            # first span holds the message body.
            name = msg.cssselect('a')[0].text_content()
            text = msg.cssselect('span')[0].text_content()
            text = text.replace(' (expand)', '')
            results.append({
                'name': name,
                'text': text,
            })
        pages -= 1
        if pages <= 0:
            # Stop before following the link: fetching a further page that
            # is never parsed would be a wasted request.
            break
        try:
            response = br.follow_link(text='Older')
        except _mechanize.LinkNotFoundError:
            break  # No more pages :(
    return results

def main(argv=None):
    """
    Command line entry point: search Twitter and print the results.

    argv(optional) = list of command line arguments without the program
                     name; defaults to sys.argv[1:]

    returns the process exit status: 0 on success, 1 when the usage text
    was shown because of missing or invalid arguments
    """
    if argv is None:
        argv = sys.argv[1:]
    usage = """
        Usage: %s [-p] [--pages] search terms
            -p, --pages = number of pages to retrieve
        """ % sys.argv[0]

    try:
        optlist, args = getopt.getopt(argv, 'p:', ['pages='])
    except getopt.GetoptError as err:
        # Unknown option or missing option value: explain instead of
        # dying with a traceback.
        sys.stderr.write("%s\n" % err)
        print(usage)
        return 1

    optd = dict(optlist)
    # The long option takes precedence over the short one when both are given.
    raw_pages = optd.get('--pages', optd.get('-p', '1'))
    try:
        pages = int(raw_pages)
    except ValueError:
        sys.stderr.write("invalid number of pages: %r\n" % raw_pages)
        print(usage)
        return 1

    if len(args) < 1:
        print(usage)
        return 1

    for result in search_twitter(args, pages):
        line = "%(name)-20s%(text)s" % result
        if not isinstance(line, str):
            # Python 2 only: a unicode line cannot be printed to a
            # redirected stdout (its encoding is None there, so ASCII is
            # assumed); encode it explicitly.
            line = line.encode(sys.stdout.encoding or 'utf-8', 'replace')
        print(line)
    return 0


if __name__ == '__main__':
    raise SystemExit(main())
Posted by Tyler Lesmann on January 14, 2009 at 15:16
Tagged as: lxml mechanize python screen_scraping
Post a comment