I found a blog post today that gleans the names and messages from the Twitter search results. As an exercise, I decided to rewrite it using mechanize and lxml. My code writes to standard output instead of a file; the user can redirect the output for the same effect. Note: I am aware that Twitter offers JSON, plus several APIs, and that using those would be easier than this. This is an exercise.
#!/usr/bin/env python
"""Scrape user names and messages from the Twitter search results page.

Results are written to standard output, one message per line.
"""
import getopt
import sys

try:
    from urllib.parse import quote_plus  # Python 3
except ImportError:
    from urllib import quote_plus  # Python 2

from mechanize import Browser, _mechanize
from lxml.html import parse

baseurl = "http://search.twitter.com/search?lang=en&q="

USAGE = """
Usage: %s [-p] [--pages] search terms
  -p, --pages = number of pages to retrieve
"""


def search_twitter(terms, pages=1):
    """
    Fetch search results and return the name and text of each message.

    terms = a list of search terms
    pages(optional) = number of pages to retrieve

    returns a list of dictionaries with the keys 'name' and 'text'
    """
    br = Browser()
    br.set_handle_robots(False)
    results = []

    # Encode every term so characters such as '#', '&' or spaces cannot
    # corrupt the query string; terms are still separated by '+'.
    query = "+".join(quote_plus(term) for term in terms)
    response = br.open(baseurl + query)

    while pages > 0:
        doc = parse(response).getroot()
        for msg in doc.cssselect('div.msg'):
            links = msg.cssselect('a')
            spans = msg.cssselect('span')
            if not links or not spans:
                # Skip result entries that lack a name link or a text span
                # instead of failing the whole search with an IndexError.
                continue
            text = spans[0].text_content().replace(' (expand)', '')
            results.append({
                'name': links[0].text_content(),
                'text': text,
            })

        pages -= 1
        if pages <= 0:
            # Enough pages collected: do not request a page nobody will read.
            break
        try:
            response = br.follow_link(text='Older')
        except _mechanize.LinkNotFoundError:
            break  # No more pages :(

    return results


def _exit_with_usage(program):
    """Print the usage text for *program* and exit with status 1."""
    print(USAGE % program)
    raise SystemExit(1)


def main(argv=None):
    """Parse the command line, run the search and print every result.

    argv(optional) = argument list including the program name;
                     defaults to sys.argv
    """
    if argv is None:
        argv = sys.argv

    try:
        optlist, args = getopt.getopt(argv[1:], 'p:', ['pages='])
    except getopt.GetoptError:
        # Unknown option or missing option value.
        _exit_with_usage(argv[0])

    optd = dict(optlist)
    # The long option takes precedence over the short one when both are given.
    raw_pages = optd.get('--pages', optd.get('-p', '1'))
    try:
        pages = int(raw_pages)
    except ValueError:
        _exit_with_usage(argv[0])

    if len(args) < 1:
        _exit_with_usage(argv[0])

    for result in search_twitter(args, pages):
        print("%(name)-20s%(text)s" % result)


if __name__ == '__main__':
    main()
Posted by
on January 14, 2009
at 15:16
