1 """This is a recursive web crawler. Don't go pointing this at random sites;
2 it doesn't respect robots.txt and it is pretty brutal about how quickly it

This is a kind of "producer/consumer" example; the fetch function produces
jobs, and the GreenPool itself is the consumer, farming out work concurrently.
It's easier to write it this way than to write a standard consumer loop;
GreenPool handles any exceptions raised and arranges for a set number of
"workers", so you don't have to write that tedious management code yourself.
"""
from __future__ import with_statement

# the "green" urllib2 uses cooperative sockets, so concurrent fetches yield
# to each other instead of blocking the whole process
from eventlet.green import urllib2

import eventlet
import re
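
# A minimal sketch of the producer/consumer pattern described in the module
# docstring (illustrative only; nothing in the crawler calls it): GreenPool
# farms each call out to a green thread, capping concurrency at the pool size.
def _greenpool_sketch():
    pool = eventlet.GreenPool(4)  # at most four workers run at once
    for square in pool.imap(lambda n: n * n, range(10)):
        print(square)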

# Based on http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
# (the original uses the POSIX class [:punct:], which Python's re module does
# not support, so an explicit punctuation set is substituted here)
url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^`!()\[\]{};:\'".,<>?\s]|/)))')
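# Illustrative use of the pattern (assumed output, e.g. in a REPL):
#   >>> [m.group(0) for m in url_regex.finditer("see http://eventlet.net/ for docs")]
#   ['http://eventlet.net/']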
23 """Fetch a url and push any urls found into a queue."""
24 print("fetching", url)
26 with eventlet.Timeout(5, False):
27 data = urllib2.urlopen(url).read()
28 for url_match in url_regex.finditer(data):
29 new_url = url_match.group(0)
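
# A standalone sketch of the silent-timeout idiom used above (illustrative):
# with a False exception argument the timeout never raises, so a too-slow
# operation is simply abandoned:
#
#   with eventlet.Timeout(1, False):
#       eventlet.sleep(2)  # blocks past the one-second deadline
#   print("reached with no exception raised")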


def producer(start_url):
    """Recursively crawl starting from *start_url*. Returns a set of
    urls that were found."""
    pool = eventlet.GreenPool()
    seen = set()
    q = eventlet.Queue()
    q.put(start_url)
    # keep looping if there are new urls, or workers that may produce more urls
    while True:
        while not q.empty():
            url = q.get()
            # limit requests to eventlet.net so we don't crash all over the internet
            if url not in seen and 'eventlet.net' in url:
                seen.add(url)
                pool.spawn_n(fetch, url, q)
        # workers may have pushed new urls onto the queue while we waited
        pool.waitall()
        if q.empty():
            break
    return seen


seen = producer("http://eventlet.net")
print("I saw these urls:")
print("\n".join(seen))