1 """This is a recursive web crawler. Don't go pointing this at random sites;
2 it doesn't respect robots.txt and it is pretty brutal about how quickly it
5 The code for this is very short; this is perhaps a good indication
6 that this is making the most effective use of the primitves at hand.
7 The fetch function does all the work of making http requests,
8 searching for new urls, and dispatching new fetches. The GreenPool
9 acts as sort of a job coordinator (and concurrency controller of
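
# A rough sketch of the GreenPool pattern this script relies on; `handle`
# and `work_items` below are hypothetical names, not part of the crawler.
# spawn_n() schedules a greenthread and returns immediately, waitall()
# blocks until every spawned job has finished, and passing a size
# (e.g. GreenPool(100)) caps how many jobs run concurrently (the default
# is 1000).
#
#     pool = eventlet.GreenPool(100)
#     for item in work_items:
#         pool.spawn_n(handle, item)
#     pool.waitall()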

from __future__ import with_statement

from eventlet.green import urllib2
import eventlet
import re

# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')


def fetch(url, seen, pool):
    """Fetch a url, stick any found urls into the seen set, and
    dispatch any new ones to the pool."""
    print("fetching", url)
    data = ''
    with eventlet.Timeout(5, False):
        # if the fetch takes longer than 5 seconds, give up silently;
        # data stays empty and no new urls are dispatched for this page
        data = urllib2.urlopen(url).read()
    for url_match in url_regex.finditer(data):
        new_url = url_match.group(0)
        # only send requests to eventlet.net so as not to destroy the internet
        if new_url not in seen and 'eventlet.net' in new_url:
            seen.add(new_url)
            # while this seems stack-recursive, it's actually not:
            # spawned greenthreads start their own stacks
            pool.spawn_n(fetch, new_url, seen, pool)
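

# Design note: spawn_n() is used rather than spawn() because the crawler
# never inspects a fetch's return value; results are accumulated in the
# shared `seen` set instead. spawn_n() returns None (there is no GreenThread
# object holding results or exceptions), which makes it a bit cheaper per job.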
40 """Recursively crawl starting from *start_url*. Returns a set of
41 urls that were found."""
42 pool = eventlet.GreenPool()
44 fetch(start_url, seen, pool)

seen = crawl("http://eventlet.net")
print("I saw these urls:")
print("\n".join(seen))