--- /dev/null
+"""This is a recursive web crawler. Don't go pointing this at random sites;
+it doesn't respect robots.txt and it is pretty brutal about how quickly it
+fetches pages.
+
+The code for this is quite short; that is perhaps a good indication
+that it makes effective use of the primitives at hand.
+The fetch function does all the work of making http requests,
+searching for new urls, and dispatching new fetches. The GreenPool
+acts as a sort of job coordinator (and, of course, as the concurrency
+controller).
+"""
+from __future__ import print_function, with_statement
+
+from eventlet.green import urllib2
+import eventlet
+import re
+
+# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
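+# note: Python's re module does not support POSIX classes, so the "[:punct:]"
+# below is read as a set of literal characters rather than "punctuation";
+# the pattern still works as a rough url matcher for this example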
+url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
+
+
+def fetch(url, seen, pool):
+ """Fetch a url, stick any found urls into the seen set, and
+ dispatch any new ones to the pool."""
+ print("fetching", url)
+ data = ''
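+    # the second argument (False) makes the timeout silent: if the fetch takes
+    # longer than five seconds, the with block is simply abandoned and data
+    # stays ''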
+    with eventlet.Timeout(5, False):
+        data = urllib2.urlopen(url).read()
+    for url_match in url_regex.finditer(data):
+        new_url = url_match.group(0)
+        # only send requests to eventlet.net so as not to destroy the internet
+        if new_url not in seen and 'eventlet.net' in new_url:
+            seen.add(new_url)
+            # while this seems stack-recursive, it's actually not:
+            # spawned greenthreads start their own stacks
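+            # spawn_n (unlike spawn) does not return a GreenThread, so the
+            # result of fetch() is discarded; that's fine, since fetch works
+            # by mutating the shared seen set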
+            pool.spawn_n(fetch, new_url, seen, pool)
+
+
+def crawl(start_url):
+ """Recursively crawl starting from *start_url*. Returns a set of
+ urls that were found."""
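+    # GreenPool caps how many greenthreads run at once (1000 by default);
+    # pass a size argument to throttle the crawler further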
+    pool = eventlet.GreenPool()
+    seen = set()
+    fetch(start_url, seen, pool)
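+    # waitall() blocks until every greenthread in the pool has finished,
+    # including the ones spawned recursively by fetch()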
+    pool.waitall()
+    return seen
+
+seen = crawl("http://eventlet.net")
+print("I saw these urls:")
+print("\n".join(seen))