+++ /dev/null
-"""This is a recursive web crawler. Don't go pointing this at random sites;
-it doesn't respect robots.txt and it is pretty brutal about how quickly it
-fetches pages.
-
-The code for this is very short; that brevity is a good indication
-that it makes effective use of the primitives at hand.  The fetch
-function does all the work of making http requests, searching for new
-urls, and dispatching new fetches.  The GreenPool acts as a sort of
-job coordinator (and concurrency controller, of course).
-"""
-from __future__ import with_statement, print_function
-
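-# Note: this example targets Python 2; eventlet.green.urllib2 is the
-# cooperative version of the stdlib urllib2 module, so blocking network
-# reads yield to the other greenthreads instead of stalling the crawl.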
-from eventlet.green import urllib2
-import eventlet
-import re
-
-# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
-url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
-
-
-def fetch(url, seen, pool):
- """Fetch a url, stick any found urls into the seen set, and
- dispatch any new ones to the pool."""
- print("fetching", url)
- data = ''
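-    # Timeout(5, False) silently abandons the request after five seconds
-    # instead of raising, so data stays '' and no urls get extracted from
-    # that page.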
- with eventlet.Timeout(5, False):
- data = urllib2.urlopen(url).read()
- for url_match in url_regex.finditer(data):
- new_url = url_match.group(0)
- # only send requests to eventlet.net so as not to destroy the internet
- if new_url not in seen and 'eventlet.net' in new_url:
- seen.add(new_url)
- # while this seems stack-recursive, it's actually not:
- # spawned greenthreads start their own stacks
- pool.spawn_n(fetch, new_url, seen, pool)
-
-
-def crawl(start_url):
- """Recursively crawl starting from *start_url*. Returns a set of
- urls that were found."""
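-    # GreenPool() defaults to a cap of 1000 greenthreads; passing a size,
-    # e.g. eventlet.GreenPool(100), would throttle the crawl harder, since
-    # spawn_n waits for a free slot when the pool is full.  (The 100 is
-    # just an illustration, not part of the original example.)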
- pool = eventlet.GreenPool()
- seen = set()
- fetch(start_url, seen, pool)
- pool.waitall()
- return seen
-
-seen = crawl("http://eventlet.net")
-print("I saw these urls:")
-print("\n".join(seen))