diff --git a/eventlet/examples/recursive_crawler.py b/eventlet/examples/recursive_crawler.py
new file mode 100644
index 0000000..6ecf5b4
--- /dev/null
+++ b/eventlet/examples/recursive_crawler.py
@@ -0,0 +1,58 @@
+"""This is a recursive web crawler. Don't go pointing this at random sites;
+it doesn't respect robots.txt and it is pretty brutal about how quickly it
+fetches pages.
+
+The code for this is very short; that is perhaps a good sign that it
+makes effective use of the primitives at hand. The fetch function does
+all the work of making http requests, searching for new urls, and
+dispatching new fetches. The GreenPool acts as a job coordinator (and,
+of course, a concurrency controller).
+"""
+from __future__ import print_function, with_statement
+
+from eventlet.green import urllib2
+import eventlet
+import re
+
+# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
+# n.b.: Python's re module has no POSIX character classes, so the
+# "[:punct:]" below is read as a set of literal characters.
+url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
+
+
+def fetch(url, seen, pool):
+    """Fetch a url, stick any found urls into the seen set, and
+    dispatch any new ones to the pool."""
+    print("fetching", url)
+    data = ''
+    # passing False as the exception makes the timeout silent: on expiry
+    # the with block is simply exited and data is left as the empty string
+    with eventlet.Timeout(5, False):
+        data = urllib2.urlopen(url).read()
+    for url_match in url_regex.finditer(data):
+        new_url = url_match.group(0)
+        # only send requests to eventlet.net so as not to destroy the internet
+        if new_url not in seen and 'eventlet.net' in new_url:
+            seen.add(new_url)
+            # while this seems stack-recursive, it's actually not:
+            # spawned greenthreads start their own stacks
+            pool.spawn_n(fetch, new_url, seen, pool)
+
+
+def crawl(start_url):
+    """Recursively crawl starting from *start_url*. Returns a set of
+    urls that were found."""
+    pool = eventlet.GreenPool()
+    seen = set()
+    # mark the start url as seen up front so that a page linking back to
+    # it doesn't cause it to be fetched a second time
+    seen.add(start_url)
+    fetch(start_url, seen, pool)
+    # waitall() blocks until every greenthread in the pool has finished,
+    # including any that were spawned while we were already waiting
+    pool.waitall()
+    return seen
+
+seen = crawl("http://eventlet.net")
+print("I saw these urls:")
+print("\n".join(seen))
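
On the "concurrency controller" point in the docstring: GreenPool's
constructor takes a size argument (defaulting to 1000) that caps how many
fetches run at once. A one-line tweak if the default is too aggressive:

    # cap the crawler at 20 concurrent fetches
    pool = eventlet.GreenPool(20)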
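
A portability note: eventlet.green.urllib2 exists only under Python 2. On an
eventlet release with Python 3 support, the green module is
eventlet.green.urllib.request, and read() returns bytes that must be decoded
before the regex search. A minimal sketch of fetch under those assumptions
(the rest of the file unchanged):

    from eventlet.green.urllib import request  # green urlopen on Python 3

    def fetch(url, seen, pool):
        print("fetching", url)
        data = ''
        with eventlet.Timeout(5, False):
            # decode so the text pattern in url_regex can be applied
            data = request.urlopen(url).read().decode('utf-8', 'replace')
        for url_match in url_regex.finditer(data):
            new_url = url_match.group(0)
            if new_url not in seen and 'eventlet.net' in new_url:
                seen.add(new_url)
                pool.spawn_n(fetch, new_url, seen, pool)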