X-Git-Url: https://review.fuel-infra.org/gitweb?a=blobdiff_plain;f=eventlet%2Fexamples%2Frecursive_crawler.py;fp=eventlet%2Fexamples%2Frecursive_crawler.py;h=0000000000000000000000000000000000000000;hb=358bd9258c2b6d2ee74de4dfd07a5123107abad4;hp=6ecf5b4e8a6c1f2d4af50b0a205087dbf6f81198;hpb=376ff3bfe7071cc0793184a378c4e74508fb0d97;p=packages%2Ftrusty%2Fpython-eventlet.git

diff --git a/eventlet/examples/recursive_crawler.py b/eventlet/examples/recursive_crawler.py
deleted file mode 100644
index 6ecf5b4..0000000
--- a/eventlet/examples/recursive_crawler.py
+++ /dev/null
@@ -1,50 +0,0 @@
-"""This is a recursive web crawler. Don't go pointing this at random sites;
-it doesn't respect robots.txt and it is pretty brutal about how quickly it
-fetches pages.
-
-The code for this is very short; this is perhaps a good indication
-that this is making the most effective use of the primitives at hand.
-The fetch function does all the work of making http requests,
-searching for new urls, and dispatching new fetches. The GreenPool
-acts as sort of a job coordinator (and concurrency controller of
-course).
-"""
-from __future__ import with_statement
-
-from eventlet.green import urllib2
-import eventlet
-import re
-
-# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
-url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
-
-
-def fetch(url, seen, pool):
-    """Fetch a url, stick any found urls into the seen set, and
-    dispatch any new ones to the pool."""
-    print("fetching", url)
-    data = ''
-    with eventlet.Timeout(5, False):
-        data = urllib2.urlopen(url).read()
-    for url_match in url_regex.finditer(data):
-        new_url = url_match.group(0)
-        # only send requests to eventlet.net so as not to destroy the internet
-        if new_url not in seen and 'eventlet.net' in new_url:
-            seen.add(new_url)
-            # while this seems stack-recursive, it's actually not:
-            # spawned greenthreads start their own stacks
-            pool.spawn_n(fetch, new_url, seen, pool)
-
-
-def crawl(start_url):
-    """Recursively crawl starting from *start_url*. Returns a set of
-    urls that were found."""
-    pool = eventlet.GreenPool()
-    seen = set()
-    fetch(start_url, seen, pool)
-    pool.waitall()
-    return seen
-
-seen = crawl("http://eventlet.net")
-print("I saw these urls:")
-print("\n".join(seen))
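
The docstring of the deleted file describes the coordination pattern it relies on: GreenPool acts as the job coordinator, spawn_n dispatches new work, and waitall drains the pool. The sketch below shows that pattern in isolation; visit() and the toy links graph are hypothetical stand-ins for fetch() and the web, used only to illustrate how recursively spawned greenthreads are coordinated, and are not part of the deleted file.

# Minimal sketch of the coordination pattern from the deleted example:
# spawn_n() dispatches work without keeping a handle to the result, and
# waitall() blocks until every spawned greenthread (including ones spawned
# by other greenthreads) has finished.  visit() and links are hypothetical.
import eventlet

links = {
    'a': ['b', 'c'],
    'b': ['c', 'd'],
    'c': [],
    'd': ['a'],
}


def visit(node, seen, pool):
    # Mirrors fetch(): record unseen neighbours, then dispatch them to the
    # pool.  Not stack-recursive: each spawned greenthread has its own stack.
    for neighbour in links[node]:
        if neighbour not in seen:
            seen.add(neighbour)
            pool.spawn_n(visit, neighbour, seen, pool)


pool = eventlet.GreenPool()
seen = {'a'}
visit('a', seen, pool)
pool.waitall()
print(sorted(seen))  # ['a', 'b', 'c', 'd']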
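
One portability note on the deleted code: eventlet.green.urllib2 exists only under Python 2. Under Python 3 the green equivalent is eventlet.green.urllib.request, and urlopen(...).read() returns bytes rather than str, so the body needs decoding before the regex can run over it. The following is a hedged sketch of fetch() adapted along those lines, not the original code; it assumes a current eventlet release and treats UTF-8 with replacement as an acceptable decode policy.

# Hedged Python 3 sketch of the deleted fetch(); not the original code.
# Assumes eventlet.green.urllib.request as the green urllib2 replacement and
# decodes the response body, since urlopen().read() returns bytes on py3.
import re

import eventlet
from eventlet.green.urllib import request

url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')


def fetch(url, seen, pool):
    print("fetching", url)
    data = ''
    with eventlet.Timeout(5, False):
        data = request.urlopen(url).read().decode('utf-8', 'replace')
    for url_match in url_regex.finditer(data):
        new_url = url_match.group(0)
        # same guard as the original: stay on eventlet.net
        if new_url not in seen and 'eventlet.net' in new_url:
            seen.add(new_url)
            pool.spawn_n(fetch, new_url, seen, pool)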