X-Git-Url: https://review.fuel-infra.org/gitweb?a=blobdiff_plain;f=eventlet%2Fexamples%2Frecursive_crawler.py;fp=eventlet%2Fexamples%2Frecursive_crawler.py;h=0000000000000000000000000000000000000000;hb=358bd9258c2b6d2ee74de4dfd07a5123107abad4;hp=6ecf5b4e8a6c1f2d4af50b0a205087dbf6f81198;hpb=376ff3bfe7071cc0793184a378c4e74508fb0d97;p=packages%2Ftrusty%2Fpython-eventlet.git

diff --git a/eventlet/examples/recursive_crawler.py b/eventlet/examples/recursive_crawler.py
deleted file mode 100644
index 6ecf5b4..0000000
--- a/eventlet/examples/recursive_crawler.py
+++ /dev/null
@@ -1,50 +0,0 @@
-"""This is a recursive web crawler. Don't go pointing this at random sites;
-it doesn't respect robots.txt and it is pretty brutal about how quickly it
-fetches pages.
-
-The code for this is very short; this is perhaps a good indication
-that this is making the most effective use of the primitives at hand.
-The fetch function does all the work of making http requests,
-searching for new urls, and dispatching new fetches. The GreenPool
-acts as sort of a job coordinator (and concurrency controller of
-course).
-"""
-from __future__ import with_statement
-
-from eventlet.green import urllib2
-import eventlet
-import re
-
-# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
-url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
-
-
-def fetch(url, seen, pool):
-    """Fetch a url, stick any found urls into the seen set, and
-    dispatch any new ones to the pool."""
-    print("fetching", url)
-    data = ''
-    with eventlet.Timeout(5, False):
-        data = urllib2.urlopen(url).read()
-    for url_match in url_regex.finditer(data):
-        new_url = url_match.group(0)
-        # only send requests to eventlet.net so as not to destroy the internet
-        if new_url not in seen and 'eventlet.net' in new_url:
-            seen.add(new_url)
-            # while this seems stack-recursive, it's actually not:
-            # spawned greenthreads start their own stacks
-            pool.spawn_n(fetch, new_url, seen, pool)
-
-
-def crawl(start_url):
-    """Recursively crawl starting from *start_url*. Returns a set of
-    urls that were found."""
-    pool = eventlet.GreenPool()
-    seen = set()
-    fetch(start_url, seen, pool)
-    pool.waitall()
-    return seen
-
-seen = crawl("http://eventlet.net")
-print("I saw these urls:")
-print("\n".join(seen))
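
The docstring of the deleted file describes the coordination pattern it relies on: GreenPool acts as the job coordinator, spawn_n dispatches new work, and waitall drains the pool. The sketch below shows that pattern in isolation; visit() and the toy links graph are hypothetical stand-ins for fetch() and the web, used only to illustrate how recursively spawned greenthreads are coordinated, and are not part of the deleted file.

# Minimal sketch of the coordination pattern from the deleted example:
# spawn_n() dispatches work without keeping a handle to the result, and
# waitall() blocks until every spawned greenthread (including ones spawned
# by other greenthreads) has finished.  visit() and links are hypothetical.
import eventlet

links = {
    'a': ['b', 'c'],
    'b': ['c', 'd'],
    'c': [],
    'd': ['a'],
}


def visit(node, seen, pool):
    # Mirrors fetch(): record unseen neighbours, then dispatch them to the
    # pool.  Not stack-recursive: each spawned greenthread has its own stack.
    for neighbour in links[node]:
        if neighbour not in seen:
            seen.add(neighbour)
            pool.spawn_n(visit, neighbour, seen, pool)


pool = eventlet.GreenPool()
seen = {'a'}
visit('a', seen, pool)
pool.waitall()
print(sorted(seen))  # ['a', 'b', 'c', 'd']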
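
One portability note on the deleted code: eventlet.green.urllib2 exists only under Python 2. Under Python 3 the green equivalent is eventlet.green.urllib.request, and urlopen(...).read() returns bytes rather than str, so the body needs decoding before the regex can run over it. The following is a hedged sketch of fetch() adapted along those lines, not the original code; it assumes a current eventlet release and treats UTF-8 with replacement as an acceptable decode policy.

# Hedged Python 3 sketch of the deleted fetch(); not the original code.
# Assumes eventlet.green.urllib.request as the green urllib2 replacement and
# decodes the response body, since urlopen().read() returns bytes on py3.
import re

import eventlet
from eventlet.green.urllib import request

url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')


def fetch(url, seen, pool):
    print("fetching", url)
    data = ''
    with eventlet.Timeout(5, False):
        data = request.urlopen(url).read().decode('utf-8', 'replace')
    for url_match in url_regex.finditer(data):
        new_url = url_match.group(0)
        # same guard as the original: stay on eventlet.net
        if new_url not in seen and 'eventlet.net' in new_url:
            seen.add(new_url)
            pool.spawn_n(fetch, new_url, seen, pool)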