eventlet/examples/recursive_crawler.py

   1 """This is a recursive web crawler.  Don't go pointing this at random sites;
   2 it doesn't respect robots.txt and it is pretty brutal about how quickly it
   3 fetches pages.
   4
   5 The code for this is very short; this is perhaps a good indication
   6 that this is making the most effective use of the primitives at hand.
   7 The fetch function does all the work of making http requests,
   8 searching for new urls, and dispatching new fetches.  The GreenPool
   9 acts as sort of a job coordinator (and concurrency controller of
  10 course).
  11 """
  12 from __future__ import with_statement
  13
  14 from eventlet.green import urllib2
  15 import eventlet
  16 import re
  17
  18 # http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
  19 url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
  20
  21
  22 def fetch(url, seen, pool):
  23     """Fetch a url, stick any found urls into the seen set, and
  24     dispatch any new ones to the pool."""
  25     print("fetching", url)
  26     data = ''
  27     with eventlet.Timeout(5, False):
  28         data = urllib2.urlopen(url).read()
  29     for url_match in url_regex.finditer(data):
  30         new_url = url_match.group(0)
  31         # only send requests to eventlet.net so as not to destroy the internet
  32         if new_url not in seen and 'eventlet.net' in new_url:
  33             seen.add(new_url)
  34             # while this seems stack-recursive, it's actually not:
  35             # spawned greenthreads start their own stacks
  36             pool.spawn_n(fetch, new_url, seen, pool)
  37
  38
  39 def crawl(start_url):
  40     """Recursively crawl starting from *start_url*.  Returns a set of
  41     urls that were found."""
  42     pool = eventlet.GreenPool()
  43     seen = set()
  44     fetch(start_url, seen, pool)
  45     pool.waitall()
  46     return seen
  47
  48 seen = crawl("http://eventlet.net")
  49 print("I saw these urls:")
  50 print("\n".join(seen))