python-eventlet/examples/producer_consumer.py

   1 """This is a recursive web crawler.  Don't go pointing this at random sites;
   2 it doesn't respect robots.txt and it is pretty brutal about how quickly it
   3 fetches pages.
   4
   5 This is a kind of "producer/consumer" example; the fetch function produces
   6 jobs, and the GreenPool itself is the consumer, farming out work concurrently.
   7 It's easier to write it this way rather than writing a standard consumer loop;
   8 GreenPool handles any exceptions raised and arranges so that there's a set
   9 number of "workers", so you don't have to write that tedious management code
  10 yourself.
  11 """
  12 from __future__ import with_statement
  13
  14 from eventlet.green import urllib2
  15 import eventlet
  16 import re
  17
  18 # http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
  19 url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
  20
  21
  22 def fetch(url, outq):
  23     """Fetch a url and push any urls found into a queue."""
  24     print("fetching", url)
  25     data = ''
  26     with eventlet.Timeout(5, False):
  27         data = urllib2.urlopen(url).read()
  28     for url_match in url_regex.finditer(data):
  29         new_url = url_match.group(0)
  30         outq.put(new_url)
  31
  32
  33 def producer(start_url):
  34     """Recursively crawl starting from *start_url*.  Returns a set of
  35     urls that were found."""
  36     pool = eventlet.GreenPool()
  37     seen = set()
  38     q = eventlet.Queue()
  39     q.put(start_url)
  40     # keep looping if there are new urls, or workers that may produce more urls
  41     while True:
  42         while not q.empty():
  43             url = q.get()
  44             # limit requests to eventlet.net so we don't crash all over the internet
  45             if url not in seen and 'eventlet.net' in url:
  46                 seen.add(url)
  47                 pool.spawn_n(fetch, url, q)
  48         pool.waitall()
  49         if q.empty():
  50             break
  51
  52     return seen
  53
  54
  55 seen = producer("http://eventlet.net")
  56 print("I saw these urls:")
  57 print("\n".join(seen))