GitHub: Mass Downloader using Python's map

How to write a simple multi-threaded mass downloader using workerpool

Solution 3: Multi-threaded, using map

WorkerPool implements a map() method similar to Python's built-in map(). It is a convenient shortcut for when writing a custom Job class is more work than it's worth.

# download3.py - Download many URLs using multiple threads, with the ``map`` method.
import os
import urllib
import workerpool

def download(url):
    url = url.strip()
    save_to = os.path.basename(url)
    urllib.urlretrieve(url, save_to)
    print "Downloaded %s" % url

# Initialize a pool, 5 threads in this case
pool = workerpool.WorkerPool(size=5)

# The ``download`` function is called once per line of the second
# argument (one job per URL).
pool.map(download, open("urls.txt").readlines())

# Send shutdown jobs to all threads, and wait until all the jobs have been completed
pool.shutdown()
pool.wait()
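
The code above is Python 2 era (workerpool, urllib.urlretrieve, print as a statement). On Python 3 the same pattern can be expressed with the standard library's concurrent.futures instead of workerpool. A minimal sketch, assuming Python 3 and the same urls.txt file with one URL per line (the file name download3_py3.py is just for illustration):

# download3_py3.py - Same idea on Python 3, using concurrent.futures instead of workerpool.
import os
import urllib.request
from concurrent.futures import ThreadPoolExecutor

def download(url):
    url = url.strip()
    save_to = os.path.basename(url)
    urllib.request.urlretrieve(url, save_to)
    print("Downloaded %s" % url)

with open("urls.txt") as f, ThreadPoolExecutor(max_workers=5) as pool:
    # One job per line; list() iterates the results so any exception is
    # re-raised here. Leaving the ``with`` block waits for the pool to finish.
    list(pool.map(download, f))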

pycurl CurlMulti example

An alternative to threads is pycurl's CurlMulti interface, which multiplexes many transfers within a single thread by driving them through select() and perform().

import pycurl
from cStringIO import StringIO

urls = [...] # list of urls
# reqs: List of individual requests.
# Each list element will be a 3-tuple of url (string), response string buffer
# (cStringIO.StringIO), and request handle (pycurl.Curl object).
reqs = [] 

# Build multi-request object.
m = pycurl.CurlMulti()
for url in urls: 
    response = StringIO()
    handle = pycurl.Curl()
    handle.setopt(pycurl.URL, url)
    handle.setopt(pycurl.WRITEFUNCTION, response.write)
    req = (url, response, handle)
    # Keep a reference to the handle (and its response buffer) via the
    # req tuple: CurlMulti.add_handle() does not hold a Python reference
    # to the Curl object, so we must keep it alive ourselves until the
    # transfer completes.
    m.add_handle(handle)
    reqs.append(req)

# Perform multi-request.
# This loop is adapted from the pycurl docs, modified to set
# num_handles explicitly before the outer while loop.
SELECT_TIMEOUT = 1.0
num_handles = len(reqs)
while num_handles:
    ret = m.select(SELECT_TIMEOUT)
    if ret == -1:
        continue
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM: 
            break

for req in reqs:
    # req[1].getvalue() contains the response body for the URL in req[0].
    print "%s: %d bytes" % (req[0], len(req[1].getvalue()))
    m.remove_handle(req[2])
    req[2].close()
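
The pycurl example above is also Python 2 (cStringIO, print as a statement). The same CurlMulti pattern works on Python 3 with io.BytesIO buffers; a minimal sketch under that assumption, with the urls list left as a placeholder as above:

# Python 3 sketch of the same CurlMulti pattern, buffering responses in io.BytesIO.
import pycurl
from io import BytesIO

urls = [...]  # list of urls, as above
reqs = []     # (url, buffer, handle) tuples, kept so the handles stay referenced

m = pycurl.CurlMulti()
for url in urls:
    response = BytesIO()
    handle = pycurl.Curl()
    handle.setopt(pycurl.URL, url)
    handle.setopt(pycurl.WRITEFUNCTION, response.write)
    reqs.append((url, response, handle))
    m.add_handle(handle)

SELECT_TIMEOUT = 1.0
num_handles = len(reqs)
while num_handles:
    m.select(SELECT_TIMEOUT)
    while True:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break

for url, response, handle in reqs:
    # response.getvalue() holds the downloaded bytes for this url.
    print("%s: %d bytes" % (url, len(response.getvalue())))
    m.remove_handle(handle)
    handle.close()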