mirror of
https://git.yoctoproject.org/poky
synced 2026-04-26 18:32:13 +02:00
wic: Remove 3rdparty/urlgrabber
wic doesn't use it, so remove it. (From OE-Core rev: 00dcdb29c89634ab267d328eb00f8eb70c696655) Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com> Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
This commit is contained in:
committed by
Richard Purdie
parent
ba19b60fd2
commit
c9b5ea0873
@@ -1,53 +0,0 @@
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Library General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko
|
||||
|
||||
# $Id: __init__.py,v 1.20 2006/09/22 00:58:55 mstenner Exp $
|
||||
|
||||
"""A high-level cross-protocol url-grabber.
|
||||
|
||||
Using urlgrabber, data can be fetched in three basic ways:
|
||||
|
||||
urlgrab(url) copy the file to the local filesystem
|
||||
urlopen(url) open the remote file and return a file object
|
||||
(like urllib2.urlopen)
|
||||
urlread(url) return the contents of the file as a string
|
||||
|
||||
When using these functions (or methods), urlgrabber supports the
|
||||
following features:
|
||||
|
||||
* identical behavior for http://, ftp://, and file:// urls
|
||||
* http keepalive - faster downloads of many files by using
|
||||
only a single connection
|
||||
* byte ranges - fetch only a portion of the file
|
||||
* reget - for a urlgrab, resume a partial download
|
||||
* progress meters - the ability to report download progress
|
||||
automatically, even when using urlopen!
|
||||
* throttling - restrict bandwidth usage
|
||||
* retries - automatically retry a download if it fails. The
|
||||
number of retries and failure types are configurable.
|
||||
* authenticated server access for http and ftp
|
||||
* proxy support - support for authenticated http and ftp proxies
|
||||
* mirror groups - treat a list of mirrors as a single source,
|
||||
automatically switching mirrors if there is a failure.
|
||||
"""
|
||||
|
||||
__version__ = '3.1.0'
|
||||
__date__ = '2006/09/21'
|
||||
__author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \
|
||||
'Ryan Tomayko <rtomayko@naeblis.cx>'
|
||||
__url__ = 'http://linux.duke.edu/projects/urlgrabber/'
|
||||
|
||||
from grabber import urlgrab, urlopen, urlread
|
||||
@@ -1,463 +0,0 @@
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place, Suite 330,
|
||||
# Boston, MA 02111-1307 USA
|
||||
|
||||
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
|
||||
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
|
||||
|
||||
# $Id: byterange.py,v 1.12 2006/07/20 20:15:58 mstenner Exp $
|
||||
|
||||
import os
|
||||
import stat
|
||||
import urllib
|
||||
import urllib2
|
||||
import rfc822
|
||||
|
||||
DEBUG = None
|
||||
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError, msg:
|
||||
from StringIO import StringIO
|
||||
|
||||
class RangeError(IOError):
|
||||
"""Error raised when an unsatisfiable range is requested."""
|
||||
pass
|
||||
|
||||
class HTTPRangeHandler(urllib2.BaseHandler):
|
||||
"""Handler that enables HTTP Range headers.
|
||||
|
||||
This was extremely simple. The Range header is a HTTP feature to
|
||||
begin with so all this class does is tell urllib2 that the
|
||||
"206 Partial Content" reponse from the HTTP server is what we
|
||||
expected.
|
||||
|
||||
Example:
|
||||
import urllib2
|
||||
import byterange
|
||||
|
||||
range_handler = range.HTTPRangeHandler()
|
||||
opener = urllib2.build_opener(range_handler)
|
||||
|
||||
# install it
|
||||
urllib2.install_opener(opener)
|
||||
|
||||
# create Request and set Range header
|
||||
req = urllib2.Request('http://www.python.org/')
|
||||
req.header['Range'] = 'bytes=30-50'
|
||||
f = urllib2.urlopen(req)
|
||||
"""
|
||||
|
||||
def http_error_206(self, req, fp, code, msg, hdrs):
|
||||
# 206 Partial Content Response
|
||||
r = urllib.addinfourl(fp, hdrs, req.get_full_url())
|
||||
r.code = code
|
||||
r.msg = msg
|
||||
return r
|
||||
|
||||
def http_error_416(self, req, fp, code, msg, hdrs):
|
||||
# HTTP's Range Not Satisfiable error
|
||||
raise RangeError('Requested Range Not Satisfiable')
|
||||
|
||||
class HTTPSRangeHandler(HTTPRangeHandler):
|
||||
""" Range Header support for HTTPS. """
|
||||
|
||||
def https_error_206(self, req, fp, code, msg, hdrs):
|
||||
return self.http_error_206(req, fp, code, msg, hdrs)
|
||||
|
||||
def https_error_416(self, req, fp, code, msg, hdrs):
|
||||
self.https_error_416(req, fp, code, msg, hdrs)
|
||||
|
||||
class RangeableFileObject:
|
||||
"""File object wrapper to enable raw range handling.
|
||||
This was implemented primarilary for handling range
|
||||
specifications for file:// urls. This object effectively makes
|
||||
a file object look like it consists only of a range of bytes in
|
||||
the stream.
|
||||
|
||||
Examples:
|
||||
# expose 10 bytes, starting at byte position 20, from
|
||||
# /etc/aliases.
|
||||
>>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30))
|
||||
# seek seeks within the range (to position 23 in this case)
|
||||
>>> fo.seek(3)
|
||||
# tell tells where your at _within the range_ (position 3 in
|
||||
# this case)
|
||||
>>> fo.tell()
|
||||
# read EOFs if an attempt is made to read past the last
|
||||
# byte in the range. the following will return only 7 bytes.
|
||||
>>> fo.read(30)
|
||||
"""
|
||||
|
||||
def __init__(self, fo, rangetup):
|
||||
"""Create a RangeableFileObject.
|
||||
fo -- a file like object. only the read() method need be
|
||||
supported but supporting an optimized seek() is
|
||||
preferable.
|
||||
rangetup -- a (firstbyte,lastbyte) tuple specifying the range
|
||||
to work over.
|
||||
The file object provided is assumed to be at byte offset 0.
|
||||
"""
|
||||
self.fo = fo
|
||||
(self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup)
|
||||
self.realpos = 0
|
||||
self._do_seek(self.firstbyte)
|
||||
|
||||
def __getattr__(self, name):
|
||||
"""This effectively allows us to wrap at the instance level.
|
||||
Any attribute not found in _this_ object will be searched for
|
||||
in self.fo. This includes methods."""
|
||||
if hasattr(self.fo, name):
|
||||
return getattr(self.fo, name)
|
||||
raise AttributeError, name
|
||||
|
||||
def tell(self):
|
||||
"""Return the position within the range.
|
||||
This is different from fo.seek in that position 0 is the
|
||||
first byte position of the range tuple. For example, if
|
||||
this object was created with a range tuple of (500,899),
|
||||
tell() will return 0 when at byte position 500 of the file.
|
||||
"""
|
||||
return (self.realpos - self.firstbyte)
|
||||
|
||||
def seek(self,offset,whence=0):
|
||||
"""Seek within the byte range.
|
||||
Positioning is identical to that described under tell().
|
||||
"""
|
||||
assert whence in (0, 1, 2)
|
||||
if whence == 0: # absolute seek
|
||||
realoffset = self.firstbyte + offset
|
||||
elif whence == 1: # relative seek
|
||||
realoffset = self.realpos + offset
|
||||
elif whence == 2: # absolute from end of file
|
||||
# XXX: are we raising the right Error here?
|
||||
raise IOError('seek from end of file not supported.')
|
||||
|
||||
# do not allow seek past lastbyte in range
|
||||
if self.lastbyte and (realoffset >= self.lastbyte):
|
||||
realoffset = self.lastbyte
|
||||
|
||||
self._do_seek(realoffset - self.realpos)
|
||||
|
||||
def read(self, size=-1):
|
||||
"""Read within the range.
|
||||
This method will limit the size read based on the range.
|
||||
"""
|
||||
size = self._calc_read_size(size)
|
||||
rslt = self.fo.read(size)
|
||||
self.realpos += len(rslt)
|
||||
return rslt
|
||||
|
||||
def readline(self, size=-1):
|
||||
"""Read lines within the range.
|
||||
This method will limit the size read based on the range.
|
||||
"""
|
||||
size = self._calc_read_size(size)
|
||||
rslt = self.fo.readline(size)
|
||||
self.realpos += len(rslt)
|
||||
return rslt
|
||||
|
||||
def _calc_read_size(self, size):
|
||||
"""Handles calculating the amount of data to read based on
|
||||
the range.
|
||||
"""
|
||||
if self.lastbyte:
|
||||
if size > -1:
|
||||
if ((self.realpos + size) >= self.lastbyte):
|
||||
size = (self.lastbyte - self.realpos)
|
||||
else:
|
||||
size = (self.lastbyte - self.realpos)
|
||||
return size
|
||||
|
||||
def _do_seek(self,offset):
|
||||
"""Seek based on whether wrapped object supports seek().
|
||||
offset is relative to the current position (self.realpos).
|
||||
"""
|
||||
assert offset >= 0
|
||||
if not hasattr(self.fo, 'seek'):
|
||||
self._poor_mans_seek(offset)
|
||||
else:
|
||||
self.fo.seek(self.realpos + offset)
|
||||
self.realpos+= offset
|
||||
|
||||
def _poor_mans_seek(self,offset):
|
||||
"""Seek by calling the wrapped file objects read() method.
|
||||
This is used for file like objects that do not have native
|
||||
seek support. The wrapped objects read() method is called
|
||||
to manually seek to the desired position.
|
||||
offset -- read this number of bytes from the wrapped
|
||||
file object.
|
||||
raise RangeError if we encounter EOF before reaching the
|
||||
specified offset.
|
||||
"""
|
||||
pos = 0
|
||||
bufsize = 1024
|
||||
while pos < offset:
|
||||
if (pos + bufsize) > offset:
|
||||
bufsize = offset - pos
|
||||
buf = self.fo.read(bufsize)
|
||||
if len(buf) != bufsize:
|
||||
raise RangeError('Requested Range Not Satisfiable')
|
||||
pos+= bufsize
|
||||
|
||||
class FileRangeHandler(urllib2.FileHandler):
|
||||
"""FileHandler subclass that adds Range support.
|
||||
This class handles Range headers exactly like an HTTP
|
||||
server would.
|
||||
"""
|
||||
def open_local_file(self, req):
|
||||
import mimetypes
|
||||
import mimetools
|
||||
host = req.get_host()
|
||||
file = req.get_selector()
|
||||
localfile = urllib.url2pathname(file)
|
||||
stats = os.stat(localfile)
|
||||
size = stats[stat.ST_SIZE]
|
||||
modified = rfc822.formatdate(stats[stat.ST_MTIME])
|
||||
mtype = mimetypes.guess_type(file)[0]
|
||||
if host:
|
||||
host, port = urllib.splitport(host)
|
||||
if port or socket.gethostbyname(host) not in self.get_names():
|
||||
raise urllib2.URLError('file not on local host')
|
||||
fo = open(localfile,'rb')
|
||||
brange = req.headers.get('Range',None)
|
||||
brange = range_header_to_tuple(brange)
|
||||
assert brange != ()
|
||||
if brange:
|
||||
(fb,lb) = brange
|
||||
if lb == '': lb = size
|
||||
if fb < 0 or fb > size or lb > size:
|
||||
raise RangeError('Requested Range Not Satisfiable')
|
||||
size = (lb - fb)
|
||||
fo = RangeableFileObject(fo, (fb,lb))
|
||||
headers = mimetools.Message(StringIO(
|
||||
'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
|
||||
(mtype or 'text/plain', size, modified)))
|
||||
return urllib.addinfourl(fo, headers, 'file:'+file)
|
||||
|
||||
|
||||
# FTP Range Support
|
||||
# Unfortunately, a large amount of base FTP code had to be copied
|
||||
# from urllib and urllib2 in order to insert the FTP REST command.
|
||||
# Code modifications for range support have been commented as
|
||||
# follows:
|
||||
# -- range support modifications start/end here
|
||||
|
||||
from urllib import splitport, splituser, splitpasswd, splitattr, \
|
||||
unquote, addclosehook, addinfourl
|
||||
import ftplib
|
||||
import socket
|
||||
import sys
|
||||
import ftplib
|
||||
import mimetypes
|
||||
import mimetools
|
||||
|
||||
class FTPRangeHandler(urllib2.FTPHandler):
|
||||
def ftp_open(self, req):
|
||||
host = req.get_host()
|
||||
if not host:
|
||||
raise IOError, ('ftp error', 'no host given')
|
||||
host, port = splitport(host)
|
||||
if port is None:
|
||||
port = ftplib.FTP_PORT
|
||||
|
||||
# username/password handling
|
||||
user, host = splituser(host)
|
||||
if user:
|
||||
user, passwd = splitpasswd(user)
|
||||
else:
|
||||
passwd = None
|
||||
host = unquote(host)
|
||||
user = unquote(user or '')
|
||||
passwd = unquote(passwd or '')
|
||||
|
||||
try:
|
||||
host = socket.gethostbyname(host)
|
||||
except socket.error, msg:
|
||||
raise urllib2.URLError(msg)
|
||||
path, attrs = splitattr(req.get_selector())
|
||||
dirs = path.split('/')
|
||||
dirs = map(unquote, dirs)
|
||||
dirs, file = dirs[:-1], dirs[-1]
|
||||
if dirs and not dirs[0]:
|
||||
dirs = dirs[1:]
|
||||
try:
|
||||
fw = self.connect_ftp(user, passwd, host, port, dirs)
|
||||
type = file and 'I' or 'D'
|
||||
for attr in attrs:
|
||||
attr, value = splitattr(attr)
|
||||
if attr.lower() == 'type' and \
|
||||
value in ('a', 'A', 'i', 'I', 'd', 'D'):
|
||||
type = value.upper()
|
||||
|
||||
# -- range support modifications start here
|
||||
rest = None
|
||||
range_tup = range_header_to_tuple(req.headers.get('Range',None))
|
||||
assert range_tup != ()
|
||||
if range_tup:
|
||||
(fb,lb) = range_tup
|
||||
if fb > 0: rest = fb
|
||||
# -- range support modifications end here
|
||||
|
||||
fp, retrlen = fw.retrfile(file, type, rest)
|
||||
|
||||
# -- range support modifications start here
|
||||
if range_tup:
|
||||
(fb,lb) = range_tup
|
||||
if lb == '':
|
||||
if retrlen is None or retrlen == 0:
|
||||
raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
|
||||
lb = retrlen
|
||||
retrlen = lb - fb
|
||||
if retrlen < 0:
|
||||
# beginning of range is larger than file
|
||||
raise RangeError('Requested Range Not Satisfiable')
|
||||
else:
|
||||
retrlen = lb - fb
|
||||
fp = RangeableFileObject(fp, (0,retrlen))
|
||||
# -- range support modifications end here
|
||||
|
||||
headers = ""
|
||||
mtype = mimetypes.guess_type(req.get_full_url())[0]
|
||||
if mtype:
|
||||
headers += "Content-Type: %s\n" % mtype
|
||||
if retrlen is not None and retrlen >= 0:
|
||||
headers += "Content-Length: %d\n" % retrlen
|
||||
sf = StringIO(headers)
|
||||
headers = mimetools.Message(sf)
|
||||
return addinfourl(fp, headers, req.get_full_url())
|
||||
except ftplib.all_errors, msg:
|
||||
raise IOError, ('ftp error', msg), sys.exc_info()[2]
|
||||
|
||||
def connect_ftp(self, user, passwd, host, port, dirs):
|
||||
fw = ftpwrapper(user, passwd, host, port, dirs)
|
||||
return fw
|
||||
|
||||
class ftpwrapper(urllib.ftpwrapper):
|
||||
# range support note:
|
||||
# this ftpwrapper code is copied directly from
|
||||
# urllib. The only enhancement is to add the rest
|
||||
# argument and pass it on to ftp.ntransfercmd
|
||||
def retrfile(self, file, type, rest=None):
|
||||
self.endtransfer()
|
||||
if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
|
||||
else: cmd = 'TYPE ' + type; isdir = 0
|
||||
try:
|
||||
self.ftp.voidcmd(cmd)
|
||||
except ftplib.all_errors:
|
||||
self.init()
|
||||
self.ftp.voidcmd(cmd)
|
||||
conn = None
|
||||
if file and not isdir:
|
||||
# Use nlst to see if the file exists at all
|
||||
try:
|
||||
self.ftp.nlst(file)
|
||||
except ftplib.error_perm, reason:
|
||||
raise IOError, ('ftp error', reason), sys.exc_info()[2]
|
||||
# Restore the transfer mode!
|
||||
self.ftp.voidcmd(cmd)
|
||||
# Try to retrieve as a file
|
||||
try:
|
||||
cmd = 'RETR ' + file
|
||||
conn = self.ftp.ntransfercmd(cmd, rest)
|
||||
except ftplib.error_perm, reason:
|
||||
if str(reason)[:3] == '501':
|
||||
# workaround for REST not supported error
|
||||
fp, retrlen = self.retrfile(file, type)
|
||||
fp = RangeableFileObject(fp, (rest,''))
|
||||
return (fp, retrlen)
|
||||
elif str(reason)[:3] != '550':
|
||||
raise IOError, ('ftp error', reason), sys.exc_info()[2]
|
||||
if not conn:
|
||||
# Set transfer mode to ASCII!
|
||||
self.ftp.voidcmd('TYPE A')
|
||||
# Try a directory listing
|
||||
if file: cmd = 'LIST ' + file
|
||||
else: cmd = 'LIST'
|
||||
conn = self.ftp.ntransfercmd(cmd)
|
||||
self.busy = 1
|
||||
# Pass back both a suitably decorated object and a retrieval length
|
||||
return (addclosehook(conn[0].makefile('rb'),
|
||||
self.endtransfer), conn[1])
|
||||
|
||||
|
||||
####################################################################
|
||||
# Range Tuple Functions
|
||||
# XXX: These range tuple functions might go better in a class.
|
||||
|
||||
_rangere = None
|
||||
def range_header_to_tuple(range_header):
|
||||
"""Get a (firstbyte,lastbyte) tuple from a Range header value.
|
||||
|
||||
Range headers have the form "bytes=<firstbyte>-<lastbyte>". This
|
||||
function pulls the firstbyte and lastbyte values and returns
|
||||
a (firstbyte,lastbyte) tuple. If lastbyte is not specified in
|
||||
the header value, it is returned as an empty string in the
|
||||
tuple.
|
||||
|
||||
Return None if range_header is None
|
||||
Return () if range_header does not conform to the range spec
|
||||
pattern.
|
||||
|
||||
"""
|
||||
global _rangere
|
||||
if range_header is None: return None
|
||||
if _rangere is None:
|
||||
import re
|
||||
_rangere = re.compile(r'^bytes=(\d{1,})-(\d*)')
|
||||
match = _rangere.match(range_header)
|
||||
if match:
|
||||
tup = range_tuple_normalize(match.group(1,2))
|
||||
if tup and tup[1]:
|
||||
tup = (tup[0],tup[1]+1)
|
||||
return tup
|
||||
return ()
|
||||
|
||||
def range_tuple_to_header(range_tup):
|
||||
"""Convert a range tuple to a Range header value.
|
||||
Return a string of the form "bytes=<firstbyte>-<lastbyte>" or None
|
||||
if no range is needed.
|
||||
"""
|
||||
if range_tup is None: return None
|
||||
range_tup = range_tuple_normalize(range_tup)
|
||||
if range_tup:
|
||||
if range_tup[1]:
|
||||
range_tup = (range_tup[0],range_tup[1] - 1)
|
||||
return 'bytes=%s-%s' % range_tup
|
||||
|
||||
def range_tuple_normalize(range_tup):
|
||||
"""Normalize a (first_byte,last_byte) range tuple.
|
||||
Return a tuple whose first element is guaranteed to be an int
|
||||
and whose second element will be '' (meaning: the last byte) or
|
||||
an int. Finally, return None if the normalized tuple == (0,'')
|
||||
as that is equivelant to retrieving the entire file.
|
||||
"""
|
||||
if range_tup is None: return None
|
||||
# handle first byte
|
||||
fb = range_tup[0]
|
||||
if fb in (None,''): fb = 0
|
||||
else: fb = int(fb)
|
||||
# handle last byte
|
||||
try: lb = range_tup[1]
|
||||
except IndexError: lb = ''
|
||||
else:
|
||||
if lb is None: lb = ''
|
||||
elif lb != '': lb = int(lb)
|
||||
# check if range is over the entire file
|
||||
if (fb,lb) == (0,''): return None
|
||||
# check that the range is valid
|
||||
if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
|
||||
return (fb,lb)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,617 +0,0 @@
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place, Suite 330,
|
||||
# Boston, MA 02111-1307 USA
|
||||
|
||||
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
|
||||
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
|
||||
|
||||
"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
|
||||
|
||||
>>> import urllib2
|
||||
>>> from keepalive import HTTPHandler
|
||||
>>> keepalive_handler = HTTPHandler()
|
||||
>>> opener = urllib2.build_opener(keepalive_handler)
|
||||
>>> urllib2.install_opener(opener)
|
||||
>>>
|
||||
>>> fo = urllib2.urlopen('http://www.python.org')
|
||||
|
||||
If a connection to a given host is requested, and all of the existing
|
||||
connections are still in use, another connection will be opened. If
|
||||
the handler tries to use an existing connection but it fails in some
|
||||
way, it will be closed and removed from the pool.
|
||||
|
||||
To remove the handler, simply re-run build_opener with no arguments, and
|
||||
install that opener.
|
||||
|
||||
You can explicitly close connections by using the close_connection()
|
||||
method of the returned file-like object (described below) or you can
|
||||
use the handler methods:
|
||||
|
||||
close_connection(host)
|
||||
close_all()
|
||||
open_connections()
|
||||
|
||||
NOTE: using the close_connection and close_all methods of the handler
|
||||
should be done with care when using multiple threads.
|
||||
* there is nothing that prevents another thread from creating new
|
||||
connections immediately after connections are closed
|
||||
* no checks are done to prevent in-use connections from being closed
|
||||
|
||||
>>> keepalive_handler.close_all()
|
||||
|
||||
EXTRA ATTRIBUTES AND METHODS
|
||||
|
||||
Upon a status of 200, the object returned has a few additional
|
||||
attributes and methods, which should not be used if you want to
|
||||
remain consistent with the normal urllib2-returned objects:
|
||||
|
||||
close_connection() - close the connection to the host
|
||||
readlines() - you know, readlines()
|
||||
status - the return status (ie 404)
|
||||
reason - english translation of status (ie 'File not found')
|
||||
|
||||
If you want the best of both worlds, use this inside an
|
||||
AttributeError-catching try:
|
||||
|
||||
>>> try: status = fo.status
|
||||
>>> except AttributeError: status = None
|
||||
|
||||
Unfortunately, these are ONLY there if status == 200, so it's not
|
||||
easy to distinguish between non-200 responses. The reason is that
|
||||
urllib2 tries to do clever things with error codes 301, 302, 401,
|
||||
and 407, and it wraps the object upon return.
|
||||
|
||||
For python versions earlier than 2.4, you can avoid this fancy error
|
||||
handling by setting the module-level global HANDLE_ERRORS to zero.
|
||||
You see, prior to 2.4, it's the HTTP Handler's job to determine what
|
||||
to handle specially, and what to just pass up. HANDLE_ERRORS == 0
|
||||
means "pass everything up". In python 2.4, however, this job no
|
||||
longer belongs to the HTTP Handler and is now done by a NEW handler,
|
||||
HTTPErrorProcessor. Here's the bottom line:
|
||||
|
||||
python version < 2.4
|
||||
HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
|
||||
errors
|
||||
HANDLE_ERRORS == 0 pass everything up, error processing is
|
||||
left to the calling code
|
||||
python version >= 2.4
|
||||
HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
|
||||
HANDLE_ERRORS == 0 (default) pass everything up, let the
|
||||
other handlers (specifically,
|
||||
HTTPErrorProcessor) decide what to do
|
||||
|
||||
In practice, setting the variable either way makes little difference
|
||||
in python 2.4, so for the most consistent behavior across versions,
|
||||
you probably just want to use the defaults, which will give you
|
||||
exceptions on errors.
|
||||
|
||||
"""
|
||||
|
||||
# $Id: keepalive.py,v 1.16 2006/09/22 00:58:05 mstenner Exp $
|
||||
|
||||
import urllib2
|
||||
import httplib
|
||||
import socket
|
||||
import thread
|
||||
|
||||
DEBUG = None
|
||||
|
||||
import sslfactory
|
||||
|
||||
import sys
|
||||
if sys.version_info < (2, 4): HANDLE_ERRORS = 1
|
||||
else: HANDLE_ERRORS = 0
|
||||
|
||||
class ConnectionManager:
|
||||
"""
|
||||
The connection manager must be able to:
|
||||
* keep track of all existing
|
||||
"""
|
||||
def __init__(self):
|
||||
self._lock = thread.allocate_lock()
|
||||
self._hostmap = {} # map hosts to a list of connections
|
||||
self._connmap = {} # map connections to host
|
||||
self._readymap = {} # map connection to ready state
|
||||
|
||||
def add(self, host, connection, ready):
|
||||
self._lock.acquire()
|
||||
try:
|
||||
if not self._hostmap.has_key(host): self._hostmap[host] = []
|
||||
self._hostmap[host].append(connection)
|
||||
self._connmap[connection] = host
|
||||
self._readymap[connection] = ready
|
||||
finally:
|
||||
self._lock.release()
|
||||
|
||||
def remove(self, connection):
|
||||
self._lock.acquire()
|
||||
try:
|
||||
try:
|
||||
host = self._connmap[connection]
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
del self._connmap[connection]
|
||||
del self._readymap[connection]
|
||||
self._hostmap[host].remove(connection)
|
||||
if not self._hostmap[host]: del self._hostmap[host]
|
||||
finally:
|
||||
self._lock.release()
|
||||
|
||||
def set_ready(self, connection, ready):
|
||||
try: self._readymap[connection] = ready
|
||||
except KeyError: pass
|
||||
|
||||
def get_ready_conn(self, host):
|
||||
conn = None
|
||||
self._lock.acquire()
|
||||
try:
|
||||
if self._hostmap.has_key(host):
|
||||
for c in self._hostmap[host]:
|
||||
if self._readymap[c]:
|
||||
self._readymap[c] = 0
|
||||
conn = c
|
||||
break
|
||||
finally:
|
||||
self._lock.release()
|
||||
return conn
|
||||
|
||||
def get_all(self, host=None):
|
||||
if host:
|
||||
return list(self._hostmap.get(host, []))
|
||||
else:
|
||||
return dict(self._hostmap)
|
||||
|
||||
class KeepAliveHandler:
|
||||
def __init__(self):
|
||||
self._cm = ConnectionManager()
|
||||
|
||||
#### Connection Management
|
||||
def open_connections(self):
|
||||
"""return a list of connected hosts and the number of connections
|
||||
to each. [('foo.com:80', 2), ('bar.org', 1)]"""
|
||||
return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
|
||||
|
||||
def close_connection(self, host):
|
||||
"""close connection(s) to <host>
|
||||
host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
|
||||
no error occurs if there is no connection to that host."""
|
||||
for h in self._cm.get_all(host):
|
||||
self._cm.remove(h)
|
||||
h.close()
|
||||
|
||||
def close_all(self):
|
||||
"""close all open connections"""
|
||||
for host, conns in self._cm.get_all().items():
|
||||
for h in conns:
|
||||
self._cm.remove(h)
|
||||
h.close()
|
||||
|
||||
def _request_closed(self, request, host, connection):
|
||||
"""tells us that this request is now closed and the the
|
||||
connection is ready for another request"""
|
||||
self._cm.set_ready(connection, 1)
|
||||
|
||||
def _remove_connection(self, host, connection, close=0):
|
||||
if close: connection.close()
|
||||
self._cm.remove(connection)
|
||||
|
||||
#### Transaction Execution
|
||||
def do_open(self, req):
|
||||
host = req.get_host()
|
||||
if not host:
|
||||
raise urllib2.URLError('no host given')
|
||||
|
||||
try:
|
||||
h = self._cm.get_ready_conn(host)
|
||||
while h:
|
||||
r = self._reuse_connection(h, req, host)
|
||||
|
||||
# if this response is non-None, then it worked and we're
|
||||
# done. Break out, skipping the else block.
|
||||
if r: break
|
||||
|
||||
# connection is bad - possibly closed by server
|
||||
# discard it and ask for the next free connection
|
||||
h.close()
|
||||
self._cm.remove(h)
|
||||
h = self._cm.get_ready_conn(host)
|
||||
else:
|
||||
# no (working) free connections were found. Create a new one.
|
||||
h = self._get_connection(host)
|
||||
if DEBUG: DEBUG.info("creating new connection to %s (%d)",
|
||||
host, id(h))
|
||||
self._cm.add(host, h, 0)
|
||||
self._start_transaction(h, req)
|
||||
r = h.getresponse()
|
||||
except (socket.error, httplib.HTTPException), err:
|
||||
raise urllib2.URLError(err)
|
||||
|
||||
# if not a persistent connection, don't try to reuse it
|
||||
if r.will_close: self._cm.remove(h)
|
||||
|
||||
if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)
|
||||
r._handler = self
|
||||
r._host = host
|
||||
r._url = req.get_full_url()
|
||||
r._connection = h
|
||||
r.code = r.status
|
||||
r.headers = r.msg
|
||||
r.msg = r.reason
|
||||
|
||||
if r.status == 200 or not HANDLE_ERRORS:
|
||||
return r
|
||||
else:
|
||||
return self.parent.error('http', req, r,
|
||||
r.status, r.msg, r.headers)
|
||||
|
||||
def _reuse_connection(self, h, req, host):
|
||||
"""start the transaction with a re-used connection
|
||||
return a response object (r) upon success or None on failure.
|
||||
This DOES not close or remove bad connections in cases where
|
||||
it returns. However, if an unexpected exception occurs, it
|
||||
will close and remove the connection before re-raising.
|
||||
"""
|
||||
try:
|
||||
self._start_transaction(h, req)
|
||||
r = h.getresponse()
|
||||
# note: just because we got something back doesn't mean it
|
||||
# worked. We'll check the version below, too.
|
||||
except (socket.error, httplib.HTTPException):
|
||||
r = None
|
||||
except:
|
||||
# adding this block just in case we've missed
|
||||
# something we will still raise the exception, but
|
||||
# lets try and close the connection and remove it
|
||||
# first. We previously got into a nasty loop
|
||||
# where an exception was uncaught, and so the
|
||||
# connection stayed open. On the next try, the
|
||||
# same exception was raised, etc. The tradeoff is
|
||||
# that it's now possible this call will raise
|
||||
# a DIFFERENT exception
|
||||
if DEBUG: DEBUG.error("unexpected exception - closing " + \
|
||||
"connection to %s (%d)", host, id(h))
|
||||
self._cm.remove(h)
|
||||
h.close()
|
||||
raise
|
||||
|
||||
if r is None or r.version == 9:
|
||||
# httplib falls back to assuming HTTP 0.9 if it gets a
|
||||
# bad header back. This is most likely to happen if
|
||||
# the socket has been closed by the server since we
|
||||
# last used the connection.
|
||||
if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
|
||||
host, id(h))
|
||||
r = None
|
||||
else:
|
||||
if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))
|
||||
|
||||
return r
|
||||
|
||||
def _start_transaction(self, h, req):
|
||||
try:
|
||||
if req.has_data():
|
||||
data = req.get_data()
|
||||
h.putrequest('POST', req.get_selector())
|
||||
if not req.headers.has_key('Content-type'):
|
||||
h.putheader('Content-type',
|
||||
'application/x-www-form-urlencoded')
|
||||
if not req.headers.has_key('Content-length'):
|
||||
h.putheader('Content-length', '%d' % len(data))
|
||||
else:
|
||||
h.putrequest('GET', req.get_selector())
|
||||
except (socket.error, httplib.HTTPException), err:
|
||||
raise urllib2.URLError(err)
|
||||
|
||||
for args in self.parent.addheaders:
|
||||
h.putheader(*args)
|
||||
for k, v in req.headers.items():
|
||||
h.putheader(k, v)
|
||||
h.endheaders()
|
||||
if req.has_data():
|
||||
h.send(data)
|
||||
|
||||
def _get_connection(self, host):
|
||||
return NotImplementedError
|
||||
|
||||
class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
|
||||
def __init__(self):
|
||||
KeepAliveHandler.__init__(self)
|
||||
|
||||
def http_open(self, req):
|
||||
return self.do_open(req)
|
||||
|
||||
def _get_connection(self, host):
|
||||
return HTTPConnection(host)
|
||||
|
||||
class HTTPSHandler(KeepAliveHandler, urllib2.HTTPSHandler):
|
||||
def __init__(self, ssl_factory=None):
|
||||
KeepAliveHandler.__init__(self)
|
||||
if not ssl_factory:
|
||||
ssl_factory = sslfactory.get_factory()
|
||||
self._ssl_factory = ssl_factory
|
||||
|
||||
def https_open(self, req):
|
||||
return self.do_open(req)
|
||||
|
||||
def _get_connection(self, host):
|
||||
return self._ssl_factory.get_https_connection(host)
|
||||
|
||||
class HTTPResponse(httplib.HTTPResponse):
|
||||
# we need to subclass HTTPResponse in order to
|
||||
# 1) add readline() and readlines() methods
|
||||
# 2) add close_connection() methods
|
||||
# 3) add info() and geturl() methods
|
||||
|
||||
# in order to add readline(), read must be modified to deal with a
|
||||
# buffer. example: readline must read a buffer and then spit back
|
||||
# one line at a time. The only real alternative is to read one
|
||||
# BYTE at a time (ick). Once something has been read, it can't be
|
||||
# put back (ok, maybe it can, but that's even uglier than this),
|
||||
# so if you THEN do a normal read, you must first take stuff from
|
||||
# the buffer.
|
||||
|
||||
# the read method wraps the original to accomodate buffering,
|
||||
# although read() never adds to the buffer.
|
||||
# Both readline and readlines have been stolen with almost no
|
||||
# modification from socket.py
|
||||
|
||||
|
||||
def __init__(self, sock, debuglevel=0, strict=0, method=None):
|
||||
if method: # the httplib in python 2.3 uses the method arg
|
||||
httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
|
||||
else: # 2.2 doesn't
|
||||
httplib.HTTPResponse.__init__(self, sock, debuglevel)
|
||||
self.fileno = sock.fileno
|
||||
self.code = None
|
||||
self._rbuf = ''
|
||||
self._rbufsize = 8096
|
||||
self._handler = None # inserted by the handler later
|
||||
self._host = None # (same)
|
||||
self._url = None # (same)
|
||||
self._connection = None # (same)
|
||||
|
||||
_raw_read = httplib.HTTPResponse.read
|
||||
|
||||
def close(self):
|
||||
if self.fp:
|
||||
self.fp.close()
|
||||
self.fp = None
|
||||
if self._handler:
|
||||
self._handler._request_closed(self, self._host,
|
||||
self._connection)
|
||||
|
||||
def close_connection(self):
|
||||
self._handler._remove_connection(self._host, self._connection, close=1)
|
||||
self.close()
|
||||
|
||||
def info(self):
|
||||
return self.headers
|
||||
|
||||
def geturl(self):
|
||||
return self._url
|
||||
|
||||
def read(self, amt=None):
|
||||
# the _rbuf test is only in this first if for speed. It's not
|
||||
# logically necessary
|
||||
if self._rbuf and not amt is None:
|
||||
L = len(self._rbuf)
|
||||
if amt > L:
|
||||
amt -= L
|
||||
else:
|
||||
s = self._rbuf[:amt]
|
||||
self._rbuf = self._rbuf[amt:]
|
||||
return s
|
||||
|
||||
s = self._rbuf + self._raw_read(amt)
|
||||
self._rbuf = ''
|
||||
return s
|
||||
|
||||
def readline(self, limit=-1):
|
||||
data = ""
|
||||
i = self._rbuf.find('\n')
|
||||
while i < 0 and not (0 < limit <= len(self._rbuf)):
|
||||
new = self._raw_read(self._rbufsize)
|
||||
if not new: break
|
||||
i = new.find('\n')
|
||||
if i >= 0: i = i + len(self._rbuf)
|
||||
self._rbuf = self._rbuf + new
|
||||
if i < 0: i = len(self._rbuf)
|
||||
else: i = i+1
|
||||
if 0 <= limit < len(self._rbuf): i = limit
|
||||
data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
|
||||
return data
|
||||
|
||||
def readlines(self, sizehint = 0):
|
||||
total = 0
|
||||
list = []
|
||||
while 1:
|
||||
line = self.readline()
|
||||
if not line: break
|
||||
list.append(line)
|
||||
total += len(line)
|
||||
if sizehint and total >= sizehint:
|
||||
break
|
||||
return list
|
||||
|
||||
|
||||
class HTTPConnection(httplib.HTTPConnection):
|
||||
# use the modified response class
|
||||
response_class = HTTPResponse
|
||||
|
||||
class HTTPSConnection(httplib.HTTPSConnection):
|
||||
response_class = HTTPResponse
|
||||
|
||||
#########################################################################
|
||||
##### TEST FUNCTIONS
|
||||
#########################################################################
|
||||
|
||||
def error_handler(url):
|
||||
global HANDLE_ERRORS
|
||||
orig = HANDLE_ERRORS
|
||||
keepalive_handler = HTTPHandler()
|
||||
opener = urllib2.build_opener(keepalive_handler)
|
||||
urllib2.install_opener(opener)
|
||||
pos = {0: 'off', 1: 'on'}
|
||||
for i in (0, 1):
|
||||
print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
|
||||
HANDLE_ERRORS = i
|
||||
try:
|
||||
fo = urllib2.urlopen(url)
|
||||
foo = fo.read()
|
||||
fo.close()
|
||||
try: status, reason = fo.status, fo.reason
|
||||
except AttributeError: status, reason = None, None
|
||||
except IOError, e:
|
||||
print " EXCEPTION: %s" % e
|
||||
raise
|
||||
else:
|
||||
print " status = %s, reason = %s" % (status, reason)
|
||||
HANDLE_ERRORS = orig
|
||||
hosts = keepalive_handler.open_connections()
|
||||
print "open connections:", hosts
|
||||
keepalive_handler.close_all()
|
||||
|
||||
def continuity(url):
|
||||
import md5
|
||||
format = '%25s: %s'
|
||||
|
||||
# first fetch the file with the normal http handler
|
||||
opener = urllib2.build_opener()
|
||||
urllib2.install_opener(opener)
|
||||
fo = urllib2.urlopen(url)
|
||||
foo = fo.read()
|
||||
fo.close()
|
||||
m = md5.new(foo)
|
||||
print format % ('normal urllib', m.hexdigest())
|
||||
|
||||
# now install the keepalive handler and try again
|
||||
opener = urllib2.build_opener(HTTPHandler())
|
||||
urllib2.install_opener(opener)
|
||||
|
||||
fo = urllib2.urlopen(url)
|
||||
foo = fo.read()
|
||||
fo.close()
|
||||
m = md5.new(foo)
|
||||
print format % ('keepalive read', m.hexdigest())
|
||||
|
||||
fo = urllib2.urlopen(url)
|
||||
foo = ''
|
||||
while 1:
|
||||
f = fo.readline()
|
||||
if f: foo = foo + f
|
||||
else: break
|
||||
fo.close()
|
||||
m = md5.new(foo)
|
||||
print format % ('keepalive readline', m.hexdigest())
|
||||
|
||||
def comp(N, url):
|
||||
print ' making %i connections to:\n %s' % (N, url)
|
||||
|
||||
sys.stdout.write(' first using the normal urllib handlers')
|
||||
# first use normal opener
|
||||
opener = urllib2.build_opener()
|
||||
urllib2.install_opener(opener)
|
||||
t1 = fetch(N, url)
|
||||
print ' TIME: %.3f s' % t1
|
||||
|
||||
sys.stdout.write(' now using the keepalive handler ')
|
||||
# now install the keepalive handler and try again
|
||||
opener = urllib2.build_opener(HTTPHandler())
|
||||
urllib2.install_opener(opener)
|
||||
t2 = fetch(N, url)
|
||||
print ' TIME: %.3f s' % t2
|
||||
print ' improvement factor: %.2f' % (t1/t2, )
|
||||
|
||||
def fetch(N, url, delay=0):
|
||||
import time
|
||||
lens = []
|
||||
starttime = time.time()
|
||||
for i in range(N):
|
||||
if delay and i > 0: time.sleep(delay)
|
||||
fo = urllib2.urlopen(url)
|
||||
foo = fo.read()
|
||||
fo.close()
|
||||
lens.append(len(foo))
|
||||
diff = time.time() - starttime
|
||||
|
||||
j = 0
|
||||
for i in lens[1:]:
|
||||
j = j + 1
|
||||
if not i == lens[0]:
|
||||
print "WARNING: inconsistent length on read %i: %i" % (j, i)
|
||||
|
||||
return diff
|
||||
|
||||
def test_timeout(url):
|
||||
global DEBUG
|
||||
dbbackup = DEBUG
|
||||
class FakeLogger:
|
||||
def debug(self, msg, *args): print msg % args
|
||||
info = warning = error = debug
|
||||
DEBUG = FakeLogger()
|
||||
print " fetching the file to establish a connection"
|
||||
fo = urllib2.urlopen(url)
|
||||
data1 = fo.read()
|
||||
fo.close()
|
||||
|
||||
i = 20
|
||||
print " waiting %i seconds for the server to close the connection" % i
|
||||
while i > 0:
|
||||
sys.stdout.write('\r %2i' % i)
|
||||
sys.stdout.flush()
|
||||
time.sleep(1)
|
||||
i -= 1
|
||||
sys.stderr.write('\r')
|
||||
|
||||
print " fetching the file a second time"
|
||||
fo = urllib2.urlopen(url)
|
||||
data2 = fo.read()
|
||||
fo.close()
|
||||
|
||||
if data1 == data2:
|
||||
print ' data are identical'
|
||||
else:
|
||||
print ' ERROR: DATA DIFFER'
|
||||
|
||||
DEBUG = dbbackup
|
||||
|
||||
|
||||
def test(url, N=10):
|
||||
print "checking error hander (do this on a non-200)"
|
||||
try: error_handler(url)
|
||||
except IOError, e:
|
||||
print "exiting - exception will prevent further tests"
|
||||
sys.exit()
|
||||
print
|
||||
print "performing continuity test (making sure stuff isn't corrupted)"
|
||||
continuity(url)
|
||||
print
|
||||
print "performing speed comparison"
|
||||
comp(N, url)
|
||||
print
|
||||
print "performing dropped-connection check"
|
||||
test_timeout(url)
|
||||
|
||||
if __name__ == '__main__':
|
||||
import time
|
||||
import sys
|
||||
try:
|
||||
N = int(sys.argv[1])
|
||||
url = sys.argv[2]
|
||||
except:
|
||||
print "%s <integer> <url>" % sys.argv[0]
|
||||
else:
|
||||
test(url, N)
|
||||
@@ -1,458 +0,0 @@
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place, Suite 330,
|
||||
# Boston, MA 02111-1307 USA
|
||||
|
||||
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
|
||||
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
|
||||
|
||||
"""Module for downloading files from a pool of mirrors
|
||||
|
||||
DESCRIPTION
|
||||
|
||||
This module provides support for downloading files from a pool of
|
||||
mirrors with configurable failover policies. To a large extent, the
|
||||
failover policy is chosen by using different classes derived from
|
||||
the main class, MirrorGroup.
|
||||
|
||||
Instances of MirrorGroup (and cousins) act very much like URLGrabber
|
||||
instances in that they have urlread, urlgrab, and urlopen methods.
|
||||
They can therefore, be used in very similar ways.
|
||||
|
||||
from urlgrabber.grabber import URLGrabber
|
||||
from urlgrabber.mirror import MirrorGroup
|
||||
gr = URLGrabber()
|
||||
mg = MirrorGroup(gr, ['http://foo.com/some/directory/',
|
||||
'http://bar.org/maybe/somewhere/else/',
|
||||
'ftp://baz.net/some/other/place/entirely/']
|
||||
mg.urlgrab('relative/path.zip')
|
||||
|
||||
The assumption is that all mirrors are identical AFTER the base urls
|
||||
specified, so that any mirror can be used to fetch any file.
|
||||
|
||||
FAILOVER
|
||||
|
||||
The failover mechanism is designed to be customized by subclassing
|
||||
from MirrorGroup to change the details of the behavior. In general,
|
||||
the classes maintain a master mirror list and a "current mirror"
|
||||
index. When a download is initiated, a copy of this list and index
|
||||
is created for that download only. The specific failover policy
|
||||
depends on the class used, and so is documented in the class
|
||||
documentation. Note that ANY behavior of the class can be
|
||||
overridden, so any failover policy at all is possible (although
|
||||
you may need to change the interface in extreme cases).
|
||||
|
||||
CUSTOMIZATION
|
||||
|
||||
Most customization of a MirrorGroup object is done at instantiation
|
||||
time (or via subclassing). There are four major types of
|
||||
customization:
|
||||
|
||||
1) Pass in a custom urlgrabber - The passed in urlgrabber will be
|
||||
used (by default... see #2) for the grabs, so options to it
|
||||
apply for the url-fetching
|
||||
|
||||
2) Custom mirror list - Mirror lists can simply be a list of
|
||||
stings mirrors (as shown in the example above) but each can
|
||||
also be a dict, allowing for more options. For example, the
|
||||
first mirror in the list above could also have been:
|
||||
|
||||
{'mirror': 'http://foo.com/some/directory/',
|
||||
'grabber': <a custom grabber to be used for this mirror>,
|
||||
'kwargs': { <a dict of arguments passed to the grabber> }}
|
||||
|
||||
All mirrors are converted to this format internally. If
|
||||
'grabber' is omitted, the default grabber will be used. If
|
||||
kwargs are omitted, then (duh) they will not be used.
|
||||
|
||||
3) Pass keyword arguments when instantiating the mirror group.
|
||||
See, for example, the failure_callback argument.
|
||||
|
||||
4) Finally, any kwargs passed in for the specific file (to the
|
||||
urlgrab method, for example) will be folded in. The options
|
||||
passed into the grabber's urlXXX methods will override any
|
||||
options specified in a custom mirror dict.
|
||||
|
||||
"""
|
||||
|
||||
# $Id: mirror.py,v 1.14 2006/02/22 18:26:46 mstenner Exp $
|
||||
|
||||
import random
|
||||
import thread # needed for locking to make this threadsafe
|
||||
|
||||
from grabber import URLGrabError, CallbackObject, DEBUG
|
||||
|
||||
try:
|
||||
from i18n import _
|
||||
except ImportError, msg:
|
||||
def _(st): return st
|
||||
|
||||
class GrabRequest:
|
||||
"""This is a dummy class used to hold information about the specific
|
||||
request. For example, a single file. By maintaining this information
|
||||
separately, we can accomplish two things:
|
||||
|
||||
1) make it a little easier to be threadsafe
|
||||
2) have request-specific parameters
|
||||
"""
|
||||
pass
|
||||
|
||||
class MirrorGroup:
|
||||
"""Base Mirror class
|
||||
|
||||
Instances of this class are built with a grabber object and a list
|
||||
of mirrors. Then all calls to urlXXX should be passed relative urls.
|
||||
The requested file will be searched for on the first mirror. If the
|
||||
grabber raises an exception (possibly after some retries) then that
|
||||
mirror will be removed from the list, and the next will be attempted.
|
||||
If all mirrors are exhausted, then an exception will be raised.
|
||||
|
||||
MirrorGroup has the following failover policy:
|
||||
|
||||
* downloads begin with the first mirror
|
||||
|
||||
* by default (see default_action below) a failure (after retries)
|
||||
causes it to increment the local AND master indices. Also,
|
||||
the current mirror is removed from the local list (but NOT the
|
||||
master list - the mirror can potentially be used for other
|
||||
files)
|
||||
|
||||
* if the local list is ever exhausted, a URLGrabError will be
|
||||
raised (errno=256, no more mirrors)
|
||||
|
||||
OPTIONS
|
||||
|
||||
In addition to the required arguments "grabber" and "mirrors",
|
||||
MirrorGroup also takes the following optional arguments:
|
||||
|
||||
default_action
|
||||
|
||||
A dict that describes the actions to be taken upon failure
|
||||
(after retries). default_action can contain any of the
|
||||
following keys (shown here with their default values):
|
||||
|
||||
default_action = {'increment': 1,
|
||||
'increment_master': 1,
|
||||
'remove': 1,
|
||||
'remove_master': 0,
|
||||
'fail': 0}
|
||||
|
||||
In this context, 'increment' means "use the next mirror" and
|
||||
'remove' means "never use this mirror again". The two
|
||||
'master' values refer to the instance-level mirror list (used
|
||||
for all files), whereas the non-master values refer to the
|
||||
current download only.
|
||||
|
||||
The 'fail' option will cause immediate failure by re-raising
|
||||
the exception and no further attempts to get the current
|
||||
download.
|
||||
|
||||
This dict can be set at instantiation time,
|
||||
mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
|
||||
at method-execution time (only applies to current fetch),
|
||||
filename = mg.urlgrab(url, default_action={'increment': 0})
|
||||
or by returning an action dict from the failure_callback
|
||||
return {'fail':0}
|
||||
in increasing precedence.
|
||||
|
||||
If all three of these were done, the net result would be:
|
||||
{'increment': 0, # set in method
|
||||
'increment_master': 1, # class default
|
||||
'remove': 1, # class default
|
||||
'remove_master': 0, # class default
|
||||
'fail': 0} # set at instantiation, reset
|
||||
# from callback
|
||||
|
||||
failure_callback
|
||||
|
||||
this is a callback that will be called when a mirror "fails",
|
||||
meaning the grabber raises some URLGrabError. If this is a
|
||||
tuple, it is interpreted to be of the form (cb, args, kwargs)
|
||||
where cb is the actual callable object (function, method,
|
||||
etc). Otherwise, it is assumed to be the callable object
|
||||
itself. The callback will be passed a grabber.CallbackObject
|
||||
instance along with args and kwargs (if present). The following
|
||||
attributes are defined withing the instance:
|
||||
|
||||
obj.exception = < exception that was raised >
|
||||
obj.mirror = < the mirror that was tried >
|
||||
obj.relative_url = < url relative to the mirror >
|
||||
obj.url = < full url that failed >
|
||||
# .url is just the combination of .mirror
|
||||
# and .relative_url
|
||||
|
||||
The failure callback can return an action dict, as described
|
||||
above.
|
||||
|
||||
Like default_action, the failure_callback can be set at
|
||||
instantiation time or when the urlXXX method is called. In
|
||||
the latter case, it applies only for that fetch.
|
||||
|
||||
The callback can re-raise the exception quite easily. For
|
||||
example, this is a perfectly adequate callback function:
|
||||
|
||||
def callback(obj): raise obj.exception
|
||||
|
||||
WARNING: do not save the exception object (or the
|
||||
CallbackObject instance). As they contain stack frame
|
||||
references, they can lead to circular references.
|
||||
|
||||
Notes:
|
||||
* The behavior can be customized by deriving and overriding the
|
||||
'CONFIGURATION METHODS'
|
||||
* The 'grabber' instance is kept as a reference, not copied.
|
||||
Therefore, the grabber instance can be modified externally
|
||||
and changes will take effect immediately.
|
||||
"""
|
||||
|
||||
# notes on thread-safety:
|
||||
|
||||
# A GrabRequest should never be shared by multiple threads because
|
||||
# it's never saved inside the MG object and never returned outside it.
|
||||
# therefore, it should be safe to access/modify grabrequest data
|
||||
# without a lock. However, accessing the mirrors and _next attributes
|
||||
# of the MG itself must be done when locked to prevent (for example)
|
||||
# removal of the wrong mirror.
|
||||
|
||||
##############################################################
|
||||
# CONFIGURATION METHODS - intended to be overridden to
|
||||
# customize behavior
|
||||
def __init__(self, grabber, mirrors, **kwargs):
|
||||
"""Initialize the MirrorGroup object.
|
||||
|
||||
REQUIRED ARGUMENTS
|
||||
|
||||
grabber - URLGrabber instance
|
||||
mirrors - a list of mirrors
|
||||
|
||||
OPTIONAL ARGUMENTS
|
||||
|
||||
failure_callback - callback to be used when a mirror fails
|
||||
default_action - dict of failure actions
|
||||
|
||||
See the module-level and class level documentation for more
|
||||
details.
|
||||
"""
|
||||
|
||||
# OVERRIDE IDEAS:
|
||||
# shuffle the list to randomize order
|
||||
self.grabber = grabber
|
||||
self.mirrors = self._parse_mirrors(mirrors)
|
||||
self._next = 0
|
||||
self._lock = thread.allocate_lock()
|
||||
self.default_action = None
|
||||
self._process_kwargs(kwargs)
|
||||
|
||||
# if these values are found in **kwargs passed to one of the urlXXX
|
||||
# methods, they will be stripped before getting passed on to the
|
||||
# grabber
|
||||
options = ['default_action', 'failure_callback']
|
||||
|
||||
def _process_kwargs(self, kwargs):
|
||||
self.failure_callback = kwargs.get('failure_callback')
|
||||
self.default_action = kwargs.get('default_action')
|
||||
|
||||
def _parse_mirrors(self, mirrors):
|
||||
parsed_mirrors = []
|
||||
for m in mirrors:
|
||||
if type(m) == type(''): m = {'mirror': m}
|
||||
parsed_mirrors.append(m)
|
||||
return parsed_mirrors
|
||||
|
||||
def _load_gr(self, gr):
|
||||
# OVERRIDE IDEAS:
|
||||
# shuffle gr list
|
||||
self._lock.acquire()
|
||||
gr.mirrors = list(self.mirrors)
|
||||
gr._next = self._next
|
||||
self._lock.release()
|
||||
|
||||
def _get_mirror(self, gr):
|
||||
# OVERRIDE IDEAS:
|
||||
# return a random mirror so that multiple mirrors get used
|
||||
# even without failures.
|
||||
if not gr.mirrors:
|
||||
raise URLGrabError(256, _('No more mirrors to try.'))
|
||||
return gr.mirrors[gr._next]
|
||||
|
||||
def _failure(self, gr, cb_obj):
|
||||
# OVERRIDE IDEAS:
|
||||
# inspect the error - remove=1 for 404, remove=2 for connection
|
||||
# refused, etc. (this can also be done via
|
||||
# the callback)
|
||||
cb = gr.kw.get('failure_callback') or self.failure_callback
|
||||
if cb:
|
||||
if type(cb) == type( () ):
|
||||
cb, args, kwargs = cb
|
||||
else:
|
||||
args, kwargs = (), {}
|
||||
action = cb(cb_obj, *args, **kwargs) or {}
|
||||
else:
|
||||
action = {}
|
||||
# XXXX - decide - there are two ways to do this
|
||||
# the first is action-overriding as a whole - use the entire action
|
||||
# or fall back on module level defaults
|
||||
#action = action or gr.kw.get('default_action') or self.default_action
|
||||
# the other is to fall through for each element in the action dict
|
||||
a = dict(self.default_action or {})
|
||||
a.update(gr.kw.get('default_action', {}))
|
||||
a.update(action)
|
||||
action = a
|
||||
self.increment_mirror(gr, action)
|
||||
if action and action.get('fail', 0): raise
|
||||
|
||||
def increment_mirror(self, gr, action={}):
|
||||
"""Tell the mirror object increment the mirror index
|
||||
|
||||
This increments the mirror index, which amounts to telling the
|
||||
mirror object to use a different mirror (for this and future
|
||||
downloads).
|
||||
|
||||
This is a SEMI-public method. It will be called internally,
|
||||
and you may never need to call it. However, it is provided
|
||||
(and is made public) so that the calling program can increment
|
||||
the mirror choice for methods like urlopen. For example, with
|
||||
urlopen, there's no good way for the mirror group to know that
|
||||
an error occurs mid-download (it's already returned and given
|
||||
you the file object).
|
||||
|
||||
remove --- can have several values
|
||||
0 do not remove the mirror from the list
|
||||
1 remove the mirror for this download only
|
||||
2 remove the mirror permanently
|
||||
|
||||
beware of remove=0 as it can lead to infinite loops
|
||||
"""
|
||||
badmirror = gr.mirrors[gr._next]
|
||||
|
||||
self._lock.acquire()
|
||||
try:
|
||||
ind = self.mirrors.index(badmirror)
|
||||
except ValueError:
|
||||
pass
|
||||
else:
|
||||
if action.get('remove_master', 0):
|
||||
del self.mirrors[ind]
|
||||
elif self._next == ind and action.get('increment_master', 1):
|
||||
self._next += 1
|
||||
if self._next >= len(self.mirrors): self._next = 0
|
||||
self._lock.release()
|
||||
|
||||
if action.get('remove', 1):
|
||||
del gr.mirrors[gr._next]
|
||||
elif action.get('increment', 1):
|
||||
gr._next += 1
|
||||
if gr._next >= len(gr.mirrors): gr._next = 0
|
||||
|
||||
if DEBUG:
|
||||
grm = [m['mirror'] for m in gr.mirrors]
|
||||
DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next)
|
||||
selfm = [m['mirror'] for m in self.mirrors]
|
||||
DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next)
|
||||
|
||||
#####################################################################
|
||||
# NON-CONFIGURATION METHODS
|
||||
# these methods are designed to be largely workhorse methods that
|
||||
# are not intended to be overridden. That doesn't mean you can't;
|
||||
# if you want to, feel free, but most things can be done by
|
||||
# by overriding the configuration methods :)
|
||||
|
||||
def _join_url(self, base_url, rel_url):
|
||||
if base_url.endswith('/') or rel_url.startswith('/'):
|
||||
return base_url + rel_url
|
||||
else:
|
||||
return base_url + '/' + rel_url
|
||||
|
||||
def _mirror_try(self, func, url, kw):
|
||||
gr = GrabRequest()
|
||||
gr.func = func
|
||||
gr.url = url
|
||||
gr.kw = dict(kw)
|
||||
self._load_gr(gr)
|
||||
|
||||
for k in self.options:
|
||||
try: del kw[k]
|
||||
except KeyError: pass
|
||||
|
||||
while 1:
|
||||
mirrorchoice = self._get_mirror(gr)
|
||||
fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
|
||||
kwargs = dict(mirrorchoice.get('kwargs', {}))
|
||||
kwargs.update(kw)
|
||||
grabber = mirrorchoice.get('grabber') or self.grabber
|
||||
func_ref = getattr(grabber, func)
|
||||
if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
|
||||
try:
|
||||
return func_ref( *(fullurl,), **kwargs )
|
||||
except URLGrabError, e:
|
||||
if DEBUG: DEBUG.info('MIRROR: failed')
|
||||
obj = CallbackObject()
|
||||
obj.exception = e
|
||||
obj.mirror = mirrorchoice['mirror']
|
||||
obj.relative_url = gr.url
|
||||
obj.url = fullurl
|
||||
self._failure(gr, obj)
|
||||
|
||||
def urlgrab(self, url, filename=None, **kwargs):
|
||||
kw = dict(kwargs)
|
||||
kw['filename'] = filename
|
||||
func = 'urlgrab'
|
||||
return self._mirror_try(func, url, kw)
|
||||
|
||||
def urlopen(self, url, **kwargs):
|
||||
kw = dict(kwargs)
|
||||
func = 'urlopen'
|
||||
return self._mirror_try(func, url, kw)
|
||||
|
||||
def urlread(self, url, limit=None, **kwargs):
|
||||
kw = dict(kwargs)
|
||||
kw['limit'] = limit
|
||||
func = 'urlread'
|
||||
return self._mirror_try(func, url, kw)
|
||||
|
||||
|
||||
class MGRandomStart(MirrorGroup):
|
||||
"""A mirror group that starts at a random mirror in the list.
|
||||
|
||||
This behavior of this class is identical to MirrorGroup, except that
|
||||
it starts at a random location in the mirror list.
|
||||
"""
|
||||
|
||||
def __init__(self, grabber, mirrors, **kwargs):
|
||||
"""Initialize the object
|
||||
|
||||
The arguments for intialization are the same as for MirrorGroup
|
||||
"""
|
||||
MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
|
||||
self._next = random.randrange(len(mirrors))
|
||||
|
||||
class MGRandomOrder(MirrorGroup):
|
||||
"""A mirror group that uses mirrors in a random order.
|
||||
|
||||
This behavior of this class is identical to MirrorGroup, except that
|
||||
it uses the mirrors in a random order. Note that the order is set at
|
||||
initialization time and fixed thereafter. That is, it does not pick a
|
||||
random mirror after each failure.
|
||||
"""
|
||||
|
||||
def __init__(self, grabber, mirrors, **kwargs):
|
||||
"""Initialize the object
|
||||
|
||||
The arguments for intialization are the same as for MirrorGroup
|
||||
"""
|
||||
MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
|
||||
random.shuffle(self.mirrors)
|
||||
|
||||
if __name__ == '__main__':
|
||||
pass
|
||||
@@ -1,530 +0,0 @@
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place, Suite 330,
|
||||
# Boston, MA 02111-1307 USA
|
||||
|
||||
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
|
||||
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
|
||||
|
||||
# $Id: progress.py,v 1.7 2005/08/19 21:59:07 mstenner Exp $
|
||||
|
||||
import sys
|
||||
import time
|
||||
import math
|
||||
import thread
|
||||
|
||||
class BaseMeter:
|
||||
def __init__(self):
|
||||
self.update_period = 0.3 # seconds
|
||||
|
||||
self.filename = None
|
||||
self.url = None
|
||||
self.basename = None
|
||||
self.text = None
|
||||
self.size = None
|
||||
self.start_time = None
|
||||
self.last_amount_read = 0
|
||||
self.last_update_time = None
|
||||
self.re = RateEstimator()
|
||||
|
||||
def start(self, filename=None, url=None, basename=None,
|
||||
size=None, now=None, text=None):
|
||||
self.filename = filename
|
||||
self.url = url
|
||||
self.basename = basename
|
||||
self.text = text
|
||||
|
||||
#size = None ######### TESTING
|
||||
self.size = size
|
||||
if not size is None: self.fsize = format_number(size) + 'B'
|
||||
|
||||
if now is None: now = time.time()
|
||||
self.start_time = now
|
||||
self.re.start(size, now)
|
||||
self.last_amount_read = 0
|
||||
self.last_update_time = now
|
||||
self._do_start(now)
|
||||
|
||||
def _do_start(self, now=None):
|
||||
pass
|
||||
|
||||
def update(self, amount_read, now=None):
|
||||
# for a real gui, you probably want to override and put a call
|
||||
# to your mainloop iteration function here
|
||||
if now is None: now = time.time()
|
||||
if (now >= self.last_update_time + self.update_period) or \
|
||||
not self.last_update_time:
|
||||
self.re.update(amount_read, now)
|
||||
self.last_amount_read = amount_read
|
||||
self.last_update_time = now
|
||||
self._do_update(amount_read, now)
|
||||
|
||||
def _do_update(self, amount_read, now=None):
|
||||
pass
|
||||
|
||||
def end(self, amount_read, now=None):
|
||||
if now is None: now = time.time()
|
||||
self.re.update(amount_read, now)
|
||||
self.last_amount_read = amount_read
|
||||
self.last_update_time = now
|
||||
self._do_end(amount_read, now)
|
||||
|
||||
def _do_end(self, amount_read, now=None):
|
||||
pass
|
||||
|
||||
class TextMeter(BaseMeter):
|
||||
def __init__(self, fo=sys.stderr):
|
||||
BaseMeter.__init__(self)
|
||||
self.fo = fo
|
||||
|
||||
def _do_update(self, amount_read, now=None):
|
||||
etime = self.re.elapsed_time()
|
||||
fetime = format_time(etime)
|
||||
fread = format_number(amount_read)
|
||||
#self.size = None
|
||||
if self.text is not None:
|
||||
text = self.text
|
||||
else:
|
||||
text = self.basename
|
||||
if self.size is None:
|
||||
out = '\r%-60.60s %5sB %s ' % \
|
||||
(text, fread, fetime)
|
||||
else:
|
||||
rtime = self.re.remaining_time()
|
||||
frtime = format_time(rtime)
|
||||
frac = self.re.fraction_read()
|
||||
bar = '='*int(25 * frac)
|
||||
|
||||
out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ETA ' % \
|
||||
(text, frac*100, bar, fread, frtime)
|
||||
|
||||
self.fo.write(out)
|
||||
self.fo.flush()
|
||||
|
||||
def _do_end(self, amount_read, now=None):
|
||||
total_time = format_time(self.re.elapsed_time())
|
||||
total_size = format_number(amount_read)
|
||||
if self.text is not None:
|
||||
text = self.text
|
||||
else:
|
||||
text = self.basename
|
||||
if self.size is None:
|
||||
out = '\r%-60.60s %5sB %s ' % \
|
||||
(text, total_size, total_time)
|
||||
else:
|
||||
bar = '='*25
|
||||
out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ' % \
|
||||
(text, 100, bar, total_size, total_time)
|
||||
self.fo.write(out + '\n')
|
||||
self.fo.flush()
|
||||
|
||||
text_progress_meter = TextMeter
|
||||
|
||||
class MultiFileHelper(BaseMeter):
|
||||
def __init__(self, master):
|
||||
BaseMeter.__init__(self)
|
||||
self.master = master
|
||||
|
||||
def _do_start(self, now):
|
||||
self.master.start_meter(self, now)
|
||||
|
||||
def _do_update(self, amount_read, now):
|
||||
# elapsed time since last update
|
||||
self.master.update_meter(self, now)
|
||||
|
||||
def _do_end(self, amount_read, now):
|
||||
self.ftotal_time = format_time(now - self.start_time)
|
||||
self.ftotal_size = format_number(self.last_amount_read)
|
||||
self.master.end_meter(self, now)
|
||||
|
||||
def failure(self, message, now=None):
|
||||
self.master.failure_meter(self, message, now)
|
||||
|
||||
def message(self, message):
|
||||
self.master.message_meter(self, message)
|
||||
|
||||
class MultiFileMeter:
|
||||
helperclass = MultiFileHelper
|
||||
def __init__(self):
|
||||
self.meters = []
|
||||
self.in_progress_meters = []
|
||||
self._lock = thread.allocate_lock()
|
||||
self.update_period = 0.3 # seconds
|
||||
|
||||
self.numfiles = None
|
||||
self.finished_files = 0
|
||||
self.failed_files = 0
|
||||
self.open_files = 0
|
||||
self.total_size = None
|
||||
self.failed_size = 0
|
||||
self.start_time = None
|
||||
self.finished_file_size = 0
|
||||
self.last_update_time = None
|
||||
self.re = RateEstimator()
|
||||
|
||||
def start(self, numfiles=None, total_size=None, now=None):
|
||||
if now is None: now = time.time()
|
||||
self.numfiles = numfiles
|
||||
self.finished_files = 0
|
||||
self.failed_files = 0
|
||||
self.open_files = 0
|
||||
self.total_size = total_size
|
||||
self.failed_size = 0
|
||||
self.start_time = now
|
||||
self.finished_file_size = 0
|
||||
self.last_update_time = now
|
||||
self.re.start(total_size, now)
|
||||
self._do_start(now)
|
||||
|
||||
def _do_start(self, now):
|
||||
pass
|
||||
|
||||
def end(self, now=None):
|
||||
if now is None: now = time.time()
|
||||
self._do_end(now)
|
||||
|
||||
def _do_end(self, now):
|
||||
pass
|
||||
|
||||
def lock(self): self._lock.acquire()
|
||||
def unlock(self): self._lock.release()
|
||||
|
||||
###########################################################
|
||||
# child meter creation and destruction
|
||||
def newMeter(self):
|
||||
newmeter = self.helperclass(self)
|
||||
self.meters.append(newmeter)
|
||||
return newmeter
|
||||
|
||||
def removeMeter(self, meter):
|
||||
self.meters.remove(meter)
|
||||
|
||||
###########################################################
|
||||
# child functions - these should only be called by helpers
|
||||
def start_meter(self, meter, now):
|
||||
if not meter in self.meters:
|
||||
raise ValueError('attempt to use orphaned meter')
|
||||
self._lock.acquire()
|
||||
try:
|
||||
if not meter in self.in_progress_meters:
|
||||
self.in_progress_meters.append(meter)
|
||||
self.open_files += 1
|
||||
finally:
|
||||
self._lock.release()
|
||||
self._do_start_meter(meter, now)
|
||||
|
||||
def _do_start_meter(self, meter, now):
|
||||
pass
|
||||
|
||||
def update_meter(self, meter, now):
|
||||
if not meter in self.meters:
|
||||
raise ValueError('attempt to use orphaned meter')
|
||||
if (now >= self.last_update_time + self.update_period) or \
|
||||
not self.last_update_time:
|
||||
self.re.update(self._amount_read(), now)
|
||||
self.last_update_time = now
|
||||
self._do_update_meter(meter, now)
|
||||
|
||||
def _do_update_meter(self, meter, now):
|
||||
pass
|
||||
|
||||
def end_meter(self, meter, now):
|
||||
if not meter in self.meters:
|
||||
raise ValueError('attempt to use orphaned meter')
|
||||
self._lock.acquire()
|
||||
try:
|
||||
try: self.in_progress_meters.remove(meter)
|
||||
except ValueError: pass
|
||||
self.open_files -= 1
|
||||
self.finished_files += 1
|
||||
self.finished_file_size += meter.last_amount_read
|
||||
finally:
|
||||
self._lock.release()
|
||||
self._do_end_meter(meter, now)
|
||||
|
||||
def _do_end_meter(self, meter, now):
|
||||
pass
|
||||
|
||||
def failure_meter(self, meter, message, now):
|
||||
if not meter in self.meters:
|
||||
raise ValueError('attempt to use orphaned meter')
|
||||
self._lock.acquire()
|
||||
try:
|
||||
try: self.in_progress_meters.remove(meter)
|
||||
except ValueError: pass
|
||||
self.open_files -= 1
|
||||
self.failed_files += 1
|
||||
if meter.size and self.failed_size is not None:
|
||||
self.failed_size += meter.size
|
||||
else:
|
||||
self.failed_size = None
|
||||
finally:
|
||||
self._lock.release()
|
||||
self._do_failure_meter(meter, message, now)
|
||||
|
||||
def _do_failure_meter(self, meter, message, now):
|
||||
pass
|
||||
|
||||
def message_meter(self, meter, message):
|
||||
pass
|
||||
|
||||
########################################################
|
||||
# internal functions
|
||||
def _amount_read(self):
|
||||
tot = self.finished_file_size
|
||||
for m in self.in_progress_meters:
|
||||
tot += m.last_amount_read
|
||||
return tot
|
||||
|
||||
|
||||
class TextMultiFileMeter(MultiFileMeter):
|
||||
def __init__(self, fo=sys.stderr):
|
||||
self.fo = fo
|
||||
MultiFileMeter.__init__(self)
|
||||
|
||||
# files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
|
||||
def _do_update_meter(self, meter, now):
|
||||
self._lock.acquire()
|
||||
try:
|
||||
format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \
|
||||
"time: %8.8s/%8.8s"
|
||||
df = self.finished_files
|
||||
tf = self.numfiles or 1
|
||||
pf = 100 * float(df)/tf + 0.49
|
||||
dd = self.re.last_amount_read
|
||||
td = self.total_size
|
||||
pd = 100 * (self.re.fraction_read() or 0) + 0.49
|
||||
dt = self.re.elapsed_time()
|
||||
rt = self.re.remaining_time()
|
||||
if rt is None: tt = None
|
||||
else: tt = dt + rt
|
||||
|
||||
fdd = format_number(dd) + 'B'
|
||||
ftd = format_number(td) + 'B'
|
||||
fdt = format_time(dt, 1)
|
||||
ftt = format_time(tt, 1)
|
||||
|
||||
out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
|
||||
self.fo.write('\r' + out)
|
||||
self.fo.flush()
|
||||
finally:
|
||||
self._lock.release()
|
||||
|
||||
def _do_end_meter(self, meter, now):
|
||||
self._lock.acquire()
|
||||
try:
|
||||
format = "%-30.30s %6.6s %8.8s %9.9s"
|
||||
fn = meter.basename
|
||||
size = meter.last_amount_read
|
||||
fsize = format_number(size) + 'B'
|
||||
et = meter.re.elapsed_time()
|
||||
fet = format_time(et, 1)
|
||||
frate = format_number(size / et) + 'B/s'
|
||||
|
||||
out = '%-79.79s' % (format % (fn, fsize, fet, frate))
|
||||
self.fo.write('\r' + out + '\n')
|
||||
finally:
|
||||
self._lock.release()
|
||||
self._do_update_meter(meter, now)
|
||||
|
||||
def _do_failure_meter(self, meter, message, now):
|
||||
self._lock.acquire()
|
||||
try:
|
||||
format = "%-30.30s %6.6s %s"
|
||||
fn = meter.basename
|
||||
if type(message) in (type(''), type(u'')):
|
||||
message = message.splitlines()
|
||||
if not message: message = ['']
|
||||
out = '%-79s' % (format % (fn, 'FAILED', message[0] or ''))
|
||||
self.fo.write('\r' + out + '\n')
|
||||
for m in message[1:]: self.fo.write(' ' + m + '\n')
|
||||
self._lock.release()
|
||||
finally:
|
||||
self._do_update_meter(meter, now)
|
||||
|
||||
def message_meter(self, meter, message):
|
||||
self._lock.acquire()
|
||||
try:
|
||||
pass
|
||||
finally:
|
||||
self._lock.release()
|
||||
|
||||
def _do_end(self, now):
|
||||
self._do_update_meter(None, now)
|
||||
self._lock.acquire()
|
||||
try:
|
||||
self.fo.write('\n')
|
||||
self.fo.flush()
|
||||
finally:
|
||||
self._lock.release()
|
||||
|
||||
######################################################################
|
||||
# support classes and functions
|
||||
|
||||
class RateEstimator:
|
||||
def __init__(self, timescale=5.0):
|
||||
self.timescale = timescale
|
||||
|
||||
def start(self, total=None, now=None):
|
||||
if now is None: now = time.time()
|
||||
self.total = total
|
||||
self.start_time = now
|
||||
self.last_update_time = now
|
||||
self.last_amount_read = 0
|
||||
self.ave_rate = None
|
||||
|
||||
def update(self, amount_read, now=None):
|
||||
if now is None: now = time.time()
|
||||
if amount_read == 0:
|
||||
# if we just started this file, all bets are off
|
||||
self.last_update_time = now
|
||||
self.last_amount_read = 0
|
||||
self.ave_rate = None
|
||||
return
|
||||
|
||||
#print 'times', now, self.last_update_time
|
||||
time_diff = now - self.last_update_time
|
||||
read_diff = amount_read - self.last_amount_read
|
||||
self.last_update_time = now
|
||||
self.last_amount_read = amount_read
|
||||
self.ave_rate = self._temporal_rolling_ave(\
|
||||
time_diff, read_diff, self.ave_rate, self.timescale)
|
||||
#print 'results', time_diff, read_diff, self.ave_rate
|
||||
|
||||
#####################################################################
|
||||
# result methods
|
||||
def average_rate(self):
|
||||
"get the average transfer rate (in bytes/second)"
|
||||
return self.ave_rate
|
||||
|
||||
def elapsed_time(self):
|
||||
"the time between the start of the transfer and the most recent update"
|
||||
return self.last_update_time - self.start_time
|
||||
|
||||
def remaining_time(self):
|
||||
"estimated time remaining"
|
||||
if not self.ave_rate or not self.total: return None
|
||||
return (self.total - self.last_amount_read) / self.ave_rate
|
||||
|
||||
def fraction_read(self):
|
||||
"""the fraction of the data that has been read
|
||||
(can be None for unknown transfer size)"""
|
||||
if self.total is None: return None
|
||||
elif self.total == 0: return 1.0
|
||||
else: return float(self.last_amount_read)/self.total
|
||||
|
||||
#########################################################################
|
||||
# support methods
|
||||
def _temporal_rolling_ave(self, time_diff, read_diff, last_ave, timescale):
|
||||
"""a temporal rolling average performs smooth averaging even when
|
||||
updates come at irregular intervals. This is performed by scaling
|
||||
the "epsilon" according to the time since the last update.
|
||||
Specifically, epsilon = time_diff / timescale
|
||||
|
||||
As a general rule, the average will take on a completely new value
|
||||
after 'timescale' seconds."""
|
||||
epsilon = time_diff / timescale
|
||||
if epsilon > 1: epsilon = 1.0
|
||||
return self._rolling_ave(time_diff, read_diff, last_ave, epsilon)
|
||||
|
||||
def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon):
|
||||
"""perform a "rolling average" iteration
|
||||
a rolling average "folds" new data into an existing average with
|
||||
some weight, epsilon. epsilon must be between 0.0 and 1.0 (inclusive)
|
||||
a value of 0.0 means only the old value (initial value) counts,
|
||||
and a value of 1.0 means only the newest value is considered."""
|
||||
|
||||
try:
|
||||
recent_rate = read_diff / time_diff
|
||||
except ZeroDivisionError:
|
||||
recent_rate = None
|
||||
if last_ave is None: return recent_rate
|
||||
elif recent_rate is None: return last_ave
|
||||
|
||||
# at this point, both last_ave and recent_rate are numbers
|
||||
return epsilon * recent_rate + (1 - epsilon) * last_ave
|
||||
|
||||
def _round_remaining_time(self, rt, start_time=15.0):
|
||||
"""round the remaining time, depending on its size
|
||||
If rt is between n*start_time and (n+1)*start_time round downward
|
||||
to the nearest multiple of n (for any counting number n).
|
||||
If rt < start_time, round down to the nearest 1.
|
||||
For example (for start_time = 15.0):
|
||||
2.7 -> 2.0
|
||||
25.2 -> 25.0
|
||||
26.4 -> 26.0
|
||||
35.3 -> 34.0
|
||||
63.6 -> 60.0
|
||||
"""
|
||||
|
||||
if rt < 0: return 0.0
|
||||
shift = int(math.log(rt/start_time)/math.log(2))
|
||||
rt = int(rt)
|
||||
if shift <= 0: return rt
|
||||
return float(int(rt) >> shift << shift)
|
||||
|
||||
|
||||
def format_time(seconds, use_hours=0):
|
||||
if seconds is None or seconds < 0:
|
||||
if use_hours: return '--:--:--'
|
||||
else: return '--:--'
|
||||
else:
|
||||
seconds = int(seconds)
|
||||
minutes = seconds / 60
|
||||
seconds = seconds % 60
|
||||
if use_hours:
|
||||
hours = minutes / 60
|
||||
minutes = minutes % 60
|
||||
return '%02i:%02i:%02i' % (hours, minutes, seconds)
|
||||
else:
|
||||
return '%02i:%02i' % (minutes, seconds)
|
||||
|
||||
def format_number(number, SI=0, space=' '):
|
||||
"""Turn numbers into human-readable metric-like numbers"""
|
||||
symbols = ['', # (none)
|
||||
'k', # kilo
|
||||
'M', # mega
|
||||
'G', # giga
|
||||
'T', # tera
|
||||
'P', # peta
|
||||
'E', # exa
|
||||
'Z', # zetta
|
||||
'Y'] # yotta
|
||||
|
||||
if SI: step = 1000.0
|
||||
else: step = 1024.0
|
||||
|
||||
thresh = 999
|
||||
depth = 0
|
||||
max_depth = len(symbols) - 1
|
||||
|
||||
# we want numbers between 0 and thresh, but don't exceed the length
|
||||
# of our list. In that event, the formatting will be screwed up,
|
||||
# but it'll still show the right number.
|
||||
while number > thresh and depth < max_depth:
|
||||
depth = depth + 1
|
||||
number = number / step
|
||||
|
||||
if type(number) == type(1) or type(number) == type(1L):
|
||||
# it's an int or a long, which means it didn't get divided,
|
||||
# which means it's already short enough
|
||||
format = '%i%s%s'
|
||||
elif number < 9.95:
|
||||
# must use 9.95 for proper sizing. For example, 9.99 will be
|
||||
# rounded to 10.0 with the .1f format string (which is too long)
|
||||
format = '%.1f%s%s'
|
||||
else:
|
||||
format = '%.0f%s%s'
|
||||
|
||||
return(format % (float(number or 0), space, symbols[depth]))
|
||||
@@ -1,90 +0,0 @@
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 59 Temple Place, Suite 330,
|
||||
# Boston, MA 02111-1307 USA
|
||||
|
||||
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
|
||||
|
||||
import httplib
|
||||
import urllib2
|
||||
|
||||
try:
|
||||
from M2Crypto import SSL
|
||||
from M2Crypto import httpslib
|
||||
from M2Crypto import m2urllib2
|
||||
|
||||
SSL.Connection.clientPostConnectionCheck = None
|
||||
have_m2crypto = True
|
||||
except ImportError:
|
||||
have_m2crypto = False
|
||||
|
||||
DEBUG = None
|
||||
|
||||
if have_m2crypto:
|
||||
|
||||
class M2SSLFactory:
|
||||
|
||||
def __init__(self, ssl_ca_cert, ssl_context):
|
||||
self.ssl_context = self._get_ssl_context(ssl_ca_cert, ssl_context)
|
||||
|
||||
def _get_ssl_context(self, ssl_ca_cert, ssl_context):
|
||||
"""
|
||||
Create an ssl context using the CA cert file or ssl context.
|
||||
|
||||
The CA cert is used first if it was passed as an option. If not,
|
||||
then the supplied ssl context is used. If no ssl context was supplied,
|
||||
None is returned.
|
||||
"""
|
||||
if ssl_ca_cert:
|
||||
context = SSL.Context()
|
||||
context.load_verify_locations(ssl_ca_cert)
|
||||
context.set_verify(SSL.verify_none, -1)
|
||||
return context
|
||||
else:
|
||||
return ssl_context
|
||||
|
||||
def create_https_connection(self, host, response_class = None):
|
||||
connection = httplib.HTTPSConnection(host, self.ssl_context)
|
||||
if response_class:
|
||||
connection.response_class = response_class
|
||||
return connection
|
||||
|
||||
def create_opener(self, *handlers):
|
||||
return m2urllib2.build_opener(self.ssl_context, *handlers)
|
||||
|
||||
|
||||
class SSLFactory:
|
||||
|
||||
def create_https_connection(self, host, response_class = None):
|
||||
connection = httplib.HTTPSConnection(host)
|
||||
if response_class:
|
||||
connection.response_class = response_class
|
||||
return connection
|
||||
|
||||
def create_opener(self, *handlers):
|
||||
return urllib2.build_opener(*handlers)
|
||||
|
||||
|
||||
|
||||
def get_factory(ssl_ca_cert = None, ssl_context = None):
|
||||
""" Return an SSLFactory, based on if M2Crypto is available. """
|
||||
if have_m2crypto:
|
||||
return M2SSLFactory(ssl_ca_cert, ssl_context)
|
||||
else:
|
||||
# Log here if someone provides the args but we don't use them.
|
||||
if ssl_ca_cert or ssl_context:
|
||||
if DEBUG:
|
||||
DEBUG.warning("SSL arguments supplied, but M2Crypto is not available. "
|
||||
"Using Python SSL.")
|
||||
return SSLFactory()
|
||||
Reference in New Issue
Block a user