wic: Remove 3rdparty/urlgrabber

wic doesn't use it, so remove it.

(From OE-Core rev: 00dcdb29c89634ab267d328eb00f8eb70c696655)

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
This commit is contained in:
Tom Zanussi
2014-08-04 07:55:07 -05:00
committed by Richard Purdie
parent ba19b60fd2
commit c9b5ea0873
7 changed files with 0 additions and 3688 deletions

View File

@@ -1,53 +0,0 @@
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko
# $Id: __init__.py,v 1.20 2006/09/22 00:58:55 mstenner Exp $
"""A high-level cross-protocol url-grabber.

Using urlgrabber, data can be fetched in three basic ways:

  urlgrab(url)  copy the file to the local filesystem
  urlopen(url)  open the remote file and return a file object
                (like urllib2.urlopen)
  urlread(url)  return the contents of the file as a string

When using these functions (or methods), urlgrabber supports the
following features:

  * identical behavior for http://, ftp://, and file:// urls
  * http keepalive - faster downloads of many files by using
    only a single connection
  * byte ranges - fetch only a portion of the file
  * reget - for a urlgrab, resume a partial download
  * progress meters - the ability to report download progress
    automatically, even when using urlopen!
  * throttling - restrict bandwidth usage
  * retries - automatically retry a download if it fails. The
    number of retries and failure types are configurable.
  * authenticated server access for http and ftp
  * proxy support - support for authenticated http and ftp proxies
  * mirror groups - treat a list of mirrors as a single source,
    automatically switching mirrors if there is a failure.
"""

# Package metadata.
__version__ = '3.1.0'
__date__ = '2006/09/21'
__author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \
             'Ryan Tomayko <rtomayko@naeblis.cx>'
__url__ = 'http://linux.duke.edu/projects/urlgrabber/'

# Re-export the primary entry points (Python 2 implicit relative
# import of the sibling grabber module).
from grabber import urlgrab, urlopen, urlread

View File

@@ -1,463 +0,0 @@
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
# $Id: byterange.py,v 1.12 2006/07/20 20:15:58 mstenner Exp $
import os
import stat
import urllib
import urllib2
import rfc822
DEBUG = None
try:
from cStringIO import StringIO
except ImportError, msg:
from StringIO import StringIO
class RangeError(IOError):
    """Error raised when an unsatisfiable range is requested."""
    pass
class HTTPRangeHandler(urllib2.BaseHandler):
    """Handler that enables HTTP Range headers.

    This was extremely simple.  The Range header is a HTTP feature to
    begin with so all this class does is tell urllib2 that the
    "206 Partial Content" response from the HTTP server is what we
    expected.

    Example:
        import urllib2
        import byterange

        range_handler = range.HTTPRangeHandler()
        opener = urllib2.build_opener(range_handler)

        # install it
        urllib2.install_opener(opener)

        # create Request and set Range header
        req = urllib2.Request('http://www.python.org/')
        req.header['Range'] = 'bytes=30-50'
        f = urllib2.urlopen(req)
    """

    def http_error_206(self, req, fp, code, msg, hdrs):
        # 206 Partial Content Response: the server honored our Range
        # request, so wrap it up like a normal 200 result.
        r = urllib.addinfourl(fp, hdrs, req.get_full_url())
        r.code = code
        r.msg = msg
        return r

    def http_error_416(self, req, fp, code, msg, hdrs):
        # HTTP's Range Not Satisfiable error
        raise RangeError('Requested Range Not Satisfiable')
class HTTPSRangeHandler(HTTPRangeHandler):
    """Range header support for HTTPS.

    urllib2 dispatches https_error_* methods for HTTPS responses;
    delegate both to the HTTP implementations so HTTPS behaves exactly
    like HTTPRangeHandler.
    """

    def https_error_206(self, req, fp, code, msg, hdrs):
        return self.http_error_206(req, fp, code, msg, hdrs)

    def https_error_416(self, req, fp, code, msg, hdrs):
        # BUGFIX: the original called https_error_416 (i.e. itself),
        # recursing until the stack blew instead of raising RangeError.
        # Delegate to http_error_416, which raises
        # RangeError('Requested Range Not Satisfiable').
        return self.http_error_416(req, fp, code, msg, hdrs)
class RangeableFileObject:
    """File object wrapper to enable raw range handling.

    This was implemented primarily for handling range
    specifications for file:// urls.  This object effectively makes
    a file object look like it consists only of a range of bytes in
    the stream.

    Examples:
        # expose 10 bytes, starting at byte position 20, from
        # /etc/aliases.
        >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30))
        # seek seeks within the range (to position 23 in this case)
        >>> fo.seek(3)
        # tell tells where you are at _within the range_ (position 3 in
        # this case)
        >>> fo.tell()
        # read EOFs if an attempt is made to read past the last
        # byte in the range. the following will return only 7 bytes.
        >>> fo.read(30)
    """

    def __init__(self, fo, rangetup):
        """Create a RangeableFileObject.

        fo       -- a file like object. only the read() method need be
                    supported but supporting an optimized seek() is
                    preferable.
        rangetup -- a (firstbyte,lastbyte) tuple specifying the range
                    to work over.
        The file object provided is assumed to be at byte offset 0.
        """
        self.fo = fo
        # Normalized range; lastbyte may be '' meaning "to end of file".
        (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup)
        # realpos tracks the absolute position in the wrapped file.
        self.realpos = 0
        self._do_seek(self.firstbyte)

    def __getattr__(self, name):
        """This effectively allows us to wrap at the instance level.
        Any attribute not found in _this_ object will be searched for
        in self.fo.  This includes methods."""
        if hasattr(self.fo, name):
            return getattr(self.fo, name)
        raise AttributeError, name

    def tell(self):
        """Return the position within the range.

        This is different from fo.seek in that position 0 is the
        first byte position of the range tuple. For example, if
        this object was created with a range tuple of (500,899),
        tell() will return 0 when at byte position 500 of the file.
        """
        return (self.realpos - self.firstbyte)

    def seek(self,offset,whence=0):
        """Seek within the byte range.
        Positioning is identical to that described under tell().
        """
        assert whence in (0, 1, 2)
        if whence == 0:   # absolute seek
            realoffset = self.firstbyte + offset
        elif whence == 1: # relative seek
            realoffset = self.realpos + offset
        elif whence == 2: # absolute from end of file
            # XXX: are we raising the right Error here?
            raise IOError('seek from end of file not supported.')

        # do not allow seek past lastbyte in range
        if self.lastbyte and (realoffset >= self.lastbyte):
            realoffset = self.lastbyte

        # _do_seek takes an offset relative to the current real position.
        self._do_seek(realoffset - self.realpos)

    def read(self, size=-1):
        """Read within the range.
        This method will limit the size read based on the range.
        """
        size = self._calc_read_size(size)
        rslt = self.fo.read(size)
        self.realpos += len(rslt)
        return rslt

    def readline(self, size=-1):
        """Read lines within the range.
        This method will limit the size read based on the range.
        """
        size = self._calc_read_size(size)
        rslt = self.fo.readline(size)
        self.realpos += len(rslt)
        return rslt

    def _calc_read_size(self, size):
        """Handles calculating the amount of data to read based on
        the range.
        """
        if self.lastbyte:
            if size > -1:
                # Clamp the requested size so we never read past lastbyte.
                if ((self.realpos + size) >= self.lastbyte):
                    size = (self.lastbyte - self.realpos)
            else:
                # "read everything" becomes "read to end of range".
                size = (self.lastbyte - self.realpos)
        return size

    def _do_seek(self,offset):
        """Seek based on whether wrapped object supports seek().
        offset is relative to the current position (self.realpos).
        """
        assert offset >= 0
        if not hasattr(self.fo, 'seek'):
            self._poor_mans_seek(offset)
        else:
            self.fo.seek(self.realpos + offset)
        self.realpos+= offset

    def _poor_mans_seek(self,offset):
        """Seek by calling the wrapped file objects read() method.
        This is used for file like objects that do not have native
        seek support.  The wrapped objects read() method is called
        to manually seek to the desired position.
        offset -- read this number of bytes from the wrapped
                  file object.
        raise RangeError if we encounter EOF before reaching the
        specified offset.
        """
        pos = 0
        bufsize = 1024
        while pos < offset:
            if (pos + bufsize) > offset:
                bufsize = offset - pos
            buf = self.fo.read(bufsize)
            if len(buf) != bufsize:
                # Short read means EOF before the start of the range.
                raise RangeError('Requested Range Not Satisfiable')
            pos+= bufsize
class FileRangeHandler(urllib2.FileHandler):
    """FileHandler subclass that adds Range support.
    This class handles Range headers exactly like an HTTP
    server would.
    """

    def open_local_file(self, req):
        import mimetypes
        import mimetools
        host = req.get_host()
        file = req.get_selector()
        localfile = urllib.url2pathname(file)
        stats = os.stat(localfile)
        size = stats[stat.ST_SIZE]
        modified = rfc822.formatdate(stats[stat.ST_MTIME])
        mtype = mimetypes.guess_type(file)[0]
        if host:
            host, port = urllib.splitport(host)
            # NOTE(review): `socket` is imported further down this module;
            # that is fine at call time in Python 2, but confirm import
            # order if this code is ever refactored.
            if port or socket.gethostbyname(host) not in self.get_names():
                raise urllib2.URLError('file not on local host')
        fo = open(localfile,'rb')
        brange = req.headers.get('Range',None)
        brange = range_header_to_tuple(brange)
        # () means the header was present but malformed.
        assert brange != ()
        if brange:
            (fb,lb) = brange
            if lb == '': lb = size
            if fb < 0 or fb > size or lb > size:
                raise RangeError('Requested Range Not Satisfiable')
            size = (lb - fb)
            # Wrap the file so reads are confined to the byte range.
            fo = RangeableFileObject(fo, (fb,lb))
        headers = mimetools.Message(StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        return urllib.addinfourl(fo, headers, 'file:'+file)
# FTP Range Support
# Unfortunately, a large amount of base FTP code had to be copied
# from urllib and urllib2 in order to insert the FTP REST command.
# Code modifications for range support have been commented as
# follows:
# -- range support modifications start/end here
from urllib import splitport, splituser, splitpasswd, splitattr, \
unquote, addclosehook, addinfourl
import ftplib
import socket
import sys
import ftplib
import mimetypes
import mimetools
class FTPRangeHandler(urllib2.FTPHandler):
    """FTPHandler subclass with byte-range support via the FTP REST
    command.  The bulk of ftp_open is copied from urllib/urllib2; the
    range-specific additions are bracketed by comments below."""

    def ftp_open(self, req):
        host = req.get_host()
        if not host:
            raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise urllib2.URLError(msg)
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = map(unquote, dirs)
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs)
            # 'I' (image/binary) for files, 'D' (directory) otherwise.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitattr(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()

            # -- range support modifications start here
            rest = None
            range_tup = range_header_to_tuple(req.headers.get('Range',None))
            assert range_tup != ()
            if range_tup:
                (fb,lb) = range_tup
                # REST tells the server where to start the transfer.
                if fb > 0: rest = fb
            # -- range support modifications end here

            fp, retrlen = fw.retrfile(file, type, rest)

            # -- range support modifications start here
            if range_tup:
                (fb,lb) = range_tup
                if lb == '':
                    # Open-ended range: need the real length to size it.
                    if retrlen is None or retrlen == 0:
                        raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
                    lb = retrlen
                    retrlen = lb - fb
                    if retrlen < 0:
                        # beginning of range is larger than file
                        raise RangeError('Requested Range Not Satisfiable')
                else:
                    retrlen = lb - fb
                    # Server already seeked to fb via REST, so wrap from 0.
                    fp = RangeableFileObject(fp, (0,retrlen))
            # -- range support modifications end here

            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            sf = StringIO(headers)
            headers = mimetools.Message(sf)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors, msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def connect_ftp(self, user, passwd, host, port, dirs):
        # Build the REST-capable wrapper defined below.
        fw = ftpwrapper(user, passwd, host, port, dirs)
        return fw
class ftpwrapper(urllib.ftpwrapper):
    # range support note:
    #  this ftpwrapper code is copied directly from
    #  urllib. The only enhancement is to add the rest
    #  argument and pass it on to ftp.ntransfercmd
    def retrfile(self, file, type, rest=None):
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Connection may have dropped; reconnect and retry once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                # rest (if set) makes the server resume at that offset.
                conn = self.ftp.ntransfercmd(cmd, rest)
            except ftplib.error_perm, reason:
                if str(reason)[:3] == '501':
                    # workaround for REST not supported error: fetch the
                    # whole file and discard the prefix client-side.
                    fp, retrlen = self.retrfile(file, type)
                    fp = RangeableFileObject(fp, (rest,''))
                    return (fp, retrlen)
                elif str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])
####################################################################
# Range Tuple Functions
# XXX: These range tuple functions might go better in a class.
_rangere = None

def range_header_to_tuple(range_header):
    """Parse a Range header value into a (firstbyte, lastbyte) tuple.

    Range headers have the form "bytes=<firstbyte>-<lastbyte>".  If the
    last byte is not specified in the header, it appears as '' in the
    returned tuple; otherwise the returned last byte is exclusive
    (header values are inclusive, so one is added).

    Return None if range_header is None, and () if range_header does
    not conform to the range spec pattern.
    """
    global _rangere
    if range_header is None:
        return None
    # Compile the pattern lazily, once, caching it at module level.
    if _rangere is None:
        import re
        _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)')
    match = _rangere.match(range_header)
    if not match:
        return ()
    parsed = range_tuple_normalize(match.group(1, 2))
    if parsed and parsed[1]:
        # Convert the inclusive header end into an exclusive bound.
        parsed = (parsed[0], parsed[1] + 1)
    return parsed
def range_tuple_to_header(range_tup):
    """Render a (firstbyte, lastbyte) tuple as a Range header value.

    Return a string of the form "bytes=<firstbyte>-<lastbyte>", or
    None when no Range header is needed (i.e. the tuple covers the
    whole file).
    """
    if range_tup is None:
        return None
    normalized = range_tuple_normalize(range_tup)
    if not normalized:
        return None
    first, last = normalized
    if last:
        # Internal ranges are exclusive; header ranges are inclusive.
        last = last - 1
    return 'bytes=%s-%s' % (first, last)
def range_tuple_normalize(range_tup):
    """Normalize a (first_byte, last_byte) range tuple.

    The first element of the result is always an int; the second is
    either an int or '' (meaning: through the last byte).  Return None
    when the normalized tuple would be (0, ''), since that is
    equivalent to retrieving the entire file.

    Raises RangeError for an integer last byte below the first byte.
    """
    if range_tup is None:
        return None

    # First byte: missing or empty means "from the start of the file".
    first = range_tup[0]
    if first in (None, ''):
        first = 0
    else:
        first = int(first)

    # Last byte: an absent element, None, or '' all mean "to the end".
    if len(range_tup) > 1:
        last = range_tup[1]
        if last is None:
            last = ''
        elif last != '':
            last = int(last)
    else:
        last = ''

    # A (0, '') range covers the whole file - no range needed.
    if (first, last) == (0, ''):
        return None
    # Reject inverted integer ranges.
    if last < first:
        raise RangeError('Invalid byte range: %s-%s' % (first, last))
    return (first, last)

File diff suppressed because it is too large Load Diff

View File

@@ -1,617 +0,0 @@
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
>>> import urllib2
>>> from keepalive import HTTPHandler
>>> keepalive_handler = HTTPHandler()
>>> opener = urllib2.build_opener(keepalive_handler)
>>> urllib2.install_opener(opener)
>>>
>>> fo = urllib2.urlopen('http://www.python.org')
If a connection to a given host is requested, and all of the existing
connections are still in use, another connection will be opened. If
the handler tries to use an existing connection but it fails in some
way, it will be closed and removed from the pool.
To remove the handler, simply re-run build_opener with no arguments, and
install that opener.
You can explicitly close connections by using the close_connection()
method of the returned file-like object (described below) or you can
use the handler methods:
close_connection(host)
close_all()
open_connections()
NOTE: using the close_connection and close_all methods of the handler
should be done with care when using multiple threads.
* there is nothing that prevents another thread from creating new
connections immediately after connections are closed
* no checks are done to prevent in-use connections from being closed
>>> keepalive_handler.close_all()
EXTRA ATTRIBUTES AND METHODS
Upon a status of 200, the object returned has a few additional
attributes and methods, which should not be used if you want to
remain consistent with the normal urllib2-returned objects:
close_connection() - close the connection to the host
readlines() - you know, readlines()
status - the return status (ie 404)
reason - english translation of status (ie 'File not found')
If you want the best of both worlds, use this inside an
AttributeError-catching try:
>>> try: status = fo.status
>>> except AttributeError: status = None
Unfortunately, these are ONLY there if status == 200, so it's not
easy to distinguish between non-200 responses. The reason is that
urllib2 tries to do clever things with error codes 301, 302, 401,
and 407, and it wraps the object upon return.
For python versions earlier than 2.4, you can avoid this fancy error
handling by setting the module-level global HANDLE_ERRORS to zero.
You see, prior to 2.4, it's the HTTP Handler's job to determine what
to handle specially, and what to just pass up. HANDLE_ERRORS == 0
means "pass everything up". In python 2.4, however, this job no
longer belongs to the HTTP Handler and is now done by a NEW handler,
HTTPErrorProcessor. Here's the bottom line:
python version < 2.4
HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
errors
HANDLE_ERRORS == 0 pass everything up, error processing is
left to the calling code
python version >= 2.4
HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
HANDLE_ERRORS == 0 (default) pass everything up, let the
other handlers (specifically,
HTTPErrorProcessor) decide what to do
In practice, setting the variable either way makes little difference
in python 2.4, so for the most consistent behavior across versions,
you probably just want to use the defaults, which will give you
exceptions on errors.
"""
# $Id: keepalive.py,v 1.16 2006/09/22 00:58:05 mstenner Exp $
import urllib2
import httplib
import socket
import thread
DEBUG = None
import sslfactory
import sys
if sys.version_info < (2, 4): HANDLE_ERRORS = 1
else: HANDLE_ERRORS = 0
class ConnectionManager:
    """
    The connection manager must be able to:
      * keep track of all existing connections, which host each belongs
        to, and whether each is ready (idle) for reuse
    """
    def __init__(self):
        # All three maps are guarded by this lock.
        self._lock = thread.allocate_lock()
        self._hostmap = {} # map hosts to a list of connections
        self._connmap = {} # map connections to host
        self._readymap = {} # map connection to ready state

    def add(self, host, connection, ready):
        # Register a new connection for host with the given ready state.
        self._lock.acquire()
        try:
            if not self._hostmap.has_key(host): self._hostmap[host] = []
            self._hostmap[host].append(connection)
            self._connmap[connection] = host
            self._readymap[connection] = ready
        finally:
            self._lock.release()

    def remove(self, connection):
        # Forget a connection entirely; no-op if it is unknown.
        self._lock.acquire()
        try:
            try:
                host = self._connmap[connection]
            except KeyError:
                pass
            else:
                del self._connmap[connection]
                del self._readymap[connection]
                self._hostmap[host].remove(connection)
                if not self._hostmap[host]: del self._hostmap[host]
        finally:
            self._lock.release()

    def set_ready(self, connection, ready):
        # Mark a connection idle (1) or busy (0); ignore unknown ones.
        try: self._readymap[connection] = ready
        except KeyError: pass

    def get_ready_conn(self, host):
        # Return an idle connection to host (marking it busy), or None.
        conn = None
        self._lock.acquire()
        try:
            if self._hostmap.has_key(host):
                for c in self._hostmap[host]:
                    if self._readymap[c]:
                        self._readymap[c] = 0
                        conn = c
                        break
        finally:
            self._lock.release()
        return conn

    def get_all(self, host=None):
        # Copies are returned so callers can mutate safely.
        if host:
            return list(self._hostmap.get(host, []))
        else:
            return dict(self._hostmap)
class KeepAliveHandler:
def __init__(self):
self._cm = ConnectionManager()
#### Connection Management
def open_connections(self):
"""return a list of connected hosts and the number of connections
to each. [('foo.com:80', 2), ('bar.org', 1)]"""
return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
def close_connection(self, host):
"""close connection(s) to <host>
host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
no error occurs if there is no connection to that host."""
for h in self._cm.get_all(host):
self._cm.remove(h)
h.close()
def close_all(self):
"""close all open connections"""
for host, conns in self._cm.get_all().items():
for h in conns:
self._cm.remove(h)
h.close()
def _request_closed(self, request, host, connection):
"""tells us that this request is now closed and the the
connection is ready for another request"""
self._cm.set_ready(connection, 1)
def _remove_connection(self, host, connection, close=0):
if close: connection.close()
self._cm.remove(connection)
#### Transaction Execution
def do_open(self, req):
host = req.get_host()
if not host:
raise urllib2.URLError('no host given')
try:
h = self._cm.get_ready_conn(host)
while h:
r = self._reuse_connection(h, req, host)
# if this response is non-None, then it worked and we're
# done. Break out, skipping the else block.
if r: break
# connection is bad - possibly closed by server
# discard it and ask for the next free connection
h.close()
self._cm.remove(h)
h = self._cm.get_ready_conn(host)
else:
# no (working) free connections were found. Create a new one.
h = self._get_connection(host)
if DEBUG: DEBUG.info("creating new connection to %s (%d)",
host, id(h))
self._cm.add(host, h, 0)
self._start_transaction(h, req)
r = h.getresponse()
except (socket.error, httplib.HTTPException), err:
raise urllib2.URLError(err)
# if not a persistent connection, don't try to reuse it
if r.will_close: self._cm.remove(h)
if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)
r._handler = self
r._host = host
r._url = req.get_full_url()
r._connection = h
r.code = r.status
r.headers = r.msg
r.msg = r.reason
if r.status == 200 or not HANDLE_ERRORS:
return r
else:
return self.parent.error('http', req, r,
r.status, r.msg, r.headers)
def _reuse_connection(self, h, req, host):
"""start the transaction with a re-used connection
return a response object (r) upon success or None on failure.
This DOES not close or remove bad connections in cases where
it returns. However, if an unexpected exception occurs, it
will close and remove the connection before re-raising.
"""
try:
self._start_transaction(h, req)
r = h.getresponse()
# note: just because we got something back doesn't mean it
# worked. We'll check the version below, too.
except (socket.error, httplib.HTTPException):
r = None
except:
# adding this block just in case we've missed
# something we will still raise the exception, but
# lets try and close the connection and remove it
# first. We previously got into a nasty loop
# where an exception was uncaught, and so the
# connection stayed open. On the next try, the
# same exception was raised, etc. The tradeoff is
# that it's now possible this call will raise
# a DIFFERENT exception
if DEBUG: DEBUG.error("unexpected exception - closing " + \
"connection to %s (%d)", host, id(h))
self._cm.remove(h)
h.close()
raise
if r is None or r.version == 9:
# httplib falls back to assuming HTTP 0.9 if it gets a
# bad header back. This is most likely to happen if
# the socket has been closed by the server since we
# last used the connection.
if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
host, id(h))
r = None
else:
if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))
return r
def _start_transaction(self, h, req):
try:
if req.has_data():
data = req.get_data()
h.putrequest('POST', req.get_selector())
if not req.headers.has_key('Content-type'):
h.putheader('Content-type',
'application/x-www-form-urlencoded')
if not req.headers.has_key('Content-length'):
h.putheader('Content-length', '%d' % len(data))
else:
h.putrequest('GET', req.get_selector())
except (socket.error, httplib.HTTPException), err:
raise urllib2.URLError(err)
for args in self.parent.addheaders:
h.putheader(*args)
for k, v in req.headers.items():
h.putheader(k, v)
h.endheaders()
if req.has_data():
h.send(data)
def _get_connection(self, host):
return NotImplementedError
class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
    """Keep-alive handler for plain HTTP urls."""
    def __init__(self):
        KeepAliveHandler.__init__(self)

    def http_open(self, req):
        return self.do_open(req)

    def _get_connection(self, host):
        # Use the buffering HTTPConnection subclass defined below.
        return HTTPConnection(host)
class HTTPSHandler(KeepAliveHandler, urllib2.HTTPSHandler):
    """Keep-alive handler for HTTPS urls.

    ssl_factory -- optional factory producing HTTPS connections; when
    omitted, the module-level sslfactory default is used.
    """
    def __init__(self, ssl_factory=None):
        KeepAliveHandler.__init__(self)
        if not ssl_factory:
            ssl_factory = sslfactory.get_factory()
        self._ssl_factory = ssl_factory

    def https_open(self, req):
        return self.do_open(req)

    def _get_connection(self, host):
        return self._ssl_factory.get_https_connection(host)
class HTTPResponse(httplib.HTTPResponse):
    # we need to subclass HTTPResponse in order to
    # 1) add readline() and readlines() methods
    # 2) add close_connection() methods
    # 3) add info() and geturl() methods

    # in order to add readline(), read must be modified to deal with a
    # buffer.  example: readline must read a buffer and then spit back
    # one line at a time.  The only real alternative is to read one
    # BYTE at a time (ick).  Once something has been read, it can't be
    # put back (ok, maybe it can, but that's even uglier than this),
    # so if you THEN do a normal read, you must first take stuff from
    # the buffer.

    # the read method wraps the original to accommodate buffering,
    # although read() never adds to the buffer.

    # Both readline and readlines have been stolen with almost no
    # modification from socket.py

    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        if method: # the httplib in python 2.3 uses the method arg
            httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        else: # 2.2 doesn't
            httplib.HTTPResponse.__init__(self, sock, debuglevel)
        self.fileno = sock.fileno
        self.code = None
        self._rbuf = ''          # readline buffer
        self._rbufsize = 8096    # chunk size for buffered reads
        self._handler = None # inserted by the handler later
        self._host = None    # (same)
        self._url = None     # (same)
        self._connection = None # (same)

    # Keep a reference to the unbuffered base-class read.
    _raw_read = httplib.HTTPResponse.read

    def close(self):
        if self.fp:
            self.fp.close()
            self.fp = None
            if self._handler:
                # Hand the connection back to the pool for reuse.
                self._handler._request_closed(self, self._host,
                                              self._connection)

    def close_connection(self):
        # Tear down the underlying connection entirely (no reuse).
        self._handler._remove_connection(self._host, self._connection, close=1)
        self.close()

    def info(self):
        return self.headers

    def geturl(self):
        return self._url

    def read(self, amt=None):
        # the _rbuf test is only in this first if for speed.  It's not
        # logically necessary
        if self._rbuf and not amt is None:
            L = len(self._rbuf)
            if amt > L:
                amt -= L
            else:
                # Request satisfiable entirely from the buffer.
                s = self._rbuf[:amt]
                self._rbuf = self._rbuf[amt:]
                return s

        s = self._rbuf + self._raw_read(amt)
        self._rbuf = ''
        return s

    def readline(self, limit=-1):
        data = ""
        i = self._rbuf.find('\n')
        # Keep pulling chunks until a newline appears or limit is hit.
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new: break
            i = new.find('\n')
            if i >= 0: i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0: i = len(self._rbuf)
        else: i = i+1
        if 0 <= limit < len(self._rbuf): i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data

    def readlines(self, sizehint = 0):
        total = 0
        list = []
        while 1:
            line = self.readline()
            if not line: break
            list.append(line)
            total += len(line)
            if sizehint and total >= sizehint:
                break
        return list
class HTTPConnection(httplib.HTTPConnection):
    # use the modified response class (buffered readline + pool hooks)
    response_class = HTTPResponse
class HTTPSConnection(httplib.HTTPSConnection):
    # use the modified response class (buffered readline + pool hooks)
    response_class = HTTPResponse
#########################################################################
##### TEST FUNCTIONS
#########################################################################
def error_handler(url):
    """Manual test: fetch url with HANDLE_ERRORS both off and on,
    printing the status each time, then close all pooled connections."""
    global HANDLE_ERRORS
    orig = HANDLE_ERRORS
    keepalive_handler = HTTPHandler()
    opener = urllib2.build_opener(keepalive_handler)
    urllib2.install_opener(opener)
    pos = {0: 'off', 1: 'on'}
    for i in (0, 1):
        print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
        HANDLE_ERRORS = i
        try:
            fo = urllib2.urlopen(url)
            foo = fo.read()
            fo.close()
            # status/reason only exist on 200 responses (see module doc).
            try: status, reason = fo.status, fo.reason
            except AttributeError: status, reason = None, None
        except IOError, e:
            print " EXCEPTION: %s" % e
            raise
        else:
            print " status = %s, reason = %s" % (status, reason)
    HANDLE_ERRORS = orig
    hosts = keepalive_handler.open_connections()
    print "open connections:", hosts
    keepalive_handler.close_all()
def continuity(url):
    """Manual test: fetch url three ways (plain urllib2, keepalive
    read(), keepalive readline()) and print an md5 of each so the
    payloads can be compared for corruption."""
    import md5
    format = '%25s: %s'

    # first fetch the file with the normal http handler
    opener = urllib2.build_opener()
    urllib2.install_opener(opener)
    fo = urllib2.urlopen(url)
    foo = fo.read()
    fo.close()
    m = md5.new(foo)
    print format % ('normal urllib', m.hexdigest())

    # now install the keepalive handler and try again
    opener = urllib2.build_opener(HTTPHandler())
    urllib2.install_opener(opener)
    fo = urllib2.urlopen(url)
    foo = fo.read()
    fo.close()
    m = md5.new(foo)
    print format % ('keepalive read', m.hexdigest())

    # same again, but exercising the buffered readline() path
    fo = urllib2.urlopen(url)
    foo = ''
    while 1:
        f = fo.readline()
        if f: foo = foo + f
        else: break
    fo.close()
    m = md5.new(foo)
    print format % ('keepalive readline', m.hexdigest())
def comp(N, url):
    """Manual benchmark: time N fetches of url with the normal handlers
    versus the keepalive handler and print the speedup factor."""
    print ' making %i connections to:\n %s' % (N, url)

    sys.stdout.write(' first using the normal urllib handlers')
    # first use normal opener
    opener = urllib2.build_opener()
    urllib2.install_opener(opener)
    t1 = fetch(N, url)
    print ' TIME: %.3f s' % t1

    sys.stdout.write(' now using the keepalive handler ')
    # now install the keepalive handler and try again
    opener = urllib2.build_opener(HTTPHandler())
    urllib2.install_opener(opener)
    t2 = fetch(N, url)
    print ' TIME: %.3f s' % t2
    print ' improvement factor: %.2f' % (t1/t2, )
def fetch(N, url, delay=0):
    """Fetch url N times (sleeping delay seconds between fetches) and
    return the total elapsed time; warn if the payload lengths differ
    between fetches."""
    import time
    lens = []
    starttime = time.time()
    for i in range(N):
        if delay and i > 0: time.sleep(delay)
        fo = urllib2.urlopen(url)
        foo = fo.read()
        fo.close()
        lens.append(len(foo))
    diff = time.time() - starttime

    # All fetches should return the same number of bytes.
    j = 0
    for i in lens[1:]:
        j = j + 1
        if not i == lens[0]:
            print "WARNING: inconsistent length on read %i: %i" % (j, i)

    return diff
def test_timeout(url):
    """Check that a dropped keepalive connection is transparently
    re-established: fetch once, wait for the server-side idle timeout,
    fetch again, and compare the two payloads."""
    global DEBUG
    # Temporarily swap in a stdout logger so connection events are visible.
    dbbackup = DEBUG
    class FakeLogger:
        def debug(self, msg, *args): print msg % args
        info = warning = error = debug
    DEBUG = FakeLogger()
    print " fetching the file to establish a connection"
    fo = urllib2.urlopen(url)
    data1 = fo.read()
    fo.close()
    i = 20
    print " waiting %i seconds for the server to close the connection" % i
    while i > 0:
        # In-place countdown on one terminal line.
        sys.stdout.write('\r %2i' % i)
        sys.stdout.flush()
        time.sleep(1)
        i -= 1
    sys.stderr.write('\r')
    print " fetching the file a second time"
    fo = urllib2.urlopen(url)
    data2 = fo.read()
    fo.close()
    if data1 == data2:
        print ' data are identical'
    else:
        print ' ERROR: DATA DIFFER'
    # Restore whatever logger was installed before.
    DEBUG = dbbackup
def test(url, N=10):
print "checking error hander (do this on a non-200)"
try: error_handler(url)
except IOError, e:
print "exiting - exception will prevent further tests"
sys.exit()
print
print "performing continuity test (making sure stuff isn't corrupted)"
continuity(url)
print
print "performing speed comparison"
comp(N, url)
print
print "performing dropped-connection check"
test_timeout(url)
if __name__ == '__main__':
import time
import sys
try:
N = int(sys.argv[1])
url = sys.argv[2]
except:
print "%s <integer> <url>" % sys.argv[0]
else:
test(url, N)

View File

@@ -1,458 +0,0 @@
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
"""Module for downloading files from a pool of mirrors
DESCRIPTION
This module provides support for downloading files from a pool of
mirrors with configurable failover policies. To a large extent, the
failover policy is chosen by using different classes derived from
the main class, MirrorGroup.
Instances of MirrorGroup (and cousins) act very much like URLGrabber
instances in that they have urlread, urlgrab, and urlopen methods.
They can therefore, be used in very similar ways.
from urlgrabber.grabber import URLGrabber
from urlgrabber.mirror import MirrorGroup
gr = URLGrabber()
mg = MirrorGroup(gr, ['http://foo.com/some/directory/',
'http://bar.org/maybe/somewhere/else/',
'ftp://baz.net/some/other/place/entirely/']
mg.urlgrab('relative/path.zip')
The assumption is that all mirrors are identical AFTER the base urls
specified, so that any mirror can be used to fetch any file.
FAILOVER
The failover mechanism is designed to be customized by subclassing
from MirrorGroup to change the details of the behavior. In general,
the classes maintain a master mirror list and a "current mirror"
index. When a download is initiated, a copy of this list and index
is created for that download only. The specific failover policy
depends on the class used, and so is documented in the class
documentation. Note that ANY behavior of the class can be
overridden, so any failover policy at all is possible (although
you may need to change the interface in extreme cases).
CUSTOMIZATION
Most customization of a MirrorGroup object is done at instantiation
time (or via subclassing). There are four major types of
customization:
1) Pass in a custom urlgrabber - The passed in urlgrabber will be
used (by default... see #2) for the grabs, so options to it
apply for the url-fetching
2) Custom mirror list - Mirror lists can simply be a list of
stings mirrors (as shown in the example above) but each can
also be a dict, allowing for more options. For example, the
first mirror in the list above could also have been:
{'mirror': 'http://foo.com/some/directory/',
'grabber': <a custom grabber to be used for this mirror>,
'kwargs': { <a dict of arguments passed to the grabber> }}
All mirrors are converted to this format internally. If
'grabber' is omitted, the default grabber will be used. If
kwargs are omitted, then (duh) they will not be used.
3) Pass keyword arguments when instantiating the mirror group.
See, for example, the failure_callback argument.
4) Finally, any kwargs passed in for the specific file (to the
urlgrab method, for example) will be folded in. The options
passed into the grabber's urlXXX methods will override any
options specified in a custom mirror dict.
"""
# $Id: mirror.py,v 1.14 2006/02/22 18:26:46 mstenner Exp $
import random
import thread # needed for locking to make this threadsafe
from grabber import URLGrabError, CallbackObject, DEBUG
try:
    from i18n import _
except ImportError, msg:
    # i18n support is optional: fall back to an identity "translation".
    def _(st): return st
class GrabRequest:
    """Per-download state holder.

    A plain attribute bag describing one specific request (for example,
    a single file).  Keeping this state outside the MirrorGroup itself
    (1) makes it a little easier to be threadsafe and (2) allows
    request-specific parameters.
    """
    pass
class MirrorGroup:
    """Base Mirror class
    Instances of this class are built with a grabber object and a list
    of mirrors. Then all calls to urlXXX should be passed relative urls.
    The requested file will be searched for on the first mirror. If the
    grabber raises an exception (possibly after some retries) then that
    mirror will be removed from the list, and the next will be attempted.
    If all mirrors are exhausted, then an exception will be raised.
    MirrorGroup has the following failover policy:
      * downloads begin with the first mirror
      * by default (see default_action below) a failure (after retries)
        causes it to increment the local AND master indices. Also,
        the current mirror is removed from the local list (but NOT the
        master list - the mirror can potentially be used for other
        files)
      * if the local list is ever exhausted, a URLGrabError will be
        raised (errno=256, no more mirrors)
    OPTIONS
    In addition to the required arguments "grabber" and "mirrors",
    MirrorGroup also takes the following optional arguments:
      default_action
        A dict that describes the actions to be taken upon failure
        (after retries). default_action can contain any of the
        following keys (shown here with their default values):
          default_action = {'increment': 1,
                            'increment_master': 1,
                            'remove': 1,
                            'remove_master': 0,
                            'fail': 0}
        In this context, 'increment' means "use the next mirror" and
        'remove' means "never use this mirror again". The two
        'master' values refer to the instance-level mirror list (used
        for all files), whereas the non-master values refer to the
        current download only.
        The 'fail' option will cause immediate failure by re-raising
        the exception and no further attempts to get the current
        download.
        This dict can be set at instantiation time,
          mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
        at method-execution time (only applies to current fetch),
          filename = mg.urlgrab(url, default_action={'increment': 0})
        or by returning an action dict from the failure_callback
          return {'fail':0}
        in increasing precedence.
        If all three of these were done, the net result would be:
              {'increment': 0,         # set in method
               'increment_master': 1,  # class default
               'remove': 1,            # class default
               'remove_master': 0,     # class default
               'fail': 0}              # set at instantiation, reset
                                       # from callback
      failure_callback
        this is a callback that will be called when a mirror "fails",
        meaning the grabber raises some URLGrabError. If this is a
        tuple, it is interpreted to be of the form (cb, args, kwargs)
        where cb is the actual callable object (function, method,
        etc). Otherwise, it is assumed to be the callable object
        itself. The callback will be passed a grabber.CallbackObject
        instance along with args and kwargs (if present). The following
        attributes are defined withing the instance:
           obj.exception    = < exception that was raised >
           obj.mirror       = < the mirror that was tried >
           obj.relative_url = < url relative to the mirror >
           obj.url          = < full url that failed >
                              # .url is just the combination of .mirror
                              # and .relative_url
        The failure callback can return an action dict, as described
        above.
        Like default_action, the failure_callback can be set at
        instantiation time or when the urlXXX method is called. In
        the latter case, it applies only for that fetch.
        The callback can re-raise the exception quite easily. For
        example, this is a perfectly adequate callback function:
          def callback(obj): raise obj.exception
        WARNING: do not save the exception object (or the
        CallbackObject instance). As they contain stack frame
        references, they can lead to circular references.
    Notes:
      * The behavior can be customized by deriving and overriding the
        'CONFIGURATION METHODS'
      * The 'grabber' instance is kept as a reference, not copied.
        Therefore, the grabber instance can be modified externally
        and changes will take effect immediately.
    """
    # notes on thread-safety:
    # A GrabRequest should never be shared by multiple threads because
    # it's never saved inside the MG object and never returned outside it.
    # therefore, it should be safe to access/modify grabrequest data
    # without a lock. However, accessing the mirrors and _next attributes
    # of the MG itself must be done when locked to prevent (for example)
    # removal of the wrong mirror.
    ##############################################################
    # CONFIGURATION METHODS - intended to be overridden to
    # customize behavior
    def __init__(self, grabber, mirrors, **kwargs):
        """Initialize the MirrorGroup object.
        REQUIRED ARGUMENTS
          grabber - URLGrabber instance
          mirrors - a list of mirrors
        OPTIONAL ARGUMENTS
          failure_callback - callback to be used when a mirror fails
          default_action - dict of failure actions
        See the module-level and class level documentation for more
        details.
        """
        # OVERRIDE IDEAS:
        # shuffle the list to randomize order
        self.grabber = grabber
        self.mirrors = self._parse_mirrors(mirrors)
        self._next = 0
        # Guards self.mirrors and self._next (see thread-safety notes above).
        self._lock = thread.allocate_lock()
        self.default_action = None
        self._process_kwargs(kwargs)
    # if these values are found in **kwargs passed to one of the urlXXX
    # methods, they will be stripped before getting passed on to the
    # grabber
    options = ['default_action', 'failure_callback']
    def _process_kwargs(self, kwargs):
        # Pull the MirrorGroup-specific options out of the constructor kwargs.
        self.failure_callback = kwargs.get('failure_callback')
        self.default_action = kwargs.get('default_action')
    def _parse_mirrors(self, mirrors):
        # Normalize every mirror to dict form: {'mirror': url, ...}.
        parsed_mirrors = []
        for m in mirrors:
            if type(m) == type(''): m = {'mirror': m}
            parsed_mirrors.append(m)
        return parsed_mirrors
    def _load_gr(self, gr):
        # OVERRIDE IDEAS:
        # shuffle gr list
        # Give the request its own copy of the mirror list and index, taken
        # under the lock so it is a consistent snapshot.
        self._lock.acquire()
        gr.mirrors = list(self.mirrors)
        gr._next = self._next
        self._lock.release()
    def _get_mirror(self, gr):
        # OVERRIDE IDEAS:
        # return a random mirror so that multiple mirrors get used
        # even without failures.
        if not gr.mirrors:
            raise URLGrabError(256, _('No more mirrors to try.'))
        return gr.mirrors[gr._next]
    def _failure(self, gr, cb_obj):
        # OVERRIDE IDEAS:
        # inspect the error - remove=1 for 404, remove=2 for connection
        # refused, etc. (this can also be done via
        # the callback)
        # Per-fetch callback takes precedence over the instance-level one.
        cb = gr.kw.get('failure_callback') or self.failure_callback
        if cb:
            if type(cb) == type( () ):
                cb, args, kwargs = cb
            else:
                args, kwargs = (), {}
            action = cb(cb_obj, *args, **kwargs) or {}
        else:
            action = {}
        # XXXX - decide - there are two ways to do this
        # the first is action-overriding as a whole - use the entire action
        # or fall back on module level defaults
        #action = action or gr.kw.get('default_action') or self.default_action
        # the other is to fall through for each element in the action dict
        # Merge precedence (lowest to highest): instance default, per-fetch
        # default, callback-returned action.
        a = dict(self.default_action or {})
        a.update(gr.kw.get('default_action', {}))
        a.update(action)
        action = a
        self.increment_mirror(gr, action)
        # Re-raise the original exception we are handling when asked to fail.
        if action and action.get('fail', 0): raise
    def increment_mirror(self, gr, action={}):
        """Tell the mirror object increment the mirror index
        This increments the mirror index, which amounts to telling the
        mirror object to use a different mirror (for this and future
        downloads).
        This is a SEMI-public method. It will be called internally,
        and you may never need to call it. However, it is provided
        (and is made public) so that the calling program can increment
        the mirror choice for methods like urlopen. For example, with
        urlopen, there's no good way for the mirror group to know that
        an error occurs mid-download (it's already returned and given
        you the file object).
        remove  ---  can have several values
          0   do not remove the mirror from the list
          1   remove the mirror for this download only
          2   remove the mirror permanently
        beware of remove=0 as it can lead to infinite loops
        """
        badmirror = gr.mirrors[gr._next]
        # Master-list bookkeeping must happen under the lock.
        self._lock.acquire()
        try:
            ind = self.mirrors.index(badmirror)
        except ValueError:
            pass
        else:
            if action.get('remove_master', 0):
                del self.mirrors[ind]
            elif self._next == ind and action.get('increment_master', 1):
                self._next += 1
            # Wrap the master index back to the start of the list.
            if self._next >= len(self.mirrors): self._next = 0
        self._lock.release()
        # Request-local list/index need no lock (never shared; see notes).
        if action.get('remove', 1):
            del gr.mirrors[gr._next]
        elif action.get('increment', 1):
            gr._next += 1
        if gr._next >= len(gr.mirrors): gr._next = 0
        if DEBUG:
            grm = [m['mirror'] for m in gr.mirrors]
            DEBUG.info('GR   mirrors: [%s] %i', ' '.join(grm), gr._next)
            selfm = [m['mirror'] for m in self.mirrors]
            DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next)
    #####################################################################
    # NON-CONFIGURATION METHODS
    # these methods are designed to be largely workhorse methods that
    # are not intended to be overridden. That doesn't mean you can't;
    # if you want to, feel free, but most things can be done by
    # by overriding the configuration methods :)
    def _join_url(self, base_url, rel_url):
        # Join mirror base and relative path without doubling the slash.
        if base_url.endswith('/') or rel_url.startswith('/'):
            return base_url + rel_url
        else:
            return base_url + '/' + rel_url
    def _mirror_try(self, func, url, kw):
        """Run grabber method *func* for *url*, failing over across the
        request's mirror list until it succeeds or mirrors run out."""
        gr = GrabRequest()
        gr.func = func
        gr.url  = url
        gr.kw   = dict(kw)
        self._load_gr(gr)
        # Strip MirrorGroup-only options before passing kw to the grabber.
        for k in self.options:
            try: del kw[k]
            except KeyError: pass
        while 1:
            mirrorchoice = self._get_mirror(gr)
            fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
            kwargs = dict(mirrorchoice.get('kwargs', {}))
            kwargs.update(kw)
            # A mirror entry may carry its own grabber; default otherwise.
            grabber = mirrorchoice.get('grabber') or self.grabber
            func_ref = getattr(grabber, func)
            if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
            try:
                return func_ref( *(fullurl,), **kwargs )
            except URLGrabError, e:
                if DEBUG: DEBUG.info('MIRROR: failed')
                obj = CallbackObject()
                obj.exception = e
                obj.mirror = mirrorchoice['mirror']
                obj.relative_url = gr.url
                obj.url = fullurl
                # May raise (fail=1) or advance to the next mirror.
                self._failure(gr, obj)
    def urlgrab(self, url, filename=None, **kwargs):
        kw = dict(kwargs)
        kw['filename'] = filename
        func = 'urlgrab'
        return self._mirror_try(func, url, kw)
    def urlopen(self, url, **kwargs):
        kw = dict(kwargs)
        func = 'urlopen'
        return self._mirror_try(func, url, kw)
    def urlread(self, url, limit=None, **kwargs):
        kw = dict(kwargs)
        kw['limit'] = limit
        func = 'urlread'
        return self._mirror_try(func, url, kw)
class MGRandomStart(MirrorGroup):
    """MirrorGroup variant that begins at a random mirror.

    Behavior is identical to MirrorGroup except that the starting
    position in the mirror list is chosen at random on construction.
    """
    def __init__(self, grabber, mirrors, **kwargs):
        """Same constructor arguments as MirrorGroup."""
        MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
        # Pick a random starting index into the (unshuffled) mirror list.
        self._next = random.randrange(len(mirrors))
class MGRandomOrder(MirrorGroup):
    """MirrorGroup variant that uses the mirrors in a random order.

    Behavior is identical to MirrorGroup except that the mirror list is
    shuffled once at construction time and fixed thereafter; it does
    NOT re-randomize after each failure.
    """
    def __init__(self, grabber, mirrors, **kwargs):
        """Same constructor arguments as MirrorGroup."""
        MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
        # One-time in-place shuffle of the parsed mirror list.
        random.shuffle(self.mirrors)
if __name__ == '__main__':
    # No standalone demo/test for this module.
    pass

View File

@@ -1,530 +0,0 @@
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
# $Id: progress.py,v 1.7 2005/08/19 21:59:07 mstenner Exp $
import sys
import time
import math
import thread
class BaseMeter:
    """Abstract progress meter.

    Subclasses override the _do_* hooks; start/update/end handle the
    shared bookkeeping (rate estimation and throttling of updates).
    """
    def __init__(self):
        # Minimum interval between _do_update callbacks, in seconds.
        self.update_period = 0.3
        self.filename = None
        self.url = None
        self.basename = None
        self.text = None
        self.size = None
        self.start_time = None
        self.last_amount_read = 0
        self.last_update_time = None
        self.re = RateEstimator()
    def start(self, filename=None, url=None, basename=None,
              size=None, now=None, text=None):
        """Begin a transfer; *size* may be None when unknown."""
        self.filename = filename
        self.url = url
        self.basename = basename
        self.text = text
        self.size = size
        if size is not None:
            self.fsize = format_number(size) + 'B'
        if now is None:
            now = time.time()
        self.start_time = now
        self.re.start(size, now)
        self.last_amount_read = 0
        self.last_update_time = now
        self._do_start(now)
    def _do_start(self, now=None):
        # Hook for subclasses.
        pass
    def update(self, amount_read, now=None):
        """Record progress; forwards to _do_update at most once per
        update_period.  A real GUI would pump its mainloop here."""
        if now is None:
            now = time.time()
        due = (not self.last_update_time or
               now >= self.last_update_time + self.update_period)
        if due:
            self.re.update(amount_read, now)
            self.last_amount_read = amount_read
            self.last_update_time = now
            self._do_update(amount_read, now)
    def _do_update(self, amount_read, now=None):
        # Hook for subclasses.
        pass
    def end(self, amount_read, now=None):
        """Finish the transfer and always emit a final _do_end."""
        if now is None:
            now = time.time()
        self.re.update(amount_read, now)
        self.last_amount_read = amount_read
        self.last_update_time = now
        self._do_end(amount_read, now)
    def _do_end(self, amount_read, now=None):
        # Hook for subclasses.
        pass
class TextMeter(BaseMeter):
    """Progress meter that redraws a single text line on a stream."""
    def __init__(self, fo=sys.stderr):
        BaseMeter.__init__(self)
        self.fo = fo
    def _do_update(self, amount_read, now=None):
        elapsed = format_time(self.re.elapsed_time())
        read = format_number(amount_read)
        if self.text is not None:
            label = self.text
        else:
            label = self.basename
        if self.size is None:
            # Unknown total size: no percentage bar or ETA possible.
            line = '\r%-60.60s %5sB %s ' % (label, read, elapsed)
        else:
            frac = self.re.fraction_read()
            eta = format_time(self.re.remaining_time())
            bar = '=' * int(25 * frac)
            line = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ETA ' % \
                   (label, frac*100, bar, read, eta)
        self.fo.write(line)
        self.fo.flush()
    def _do_end(self, amount_read, now=None):
        total_time = format_time(self.re.elapsed_time())
        total_size = format_number(amount_read)
        if self.text is not None:
            label = self.text
        else:
            label = self.basename
        if self.size is None:
            line = '\r%-60.60s %5sB %s ' % (label, total_size, total_time)
        else:
            # Final redraw: full bar at 100%.
            line = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ' % \
                   (label, 100, '=' * 25, total_size, total_time)
        self.fo.write(line + '\n')
        self.fo.flush()
text_progress_meter = TextMeter  # legacy alias kept for backwards compatibility
class MultiFileHelper(BaseMeter):
    """Per-file child meter that forwards every progress event to a
    master MultiFileMeter for aggregate display."""
    def __init__(self, master):
        BaseMeter.__init__(self)
        self.master = master
    def _do_start(self, now):
        self.master.start_meter(self, now)
    def _do_update(self, amount_read, now):
        # elapsed time since last update
        self.master.update_meter(self, now)
    def _do_end(self, amount_read, now):
        # Pre-format totals for the master's end-of-file summary output.
        self.ftotal_time = format_time(now - self.start_time)
        self.ftotal_size = format_number(self.last_amount_read)
        self.master.end_meter(self, now)
    def failure(self, message, now=None):
        self.master.failure_meter(self, message, now)
    def message(self, message):
        self.master.message_meter(self, message)
class MultiFileMeter:
    """Aggregate progress meter coordinating per-file helper meters for
    a batch of downloads.  Subclasses override the _do_* hooks."""
    helperclass = MultiFileHelper
    def __init__(self):
        self.meters = []
        self.in_progress_meters = []
        # Guards the meter lists and the open/finished/failed counters.
        self._lock = thread.allocate_lock()
        self.update_period = 0.3 # seconds
        self.numfiles = None
        self.finished_files = 0
        self.failed_files = 0
        self.open_files = 0
        self.total_size = None
        self.failed_size = 0
        self.start_time = None
        self.finished_file_size = 0
        self.last_update_time = None
        self.re = RateEstimator()
    def start(self, numfiles=None, total_size=None, now=None):
        """Begin the batch; numfiles/total_size may be None if unknown."""
        if now is None: now = time.time()
        self.numfiles = numfiles
        self.finished_files = 0
        self.failed_files = 0
        self.open_files = 0
        self.total_size = total_size
        self.failed_size = 0
        self.start_time = now
        self.finished_file_size = 0
        self.last_update_time = now
        self.re.start(total_size, now)
        self._do_start(now)
    def _do_start(self, now):
        pass
    def end(self, now=None):
        if now is None: now = time.time()
        self._do_end(now)
    def _do_end(self, now):
        pass
    def lock(self): self._lock.acquire()
    def unlock(self): self._lock.release()
    ###########################################################
    # child meter creation and destruction
    def newMeter(self):
        # Create a child meter bound to this master.
        newmeter = self.helperclass(self)
        self.meters.append(newmeter)
        return newmeter
    def removeMeter(self, meter):
        self.meters.remove(meter)
    ###########################################################
    # child functions - these should only be called by helpers
    def start_meter(self, meter, now):
        if not meter in self.meters:
            raise ValueError('attempt to use orphaned meter')
        self._lock.acquire()
        try:
            if not meter in self.in_progress_meters:
                self.in_progress_meters.append(meter)
                self.open_files += 1
        finally:
            self._lock.release()
        self._do_start_meter(meter, now)
    def _do_start_meter(self, meter, now):
        pass
    def update_meter(self, meter, now):
        if not meter in self.meters:
            raise ValueError('attempt to use orphaned meter')
        # Throttle: redraw at most once per update_period.
        if (now >= self.last_update_time + self.update_period) or \
               not self.last_update_time:
            self.re.update(self._amount_read(), now)
            self.last_update_time = now
            self._do_update_meter(meter, now)
    def _do_update_meter(self, meter, now):
        pass
    def end_meter(self, meter, now):
        if not meter in self.meters:
            raise ValueError('attempt to use orphaned meter')
        self._lock.acquire()
        try:
            try: self.in_progress_meters.remove(meter)
            except ValueError: pass
            self.open_files -= 1
            self.finished_files += 1
            self.finished_file_size += meter.last_amount_read
        finally:
            self._lock.release()
        self._do_end_meter(meter, now)
    def _do_end_meter(self, meter, now):
        pass
    def failure_meter(self, meter, message, now):
        if not meter in self.meters:
            raise ValueError('attempt to use orphaned meter')
        self._lock.acquire()
        try:
            try: self.in_progress_meters.remove(meter)
            except ValueError: pass
            self.open_files -= 1
            self.failed_files += 1
            # failed_size becomes None (unknown) once any failed file has
            # no known size.
            if meter.size and self.failed_size is not None:
                self.failed_size += meter.size
            else:
                self.failed_size = None
        finally:
            self._lock.release()
        self._do_failure_meter(meter, message, now)
    def _do_failure_meter(self, meter, message, now):
        pass
    def message_meter(self, meter, message):
        pass
    ########################################################
    # internal functions
    def _amount_read(self):
        # Total bytes read so far: completed files plus in-flight progress.
        tot = self.finished_file_size
        for m in self.in_progress_meters:
            tot += m.last_amount_read
        return tot
class TextMultiFileMeter(MultiFileMeter):
    """Text renderer for MultiFileMeter: one aggregate status line plus
    a per-file summary line on completion or failure."""
    def __init__(self, fo=sys.stderr):
        self.fo = fo
        MultiFileMeter.__init__(self)
    # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
    def _do_update_meter(self, meter, now):
        """Redraw the aggregate one-line status under the lock."""
        self._lock.acquire()
        try:
            format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \
                     "time: %8.8s/%8.8s"
            df = self.finished_files
            tf = self.numfiles or 1
            # +0.49 rounds the truncating %i conversion to nearest.
            pf = 100 * float(df)/tf + 0.49
            dd = self.re.last_amount_read
            td = self.total_size
            pd = 100 * (self.re.fraction_read() or 0) + 0.49
            dt = self.re.elapsed_time()
            rt = self.re.remaining_time()
            if rt is None: tt = None
            else: tt = dt + rt
            fdd = format_number(dd) + 'B'
            ftd = format_number(td) + 'B'
            fdt = format_time(dt, 1)
            ftt = format_time(tt, 1)
            out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
            self.fo.write('\r' + out)
            self.fo.flush()
        finally:
            self._lock.release()
    def _do_end_meter(self, meter, now):
        """Print a summary line for a finished file, then refresh the
        aggregate line."""
        self._lock.acquire()
        try:
            format = "%-30.30s %6.6s %8.8s %9.9s"
            fn = meter.basename
            size = meter.last_amount_read
            fsize = format_number(size) + 'B'
            et = meter.re.elapsed_time()
            fet = format_time(et, 1)
            frate = format_number(size / et) + 'B/s'
            out = '%-79.79s' % (format % (fn, fsize, fet, frate))
            self.fo.write('\r' + out + '\n')
        finally:
            self._lock.release()
        self._do_update_meter(meter, now)
    def _do_failure_meter(self, meter, message, now):
        """Print a FAILED summary line, then refresh the aggregate line."""
        self._lock.acquire()
        try:
            format = "%-30.30s %6.6s %s"
            fn = meter.basename
            if type(message) in (type(''), type(u'')):
                message = message.splitlines()
            if not message: message = ['']
            out = '%-79s' % (format % (fn, 'FAILED', message[0] or ''))
            self.fo.write('\r' + out + '\n')
            for m in message[1:]: self.fo.write(' ' + m + '\n')
        finally:
            # BUGFIX: the original released the lock inside the try body and
            # called _do_update_meter() (which re-acquires the lock) from its
            # finally clause.  Any exception during the writes therefore left
            # the lock held and deadlocked on the re-acquire.  Release in
            # finally and update afterwards, matching _do_end_meter.
            self._lock.release()
        self._do_update_meter(meter, now)
    def message_meter(self, meter, message):
        # Messages are accepted but not rendered by the text UI.
        self._lock.acquire()
        try:
            pass
        finally:
            self._lock.release()
    def _do_end(self, now):
        """Final aggregate redraw plus a terminating newline."""
        self._do_update_meter(None, now)
        self._lock.acquire()
        try:
            self.fo.write('\n')
            self.fo.flush()
        finally:
            self._lock.release()
######################################################################
# support classes and functions
class RateEstimator:
    """Exponentially-smoothed transfer-rate estimator.

    The smoothing window is *timescale* seconds: the average takes on an
    essentially new value after roughly that much time has passed.
    """
    def __init__(self, timescale=5.0):
        self.timescale = timescale
    def start(self, total=None, now=None):
        """Reset for a new transfer of *total* bytes (None if unknown),
        beginning at time *now* (defaults to the current time)."""
        if now is None:
            now = time.time()
        self.total = total
        self.start_time = now
        self.last_update_time = now
        self.last_amount_read = 0
        self.ave_rate = None
    def update(self, amount_read, now=None):
        """Fold a new cumulative byte count into the rate estimate."""
        if now is None:
            now = time.time()
        if amount_read == 0:
            # A fresh file just started: all history is invalid.
            self.last_update_time = now
            self.last_amount_read = 0
            self.ave_rate = None
            return
        dt = now - self.last_update_time
        db = amount_read - self.last_amount_read
        self.last_update_time = now
        self.last_amount_read = amount_read
        self.ave_rate = self._temporal_rolling_ave(
            dt, db, self.ave_rate, self.timescale)
    #####################################################################
    # result methods
    def average_rate(self):
        "get the average transfer rate (in bytes/second)"
        return self.ave_rate
    def elapsed_time(self):
        "the time between the start of the transfer and the most recent update"
        return self.last_update_time - self.start_time
    def remaining_time(self):
        "estimated time remaining"
        if not self.ave_rate or not self.total:
            return None
        return (self.total - self.last_amount_read) / self.ave_rate
    def fraction_read(self):
        """the fraction of the data that has been read
        (can be None for unknown transfer size)"""
        if self.total is None:
            return None
        if self.total == 0:
            return 1.0
        return float(self.last_amount_read) / self.total
    #########################################################################
    # support methods
    def _temporal_rolling_ave(self, time_diff, read_diff, last_ave, timescale):
        """Rolling average whose weight scales with elapsed time, so
        irregular update intervals still smooth evenly.  epsilon is
        time_diff/timescale, capped at 1.0."""
        epsilon = min(time_diff / timescale, 1.0)
        return self._rolling_ave(time_diff, read_diff, last_ave, epsilon)
    def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon):
        """One rolling-average step: fold the newest instantaneous rate
        into last_ave with weight epsilon in [0.0, 1.0] (0.0 keeps only
        the old value, 1.0 keeps only the newest)."""
        try:
            recent_rate = read_diff / time_diff
        except ZeroDivisionError:
            recent_rate = None
        if last_ave is None:
            return recent_rate
        if recent_rate is None:
            return last_ave
        # Both are numbers here: weighted blend.
        return epsilon * recent_rate + (1 - epsilon) * last_ave
    def _round_remaining_time(self, rt, start_time=15.0):
        """Round rt downward: to the nearest 1 below start_time, more
        coarsely (power-of-two granularity) above it.
        For example (start_time = 15.0):
          2.7  -> 2.0
          25.2 -> 25.0
          26.4 -> 26.0
          35.3 -> 34.0
          63.6 -> 60.0
        """
        if rt < 0:
            return 0.0
        shift = int(math.log(rt / start_time) / math.log(2))
        rt = int(rt)
        if shift <= 0:
            return rt
        return float(int(rt) >> shift << shift)
def format_time(seconds, use_hours=0):
    """Render a duration in seconds as 'MM:SS', or 'HH:MM:SS' when
    use_hours is true.  None or negative values render as dashes."""
    if seconds is None or seconds < 0:
        if use_hours:
            return '--:--:--'
        return '--:--'
    minutes, secs = divmod(int(seconds), 60)
    if use_hours:
        hours, minutes = divmod(minutes, 60)
        return '%02i:%02i:%02i' % (hours, minutes, secs)
    return '%02i:%02i' % (minutes, secs)
def format_number(number, SI=0, space=' '):
    """Turn numbers into human-readable metric-like numbers"""
    symbols = ['', # (none)
               'k', # kilo
               'M', # mega
               'G', # giga
               'T', # tera
               'P', # peta
               'E', # exa
               'Z', # zetta
               'Y'] # yotta
    # SI selects powers of 1000; default is binary powers of 1024.
    if SI: step = 1000.0
    else: step = 1024.0
    thresh = 999
    depth = 0
    max_depth = len(symbols) - 1
    # we want numbers between 0 and thresh, but don't exceed the length
    # of our list. In that event, the formatting will be screwed up,
    # but it'll still show the right number.
    while number > thresh and depth < max_depth:
        depth = depth + 1
        number = number / step
    # NOTE: 1L is a Python 2 long literal; this module is Python 2 only.
    if type(number) == type(1) or type(number) == type(1L):
        # it's an int or a long, which means it didn't get divided,
        # which means it's already short enough
        format = '%i%s%s'
    elif number < 9.95:
        # must use 9.95 for proper sizing. For example, 9.99 will be
        # rounded to 10.0 with the .1f format string (which is too long)
        format = '%.1f%s%s'
    else:
        format = '%.0f%s%s'
    # `number or 0` maps None to 0 so float() cannot fail on it.
    return(format % (float(number or 0), space, symbols[depth]))

View File

@@ -1,90 +0,0 @@
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
import httplib
import urllib2
# Probe for M2Crypto: when available, HTTPS connections can use a real
# SSL context (CA cert verification); otherwise fall back to the stock
# httplib/urllib2 SSL support.
try:
    from M2Crypto import SSL
    from M2Crypto import httpslib
    from M2Crypto import m2urllib2
    # NOTE(review): setting this to None disables M2Crypto's
    # post-connection certificate/hostname check callback -- confirm
    # that relaxation is intended.
    SSL.Connection.clientPostConnectionCheck = None
    have_m2crypto = True
except ImportError:
    have_m2crypto = False
# Module-level logger object (same convention as grabber.py); None
# disables debug output.
DEBUG = None
if have_m2crypto:
    class M2SSLFactory:
        """Factory for HTTPS connections and urllib2 openers backed by
        M2Crypto, honoring an optional CA cert file or SSL context."""
        def __init__(self, ssl_ca_cert, ssl_context):
            self.ssl_context = self._get_ssl_context(ssl_ca_cert, ssl_context)
        def _get_ssl_context(self, ssl_ca_cert, ssl_context):
            """
            Create an ssl context using the CA cert file or ssl context.
            The CA cert is used first if it was passed as an option. If not,
            then the supplied ssl context is used. If no ssl context was supplied,
            None is returned.
            """
            if ssl_ca_cert:
                context = SSL.Context()
                context.load_verify_locations(ssl_ca_cert)
                # NOTE(review): verify_none loads the CA file but does not
                # reject unverified peers -- confirm this is intentional.
                context.set_verify(SSL.verify_none, -1)
                return context
            else:
                return ssl_context
        def create_https_connection(self, host, response_class = None):
            # NOTE(review): ssl_context is passed as HTTPSConnection's second
            # positional argument (the port, in stdlib httplib) -- confirm
            # this targets M2Crypto's httpslib-compatible signature.
            connection = httplib.HTTPSConnection(host, self.ssl_context)
            if response_class:
                connection.response_class = response_class
            return connection
        def create_opener(self, *handlers):
            return m2urllib2.build_opener(self.ssl_context, *handlers)
class SSLFactory:
    """Fallback factory using Python's built-in SSL support, used when
    M2Crypto is unavailable (no CA certificate verification)."""
    def create_https_connection(self, host, response_class = None):
        """Return an HTTPSConnection to *host*, optionally overriding
        its response_class."""
        conn = httplib.HTTPSConnection(host)
        if response_class:
            conn.response_class = response_class
        return conn
    def create_opener(self, *handlers):
        """Return a urllib2 opener built from *handlers*."""
        return urllib2.build_opener(*handlers)
def get_factory(ssl_ca_cert = None, ssl_context = None):
    """Return an M2SSLFactory when M2Crypto is available, otherwise a
    plain SSLFactory (which cannot honor the SSL arguments)."""
    if have_m2crypto:
        return M2SSLFactory(ssl_ca_cert, ssl_context)
    # SSL options were supplied but cannot be used without M2Crypto.
    if (ssl_ca_cert or ssl_context) and DEBUG:
        DEBUG.warning("SSL arguments supplied, but M2Crypto is not available. "
                      "Using Python SSL.")
    return SSLFactory()