-rw-r--r-- | ChangeLog                                    |  246
-rw-r--r-- | LICENSE                                      |  458
-rw-r--r-- | MANIFEST                                     |   21
-rw-r--r-- | PKG-INFO                                     |   48
-rw-r--r-- | README                                       |   31
-rw-r--r-- | TODO                                         |   50
-rw-r--r-- | makefile                                     |   51
-rw-r--r-- | packaging/python-urlgrabber.changes          |  101
-rw-r--r-- | packaging/python-urlgrabber.manifest         |    5
-rw-r--r-- | packaging/python-urlgrabber.spec             |   61
-rw-r--r-- | packaging/urlgrabber-HEAD.patch              |  142
-rw-r--r-- | packaging/urlgrabber-libproxy-httponly.patch |   83
-rw-r--r-- | scripts/urlgrabber                           |  329
-rw-r--r-- | setup.py                                     |   45
-rw-r--r-- | test/base_test_code.py                       |   33
-rw-r--r-- | test/grabberperf.py                          |  137
-rw-r--r-- | test/munittest.py                            |  934
-rw-r--r-- | test/runtests.py                             |   60
-rw-r--r-- | test/test_byterange.py                       |  162
-rw-r--r-- | test/test_grabber.py                         |  607
-rw-r--r-- | test/test_mirror.py                          |  275
-rw-r--r-- | test/threading/batchgrabber.py               |  110
-rw-r--r-- | urlgrabber/__init__.py                       |   54
-rw-r--r-- | urlgrabber/byterange.py                      |  463
-rw-r--r-- | urlgrabber/grabber.py                        | 1730
-rw-r--r-- | urlgrabber/mirror.py                         |  455
-rw-r--r-- | urlgrabber/progress.py                       |  755
27 files changed, 7446 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..644fbdb
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,246 @@
+2009-09-25 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/__init__.py: bump version to 3.9.1
+
+2009-09-25 Seth Vidal <skvidal@fedoraproject.org>
+
+	* makefile: clean up everything in make clean
+
+2009-09-25 Seth Vidal <skvidal@fedoraproject.org>
+
+	* test/runtests.py, test/test_grabber.py, test/test_keepalive.py,
+	urlgrabber/__init__.py, urlgrabber/byterange.py,
+	urlgrabber/grabber.py, urlgrabber/keepalive.py,
+	urlgrabber/mirror.py, urlgrabber/progress.py,
+	urlgrabber/sslfactory.py: cleanup all the old urlgrabber urllib code
+	that's not being used delete sslfactory and keepalive fix up the
+	unittests to match the existing code
+
+2009-09-24 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: update documentation for ssl options and
+	size/max_header_size options
+
+2009-09-23 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: - fix the reget testcases (and regets in
+	general) with the max size check - make the errorcode more obvious
+	when we go over the range - obviously don't do the check if all of
+	our max values are None (or even 0 since that is a silly number
+	for a Max)
+
+2009-09-22 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: handle endless-data problems safely: "A
+	malicious server could cause libcurl to download an infinite amount
+	of data, potentially causing all of memory or disk to be filled.
+	Setting the CURLOPT_MAXFILESIZE_LARGE option is not sufficient to
+	guard against this. Instead, the app should monitor the amount of
+	data received within the write or progress callback and abort once
+	the limit is reached." had to restructure a good bit of the error
+	handling to do this but it works for both endless headers and
+	endless content.
+
+2009-09-21 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: make sure the value we get back from the
+	parse150 and other calls is converted to an int before we make it
+	'size' rhbug: #524705
+
+2009-09-02 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: make file:// url not found msgs clearer
+	and hopefully fix a couple of ctrl-c issues.
+
+2009-08-27 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: make proxy=_none_ properly disable all
+	proxies as per the docs
+
+2009-08-14 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: - add full contingent of ssl options: -
+	client keys - client certs - capath/cainfo - client key passwords
+	- client key and cert types - verifypeer/verifyhost - add a number
+	of common errors to do_perform() - when an error is unknown, and
+	doesn't make sense report complete pycurl error code - when the
+	filename is '' and not None and we're doing a urlgrab() try to
+	open the file anyway rather than silently swallowing the data into
+	a StringIO and discarding it.
+
+2009-08-13 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: add _to_utf8() method to pycurlfileobject
+	make sure postfield data is to_utf8'd before setting the option
+	otherwise pycurl is unhappy if the postfield data is a unicode
+	object instead of a string object. closes rh bug
+	https://bugzilla.redhat.com/show_bug.cgi?id=515797
+
+2009-08-12 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: initial pass at setting more advanced ssl
+	options. verify peer and verify host work as expected.
+
+2009-08-07 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: keep from making tmpfiles all over /tmp on
+	any local file:// urlopen() by doing it in StringIO instead of
+	mkstemp(). Sort of fixes
+	https://bugzilla.redhat.com/show_bug.cgi?id=516178
+
+2009-08-06 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: - fix intrrupt handler and document why
+	keyboardinterrupt is going to be so weird in pycurl - disable
+	signals and make sure we don't handle/intercept any in the pycurl
+	code. - set 'check_timestamp' regets as NotImplemented. The work
+	around is multiple connections. it is possible but not immediately
+	useful since, afaict, NOTHING uses the check_timestamp regets.
+
+2009-08-05 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: - make sure regets work when our filename
+	is unicode - make sure we are not resetting self.append = False
+	when we don't need to
+
+2009-08-05 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: - make sure we tell pycurl to get the
+	filetime when downloading - set a couple of options as 'True/False'
+	instead of 1,0 - for readability - make sure the option passed to
+	timeout is an int - not a string
+
+2009-08-04 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: missed setting the value from opts.timeout
+	- doesn't really HURT what will happen b/c if your connect takes
+	longer than 5minutes then you're SCREWED
+
+2009-08-04 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: handle timeouts more correctly (with the
+	exception) and set timeouts to be connect timeouts since libcurl
+	seems to actually honor timeouts - as opposed to urllib. closes rh
+	bug # 515497
+
+2009-07-31 Seth Vidal <skvidal@fedoraproject.org>
+
+	* ChangeLog, makefile, urlgrabber/__init__.py: changelog + release
+	date touchup
+
+2009-07-31 Seth Vidal <skvidal@fedoraproject.org>
+
+	* makefile: add a few more things to be cleaned out
+
+2009-07-31 Seth Vidal <skvidal@fedoraproject.org>
+
+	* ChangeLog: update changelog
+
+2009-07-31 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: - make readlines() work for mirrorlists in
+	yum (which probably shouldn't be using it anyway) - do a do_grab()
+	in _do_open() which may or may not be a good idea - I could also
+	make the _do_grab() happen when someone attempts to hit a method
+	beyond the file object open
+
+2009-07-30 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: - make basic posts work
+
+2009-07-30 Seth Vidal <skvidal@fedoraproject.org>
+
+	* maint/git2cl: add git2cl
+
+2009-07-30 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: when I first started this I hacked
+	something into URLGrabberFileObject - this reverts that hack
+
+2009-07-30 Seth Vidal <skvidal@fedoraproject.org>
+
+	* ChangeLog, maint/cvs2cl.pl, maint/usermap, test/runtests.py,
+	urlgrabber/__init__.py: - clean up some unused files - update the
+	changelog - bump the version - update the copyright in a couple of
+	places
+
+2009-07-30 Seth Vidal <skvidal@fedoraproject.org>
+
+	* MANIFEST.in, makefile: - make makefile work again without using
+	cvs - add makefile to MANIFEST.in
+
+2009-07-30 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: - make simple/most proxies work - remove
+	unnnecessary 'have_range' check for pycyurl obj
+
+2009-07-29 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: - add range support - get rid of the .part
+	file thing - it makes range-regets harder than they need to be -
+	make sure regets behave
+
+2009-07-29 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: implement throttle/bandwidth controls in
+	pycurl tested with the progress call back - seems to work very well
+
+2009-07-29 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: get the content-length/size for ftp pkgs
+	too - steals parse150 from ftplib. Should work for A LOT of ftp
+	servers, but not all of them - add self.scheme for which protocol
+	we're using here.
+
+2009-07-29 James Antill <james@and.org>
+
+	* urlgrabber/byterange.py: Import fix for ftp ports in old urilib
+	code (probably worthless now, but meh)
+
+2009-07-29 James Antill <james@and.org>
+
+	* urlgrabber/progress.py: Import progress patches from Fedora.
+	These were done over a couple of years: . cleanup UI. . dynamic
+	terminal widths. . deal with serial console. . total download
+	stuff.
+
+2009-07-28 Seth Vidal <skvidal@fedoraproject.org>
+
+	* test/runtests.py, urlgrabber/grabber.py: implement
+	PyCurlFileObject. This makes the default and forklifts all the code
+	to pycurl. This is not finished but is functional for a significant
+	number of the tests. things known to be broken: - proxies - http
+	POST - non-header-based byte-ranges - certain types of read
+	operations when downloading a file to memory instead of to a
+	filename
+
+2009-05-15 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: make it use *args instead of silly if
+	statements
+
+2009-05-15 Seth Vidal <skvidal@fedoraproject.org>
+
+	* urlgrabber/grabber.py: modify urlgraberror so it has a url
+	attribute and includes the url in all error messages.
+
+2006-12-12 mstenner <mstenner>
+
+	* urlgrabber/grabber.py: more debugging code to expose options
+
+2006-12-08 mstenner <mstenner>
+
+	* scripts/urlgrabber, test/test_grabber.py, urlgrabber/grabber.py,
+	urlgrabber/keepalive.py: lots of changes... improved clarity of
+	cached objects, improved debugging and logging, more options to the
+	urlgrabber script.
+
+2006-12-07 mstenner <mstenner>
+
+	* scripts/urlgrabber, urlgrabber/grabber.py: Minor doc updates and
+	error handling in grabber.py. Complete rewrite of the urlgrabber
+	script.
+
+2006-12-05 mstenner <mstenner>
+
+	* Minor fix to make byteranges work with some servers. _do_grab now
+	only reads as much as it needs to, rather than reading until the
+	server sends EOF.
diff --git a/LICENSE b/LICENSE
new file mode 100644
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,458 @@
+		  GNU LESSER GENERAL PUBLIC LICENSE
+		       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.
You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. 
These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. 
You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) 
Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. 
If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. 
You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. 
If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. 
+
+		     END OF TERMS AND CONDITIONS
diff --git a/MANIFEST b/MANIFEST
new file mode 100644
index 0000000..dd51f15
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,21 @@
+ChangeLog
+LICENSE
+MANIFEST
+README
+TODO
+makefile
+setup.py
+scripts/urlgrabber
+test/base_test_code.py
+test/grabberperf.py
+test/munittest.py
+test/runtests.py
+test/test_byterange.py
+test/test_grabber.py
+test/test_mirror.py
+test/threading/batchgrabber.py
+urlgrabber/__init__.py
+urlgrabber/byterange.py
+urlgrabber/grabber.py
+urlgrabber/mirror.py
+urlgrabber/progress.py
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..1368b10
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,48 @@
+Metadata-Version: 1.0
+Name: urlgrabber
+Version: 3.9.1
+Summary: A high-level cross-protocol url-grabber
+Home-page: http://linux.duke.edu/projects/urlgrabber/
+Author: Michael D. Stenner, Ryan Tomayko
+Author-email: mstenner@linux.duke.edu, skvidal@fedoraproject.org
+License: LGPL
+Description: A high-level cross-protocol url-grabber.
+
+        Using urlgrabber, data can be fetched in three basic ways:
+
+          urlgrab(url)  copy the file to the local filesystem
+          urlopen(url)  open the remote file and return a file object
+                        (like urllib2.urlopen)
+          urlread(url)  return the contents of the file as a string
+
+        When using these functions (or methods), urlgrabber supports the
+        following features:
+
+          * identical behavior for http://, ftp://, and file:// urls
+          * http keepalive - faster downloads of many files by using
+            only a single connection
+          * byte ranges - fetch only a portion of the file
+          * reget - for a urlgrab, resume a partial download
+          * progress meters - the ability to report download progress
+            automatically, even when using urlopen!
+          * throttling - restrict bandwidth usage
+          * retries - automatically retry a download if it fails. The
+            number of retries and failure types are configurable.
+          * authenticated server access for http and ftp
+          * proxy support - support for authenticated http and ftp proxies
+          * mirror groups - treat a list of mirrors as a single source,
+            automatically switching mirrors if there is a failure.
+
+Platform: UNKNOWN
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Environment :: Web Environment
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: System Administrators
+Classifier: License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)
+Classifier: Operating System :: POSIX
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Programming Language :: Python
+Classifier: Topic :: Internet :: File Transfer Protocol (FTP)
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
diff --git a/README b/README
new file mode 100644
--- /dev/null
+++ b/README
@@ -0,0 +1,31 @@
+urlgrabber -- A high-level cross-protocol url-grabber
+
+INSTALLATION INSTRUCTIONS
+
+If you want to install urlgrabber on your system, simply open the package
+and run:
+
+    python setup.py install
+
+Take a look at the install options by doing
+
+    python setup.py install --help
+
+With no arguments, distutils will add all modules to a 'urlgrabber'
+package under your python site-packages directory.
+
+You can build rpms by running
+
+    python setup.py bdist_rpm
+
+The rpms (both source and "binary") will be specific to the current
+distribution/version and may not be portable to others. This is
+because they will be built for the currently installed python.
+
+keepalive.py and byterange.py are generic urllib2 extension modules and
+can be used to add keepalive and range support to any urllib2
+application.
+
+As of version 2.9.x, urlgrabber is no longer tested with python
+versions less than 2.2. It will probably work with 2.0, but will
+almost certainly NOT work under prior python versions.
diff --git a/TODO b/TODO
new file mode 100644
--- /dev/null
+++ b/TODO
@@ -0,0 +1,50 @@
+ALPHA 2:
+
+ * web page
+   - better examples page
+
+ * threading/batch
+   - (rt) propose an interface for threaded batch downloads
+   - (mds) design a new progress-meter interface for threaded
+     multi-file downloads
+   - (rt) look at CacheFTPHandler and its implications for batch mode
+     and byte-ranges/reget
+
+ * progress meter stuff
+   - support for retrying a file (in a MirrorGroup, for example)
+   - failure support (done?)
+   - support for when we have less information (no sizes, etc)
+   - check compatibility with gui interfaces
+   - starting a download with some parts already read (with reget,
+     for example)
+
+ * look at making the 'check_timestamp' reget mode work with ftp.
+   Currently, we NEVER get a timestamp back, so we can't compare.
+   We'll probably need to subclass/replace either the urllib2 FTP handler
+   or the ftplib FTP object (or both, but I doubt it). It may or may not
+   be worth it just for this one mode of reget. It fails safely - by
+   getting the entire file.
+
+ * cache dns lookups -- for a possible approach, see
+   https://lists.dulug.duke.edu/pipermail/yum-devel/2004-March/000136.html
+
+Misc/Maybe:
+
+ * BatchURLGrabber/BatchMirrorGroup for concurrent downloads and possibly to
+   handle forking into secure/setuid sandbox.
+
+ * Consider adding a progress_meter implementation that can be used in
+   concurrent download situations (I have some ideas about this -mds)
+
+ * Consider using CacheFTPHandler instead of FTPHandler in byterange.py.
+   CacheFTPHandler reuses connections but this may lead to problems with
+   ranges. I've tested CacheFTPHandler with ranges using vsftpd as a
+   server and everything works fine but this needs more exhaustive tests
+   or a fallback mechanism. Also, CacheFTPHandler breaks with multiple
+   threads.
+
+ * Consider some statistics tracking so that urlgrabber can record the
+   speed/reliability of different servers. This could then be used by
+   the mirror code for choosing optimal servers (slick, eh?)
+
+ * check SSL certs. This may require PyOpenSSL.
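
As a quick illustration of the three fetch styles described in PKG-INFO
above, here is a minimal sketch. It assumes urlgrabber 3.9.x on Python 2;
the URL, filename, and option values are placeholders:

    from urlgrabber import urlgrab, urlopen, urlread

    # copy the remote file to the local filesystem; returns the local path
    path = urlgrab('http://example.com/file.txt', filename='file.txt',
                   retry=3, throttle=10240)

    # open the remote file and read it like a local file object
    fo = urlopen('http://example.com/file.txt')
    data = fo.read()
    fo.close()

    # return the contents of the file as a string
    text = urlread('http://example.com/file.txt')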
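
PKG-INFO also lists mirror groups among the features; a similar sketch of
that interface, with placeholder mirror URLs and a placeholder relative
path:

    from urlgrabber.grabber import URLGrabber
    from urlgrabber.mirror import MirrorGroup

    g = URLGrabber(retry=2)
    mg = MirrorGroup(g, ['http://mirror1.example.com/pub/',
                         'http://mirror2.example.com/pub/'])
    # the relative path is tried against each mirror in turn
    # until one of them succeeds
    path = mg.urlgrab('packages/foo-1.0.tar.gz')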
diff --git a/makefile b/makefile new file mode 100644 index 0000000..caa0f9e --- /dev/null +++ b/makefile @@ -0,0 +1,51 @@ +PACKAGE = urlgrabber +RM = /bin/rm -rf +GIT = /usr/bin/git +WEBHOST = login.dulug.duke.edu +WEBPATH = /home/groups/urlgrabber/web/download +PYTHON = python +PY_MODULE = $(PACKAGE) +SCM_MODULE = $(PACKAGE) +CLEANFILES = MANIFEST *~ build dist export release daily reference nonexistent_file ChangeLog.bak \ + *.pyc urlgrabber/*.pyc scripts/*.pyc test/*.pyc test/nonexistent_file \ + test/reference test/reference.part urlgrabber/*~ +############################################################################## +VERSION = $(shell $(PYTHON) -c 'import $(PY_MODULE); print $(PY_MODULE).__version__') +DATE = $(shell $(PYTHON) -c 'import $(PY_MODULE); print $(PY_MODULE).__date__') +SCM_TAG = release-$(shell echo $(VERSION) | sed -e 's/\./_/g') +PYTHON22 = $(shell /usr/bin/which python2.2 2>/dev/null) +PYTHON23 = $(shell /usr/bin/which python2.3 2>/dev/null) +PYTHON24 = $(shell /usr/bin/which python2.4 2>/dev/null) +PYTHON25 = $(shell /usr/bin/which python2.5 2>/dev/null) +TESTPYTHONS = $(PYTHON22) $(PYTHON23) $(PYTHON24) $(PYTHON25) +############################################################################## + +default: + @echo TARGETS: changelog release clean test + +changelog: + $(GIT) log --since=2006-12-01 --pretty --numstat --summary | maint/git2cl > ChangeLog + +# NOTE: do --manifest-only first even though we're about to force it. The +# former ensures that MANIFEST exists (touch would also do the trick). If +# the file 'MANIFEST' doesn't exist, then it won't be included the next time +# it's built from MANIFEST.in +release: FORCE pre-release-test + @dir=$$PWD; $(PYTHON) setup.py sdist --manifest-only + @dir=$$PWD; $(PYTHON) setup.py sdist --force-manifest + @echo "The archive is in dist/${PACKAGE}-$(VERSION).tar.gz" + +pre-release-test: + @echo "You should make sure you've updated the changelog" + @echo "version = $(VERSION), date = $(DATE), tag = $(SCM_TAG)" + test $(DATE) = `date +'%Y/%m/%d'` # verify release date is set to today + +clean: + $(RM) $(CLEANFILES) + +test: FORCE + @export PYTHONPATH=.; \ + $(PYTHON) test/runtests.py -v 1; \ + +FORCE: + diff --git a/packaging/python-urlgrabber.changes b/packaging/python-urlgrabber.changes new file mode 100644 index 0000000..e81b8ac --- /dev/null +++ b/packaging/python-urlgrabber.changes @@ -0,0 +1,101 @@ +* Tue Sep 06 2011 William Douglas <william.douglas@intel.com> - 3.9.1 +- Fix missing prefix + +* Fri Mar 11 2011 Yan Li <yan.i.li@intel.com> - 3.9.1 +- When libproxy fails, fall back to environment variables (BMC#13757) + +* Sat Jan 30 2010 Jian-feng Ding <jian-feng.ding@intel.com> 3.9.1 +- Upgrade to 3.9.1 and enable spectacle + Dropped all old patches except David's libproxy one + Integreted the updated version of libproxy patch, found in: + https://bugzilla.redhat.com/show_bug.cgi?id=542224 + +* Sat Dec 26 2009 David Woodhouse <David.Woodhouse@intel.com> 3.1.0 +- Use libproxy to find proxy information + +* Sun Apr 12 2009 Peter Zhu<peter.j.zhu@intel.com> 3.1.0 +- Add dependency to m2crypto so that it can works with https connection fixed bug #1289 + + +* Fri Feb 20 2009 Zhu Yanhai<yanhai.zhu@intel.com> 3.1.0 +- Correct SOURCE +- Version update to 3.1.0 + +* Mon Apr 7 2008 James Antill <james@fedoraproject.org> 3.0.0-6 +- Fix the ftp byterange port problem: +- Resolves: bug#419241 +- Fixup the progress UI: +- add function for total progress +- add total progress percentagee current download line +- add rate to current 
download line +- use dead space when finished downloading +- don't confuse download rate on regets. + +* Sat Mar 15 2008 Robert Scheck <robert@fedoraproject.org> 3.0.0-5 +- Make sure, that *.egg-info is catched up during build + +* Mon Dec 3 2007 Jeremy Katz <katzj@redhat.com> - 3.0.0-4 +- Ensure fds are closed on exceptions (markmc, #404211) + +* Wed Oct 10 2007 Jeremy Katz <katzj@redhat.com> - 3.0.0-3 +- fix type checking of strings to also include unicode strings; fixes + regets from yum (#235618) + +* Mon Aug 27 2007 Jeremy Katz <katzj@redhat.com> - 3.0.0-2 +- fixes for package review (#226347) + +* Thu May 31 2007 Jeremy Katz <katzj@redhat.com> - 3.0.0-1 +- update to 3.0.0 + +* Wed Dec 6 2006 Jeremy Katz <katzj@redhat.com> - 2.9.9-5 +- rebuild for python 2.5 + +* Wed Dec 6 2006 Jeremy Katz <katzj@redhat.com> - 2.9.9-4 +- fix keepalive (#218268) + +* Sat Nov 11 2006 Florian La Roche <laroche@redhat.com> +- add version/release to "Provides: urlgrabber" + +* Mon Jul 17 2006 James Bowes <jbowes@redhat.com> - 2.9.9-2 +- Add support for byte ranges and keepalive over HTTPS + +* Wed Jul 12 2006 Jesse Keating <jkeating@redhat.com> - 2.9.9-1.1 +- rebuild + +* Tue May 16 2006 Jeremy Katz <katzj@redhat.com> - 2.9.9-1 +- update to 2.9.9 + +* Tue Mar 14 2006 Jeremy Katz <katzj@redhat.com> - 2.9.8-2 +- catch read errors so they trigger the failure callback. helps catch bad cds + +* Wed Feb 22 2006 Jeremy Katz <katzj@redhat.com> - 2.9.8-1 +- update to new version fixing progress bars in yum on regets + +* Fri Dec 09 2005 Jesse Keating <jkeating@redhat.com> +- rebuilt + +* Wed Sep 21 2005 Jeremy Katz <katzj@redhat.com> - 2.9.6-4 +- don't use --record and list files by hand so that we don't miss + directories (#158480) + +* Wed Sep 14 2005 Jeremy Katz <katzj@redhat.com> - 2.9.6-3 +- add directory to file list (#168261) + +* Fri Jun 03 2005 Phil Knirsch <pknirsch@redhat.com> 2.9.6-2 +- Fixed the reget method to actually work correctly (skip completely transfered + files, etc) + +* Tue Mar 8 2005 Jeremy Katz <katzj@redhat.com> - 2.9.6-1 +- update to 2.9.6 + +* Mon Mar 7 2005 Jeremy Katz <katzj@redhat.com> - 2.9.5-1 +- import into dist +- make the description less of a book + +* Mon Mar 7 2005 Seth Vidal <skvidal@phy.duke.edu> 2.9.5-0 +- 2.9.5 + +* Thu Feb 24 2005 Seth Vidal <skvidal@phy.duke.edu> 2.9.3-0 +- first package for fc3 +- named python-urlgrabber for naming guideline compliance + diff --git a/packaging/python-urlgrabber.manifest b/packaging/python-urlgrabber.manifest new file mode 100644 index 0000000..017d22d --- /dev/null +++ b/packaging/python-urlgrabber.manifest @@ -0,0 +1,5 @@ +<manifest> + <request> + <domain name="_"/> + </request> +</manifest> diff --git a/packaging/python-urlgrabber.spec b/packaging/python-urlgrabber.spec new file mode 100644 index 0000000..92a1516 --- /dev/null +++ b/packaging/python-urlgrabber.spec @@ -0,0 +1,61 @@ + +%{!?python_sitearch: %define python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print get_python_lib(1)")} +Name: python-urlgrabber +Summary: A high-level cross-protocol url-grabber +Version: 3.9.1 +Release: 1 +Group: Development/Libraries +License: LGPLv2+ +BuildArch: noarch +URL: http://urlgrabber.baseurl.org/ +Source0: %{name}-%{version}.tar.gz +Source1001: packaging/python-urlgrabber.manifest +Patch0: urlgrabber-HEAD.patch +Requires: python-pycurl +#Requires: m2crypto +#Requires: libproxy-python +BuildRequires: python-devel +BuildRequires: python-pycurl +Provides: urlgrabber = %{version}-%{release} + +BuildRoot: 
%{_tmppath}/%{name}-%{version}-build + +%description +A high-level cross-protocol url-grabber for python supporting HTTP, FTP +and file locations. Features include keepalive, byte ranges, throttling, +authentication, proxies and more. + + + + +%prep +%setup -q +%patch0 -p1 + +%build +cp %{SOURCE1001} . + +CFLAGS="$RPM_OPT_FLAGS" %{__python} setup.py build + +%install +rm -rf %{buildroot} +%{__python} setup.py install --root=%{buildroot} -O1 --prefix=%{_prefix} + +rm -rf $RPM_BUILD_ROOT/%{_docdir}/urlgrabber-%{version} + +%clean +rm -rf %{buildroot} + + + + + + +%files +%manifest python-urlgrabber.manifest +%defattr(-,root,root,-) +%doc ChangeLog LICENSE README TODO +%{python_sitelib}/urlgrabber* +%{_bindir}/urlgrabber + + diff --git a/packaging/urlgrabber-HEAD.patch b/packaging/urlgrabber-HEAD.patch new file mode 100644 index 0000000..90180d2 --- /dev/null +++ b/packaging/urlgrabber-HEAD.patch @@ -0,0 +1,142 @@ +diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py +index e090e90..a26880c 100644 +--- a/urlgrabber/grabber.py ++++ b/urlgrabber/grabber.py +@@ -439,6 +439,12 @@ try: + except: + __version__ = '???' + ++try: ++ # this part isn't going to do much - need to talk to gettext ++ from i18n import _ ++except ImportError, msg: ++ def _(st): return st ++ + ######################################################################## + # functions for debugging output. These functions are here because they + # are also part of the module initialization. +@@ -1052,7 +1058,8 @@ class PyCurlFileObject(): + self._reget_length = 0 + self._prog_running = False + self._error = (None, None) +- self.size = None ++ self.size = 0 ++ self._hdr_ended = False + self._do_open() + + +@@ -1085,9 +1092,14 @@ class PyCurlFileObject(): + return -1 + + def _hdr_retrieve(self, buf): ++ if self._hdr_ended: ++ self._hdr_dump = '' ++ self.size = 0 ++ self._hdr_ended = False ++ + if self._over_max_size(cur=len(self._hdr_dump), + max_size=self.opts.max_header_size): +- return -1 ++ return -1 + try: + self._hdr_dump += buf + # we have to get the size before we do the progress obj start +@@ -1104,7 +1116,17 @@ class PyCurlFileObject(): + s = parse150(buf) + if s: + self.size = int(s) +- ++ ++ if buf.lower().find('location') != -1: ++ location = ':'.join(buf.split(':')[1:]) ++ location = location.strip() ++ self.scheme = urlparse.urlsplit(location)[0] ++ self.url = location ++ ++ if len(self._hdr_dump) != 0 and buf == '\r\n': ++ self._hdr_ended = True ++ if DEBUG: DEBUG.info('header ended:') ++ + return len(buf) + except KeyboardInterrupt: + return pycurl.READFUNC_ABORT +@@ -1136,6 +1158,7 @@ class PyCurlFileObject(): + self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) + self.curl_obj.setopt(pycurl.FAILONERROR, True) + self.curl_obj.setopt(pycurl.OPT_FILETIME, True) ++ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) + + if DEBUG: + self.curl_obj.setopt(pycurl.VERBOSE, True) +@@ -1291,7 +1314,12 @@ class PyCurlFileObject(): + raise err + + elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it +- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url) ++ if self.scheme in ['http', 'https']: ++ msg = 'HTTP Error %s : %s ' % (self.http_code, self.url) ++ elif self.scheme in ['ftp']: ++ msg = 'FTP Error %s : %s ' % (self.http_code, self.url) ++ else: ++ msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme) + else: + msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1])) + code = errcode +@@ -1299,6 +1327,12 @@ class PyCurlFileObject(): + err.code = code + 
err.exception = e + raise err ++ else: ++ if self._error[1]: ++ msg = self._error[1] ++ err = URLGRabError(14, msg) ++ err.url = self.url ++ raise err + + def _do_open(self): + self.curl_obj = _curl_cache +@@ -1532,11 +1566,14 @@ class PyCurlFileObject(): + def _over_max_size(self, cur, max_size=None): + + if not max_size: +- max_size = self.size +- if self.opts.size: # if we set an opts size use that, no matter what +- max_size = self.opts.size ++ if not self.opts.size: ++ max_size = self.size ++ else: ++ max_size = self.opts.size ++ + if not max_size: return False # if we have None for all of the Max then this is dumb +- if cur > max_size + max_size*.10: ++ ++ if cur > int(float(max_size) * 1.10): + + msg = _("Downloaded more than max size for %s: %s > %s") \ + % (self.url, cur, max_size) +@@ -1582,7 +1619,11 @@ class PyCurlFileObject(): + self.opts.progress_obj.end(self._amount_read) + self.fo.close() + +- ++ def geturl(self): ++ """ Provide the geturl() method, used to be got from ++ urllib.addinfourl, via. urllib.URLopener.* """ ++ return self.url ++ + _curl_cache = pycurl.Curl() # make one and reuse it over and over and over + + +diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py +index dd07c6a..45eb248 100644 +--- a/urlgrabber/progress.py ++++ b/urlgrabber/progress.py +@@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0): + if seconds is None or seconds < 0: + if use_hours: return '--:--:--' + else: return '--:--' ++ elif seconds == float('inf'): ++ return 'Infinite' + else: + seconds = int(seconds) + minutes = seconds / 60 diff --git a/packaging/urlgrabber-libproxy-httponly.patch b/packaging/urlgrabber-libproxy-httponly.patch new file mode 100644 index 0000000..ffed3ba --- /dev/null +++ b/packaging/urlgrabber-libproxy-httponly.patch @@ -0,0 +1,83 @@ +From 2dc04b08536ad0a17147fc91eacb3dc8e87a755b Mon Sep 17 00:00:00 2001 +Message-Id: <2dc04b08536ad0a17147fc91eacb3dc8e87a755b.1299826695.git.yan.i.li@intel.com> +From: Yan Li <yan.i.li@intel.com> +Date: Fri, 11 Mar 2011 14:49:28 +0800 +Subject: [PATCH] libproxy support + +Based on David Woodhouse <dwmw2@infradead.org>'s work. + +Signed-off-by: Yan Li <yan.i.li@intel.com> +--- + urlgrabber/grabber.py | 35 ++++++++++++++++++++++++++++++++--- + 1 files changed, 32 insertions(+), 3 deletions(-) + +diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py +index e090e90..a4e3e04 100644 +--- a/urlgrabber/grabber.py ++++ b/urlgrabber/grabber.py +@@ -143,8 +143,17 @@ GENERAL ARGUMENTS (kwargs) + note that proxy authentication information may be provided using + normal URL constructs: + proxies={ 'http' : 'http://user:host@foo:3128' } +- Lastly, if proxies is None, the default environment settings will +- be used. ++ If proxies is None, the proxy_factory (described below) will be used. ++ ++ proxy_factory = libproxy.ProxyFactory() ++ ++ a libproxy ProxyFactory object. This is initialised to a default ++ global ProxyFactory if libproxy is installed, but can be ++ overridden to None to disable libproxy, or indeed to anything with ++ a getProxies() method that takes a URL and returns a list of ++ potential proxies. The proxy_factory is is only used if the ++ proxies dictionary is not set. If both proxies and proxy_factory ++ are None, the default environment variable will be used. + + prefix = None + +@@ -439,6 +448,12 @@ try: + except: + __version__ = '???' 
+ ++try: ++ import libproxy ++ _grabber_proxy_factory = libproxy.ProxyFactory() ++except: ++ _grabber_proxy_factory = None ++ + ######################################################################## + # functions for debugging output. These functions are here because they + # are also part of the module initialization. +@@ -802,6 +817,7 @@ class URLGrabberOptions: + self.user_agent = 'urlgrabber/%s' % __version__ + self.keepalive = 1 + self.proxies = None ++ self.proxy_factory = _grabber_proxy_factory + self.reget = None + self.failure_callback = None + self.interrupt_callback = None +@@ -1202,7 +1218,20 @@ class PyCurlFileObject(): + else: + if proxy == '_none_': proxy = "" + self.curl_obj.setopt(pycurl.PROXY, proxy) +- ++ elif opts.proxy_factory: ++ try: ++ proxies = opts.proxy_factory.getProxies(self.url); ++ for proxy in proxies: ++ if proxy.startswith('http://'): ++ if DEBUG: DEBUG.info('using proxy "%s" for url %s' % \ ++ (proxy, self.url)) ++ self.curl_obj.setopt(pycurl.PROXY, proxy) ++ break ++ except: ++ # libproxy may fail, and in that case we just fall ++ # back to next proxy supplier (environment variables) ++ opts.proxy_factory = None ++ + # FIXME username/password/auth settings + + #posts - simple - expects the fields as they are +-- +1.7.4.1 + diff --git a/scripts/urlgrabber b/scripts/urlgrabber new file mode 100644 index 0000000..518e512 --- /dev/null +++ b/scripts/urlgrabber @@ -0,0 +1,329 @@ +#!/usr/bin/python -t + +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko + +"""NAME + urlgrabber - a simple client for the urlgrabber python package + +DESCRIPTION + This is a thin client for the urlgrabber python package. It is + provided mainly for helping debug the python package. It provides + low-level access to most urlgrabber features from the shell. + + There are two types of options available for this program. They are + 'client options' and 'module options'. Client options apply + specifically to the behavior of this client, whereas module options + are built-in options to the urlgrabber module. Both of these are + avaible from the client command line, but they're documented a + little differently. Client options are documented here, and module + options are documented through the '--help <option>' syntax. + +CLIENT OPTIONS + -h, --help + + In its short form, (-h) this provides a short usage discription. + In its long form, it processes the remaining command line as help + topics. 
Legal topics are:
+      doc          this info
+      options      a list of module options
+      <option(s)>  documentation for a given module option
+      all          documentation for all module options
+
+    Examples:
+      urlgrabber --help options
+      urlgrabber --help copy_local
+
+  -o FILE
+
+    By default, downloaded data will be written to a file named using
+    the basename of the url.  For example,
+    'http://foo.com/index.html' will be written to 'index.html'.  You
+    can override this for your convenience or when necessary for urls
+    like 'http://foo.com/'.
+
+  -O
+
+    Print the local name of each downloaded file to STDOUT.  This is
+    helpful when not using the '-o' option, but is particularly
+    useful when using --copy_local=0 (the default) on local files
+    because local files will not be copied and the output filename
+    will not be the same as that provided with '-o'.
+
+  --repeat=N
+
+    Grab each url N times.  This is mostly for debugging keepalive.
+
+  -p, --progress
+
+    Use the default text-based progress meter.
+
+  -v, --verbose=N
+
+    Increment the verbosity level with each use of '-v' or set it
+    directly with --verbose=N.  Currently, distinct levels are 0-2.
+    The default is 0.
+
+  -d SPEC, --debug=SPEC
+
+    Turn on internal urlgrabber debugging.  This is equivalent to (but
+    overrides) running with the environment variable:
+        URLGRABBER_DEBUG=SPEC
+    SPEC can be of the form LEVEL,FILENAME, where
+      LEVEL can be a string (DEBUG, WARN, etc) or a number.
+      FILENAME can be the name of a file or "-" for STDOUT.  The
+        default is STDERR.  Example: -d1,- logs everything to STDOUT.
+    Note: this only works for python >= 2.3 because it requires the
+    logging package.
+
+  -D
+
+    A convenience alias for: --verbose=3 --progress --debug=INFO,-
+
+  --profile
+
+    Profile the actual fetching and print the results.
+
+"""
+
+MAINHELP = """usage: urlgrabber [options] <url>
+urlgrabber - a simple client for the urlgrabber python package
+
+options:
+  -h, --help        print this message
+  --help doc        print basic intro and documentation
+  --help options    list available options to the grabber module
+  --help <option>   print documentation for a module option
+  --help all        print documentation for all module options
+  --<option>=VAL    specify a module option.  VAL must be a python value,
+                    including quotes in the case of strings.
+                    e.g.  --user_agent='"foobar/2.0"'
+
+  -o FILE           write output to FILE, otherwise the basename of the
+                    url will be used
+  -O                print the names of saved files to STDOUT
+  --repeat=N        grab each URL N times (mostly for debugging keepalive)
+  -p, --progress    use the default text progress meter
+  -v                increment the verbosity level (defaults to 0)
+  --verbose=N       set the verbosity level to N
+  -d SPEC, --debug=SPEC
+                    turn on urlgrabber module debugging with
+                    SPEC=LEVEL,FILENAME.  e.g.
-d 1,debug.txt
+  -D                a convenience option equivalent to:
+                      --verbose=3 --progress --debug=INFO,-
+  --profile         profile the actual fetching and print the results
+  """
+
+# $Id: urlgrabber,v 1.7 2006/12/08 00:14:16 mstenner Exp $
+
+import sys
+import getopt
+import re
+
+import urlgrabber.grabber
+from urlgrabber.grabber import URLGrabber, URLGrabberOptions, URLGrabError
+
+class client_options:
+    def __init__(self):
+        self.ug_options, self.ug_defaults = self.get_ug_options()
+        self.process_command_line()
+
+    def get_ug_options(self):
+        ugo = URLGrabberOptions()
+        ug_options = ['copy_local', 'keepalive', 'prefix', 'reget',
+                      'data', 'quote', 'throttle', 'bandwidth',
+                      'proxies', 'retry', 'retrycodes',
+                      'range', 'user_agent',
+                      'http_headers', 'ftp_headers',
+                      'ssl_ca_cert', 'ssl_context',
+                      'text', 'close_connection',
+                      'cache_openers','timeout']
+        options_exclude = ['delegate', 'interrupt_callback',
+                           'failure_callback', 'urlparser', 'opener',
+                           'checkfunc', 'progress_obj']
+        for k in ugo.__dict__.keys():
+            if (k not in ug_options) and (k not in options_exclude):
+                ug_options.append(k)
+                #print k
+        ug_defaults = {}
+        for k in list(ug_options):
+            try:
+                ug_defaults[k] = repr(getattr(ugo, k))
+            except AttributeError:
+                ug_options.remove(k)
+        return ug_options, ug_defaults
+
+    def process_command_line(self):
+        short_options = 'vd:ho:OpD'
+        long_options = ['profile', 'repeat=', 'verbose=',
+                        'debug=', 'help', 'progress']
+        ug_long = [ o + '=' for o in self.ug_options ]
+        optlist, args = getopt.getopt(sys.argv[1:], short_options,
+                                      long_options + ug_long)
+        self.verbose = 0
+        self.debug = None
+        self.outputfile = None
+        self.localfile = 0
+        self.repeat = 1
+        self.progress = 0
+        self.profile = 0
+        self.ugops = {}
+        self.args = args
+
+        ug_dash = [ '--' + o for o in self.ug_options ]
+        if not args: self.help(args)
+        for (o, v) in optlist:
+            if o == '--help' or o == '-h': self.help(args)
+            if o == '--verbose': self.verbose = int(v)
+            if o == '-v': self.verbose += 1
+            if o == '-o': self.outputfile = v
+            if o == '-p' or o == '--progress': self.progress = 1
+            if o == '-d' or o == '--debug': self.debug = v
+            if o == '--profile': self.profile = 1
+            if o == '-O': self.localfile = 1
+            if o == '--repeat':
+                try:
+                    self.repeat = int(v)
+                    if self.repeat < 1: raise ValueError()
+                except ValueError:
+                    print 'ERROR: repeat value must be an int >= 1'
+                    sys.exit(1)
+            if o == '-D':
+                self.verbose = 3
+                self.debug = "INFO,-"
+                self.progress = 1
+            if o in ug_dash:
+                try:
+                    val = eval(v)
+                except Exception, e:
+                    print "error processing option value: %s" % v
+                    print e
+                    sys.exit(1)
+                else:
+                    self.ugops[o[2:]] = val
+
+        if len(self.args) > 1 and self.outputfile is not None:
+            print "ERROR: cannot use -o when grabbing multiple files"
+            sys.exit(1)
+
+    def help(self, args):
+        if not args:
+            print MAINHELP
+        else:
+            for a in args:
+                m = getattr(self, 'help_'+a, None)
+                if m is not None:
+                    m()
+                elif a in self.ug_options:
+                    self.help_ug_option(a)
+                else:
+                    print 'ERROR: no help on command "%s"' % a
+        sys.exit(0)
+
+    def help_doc(self):
+        print __doc__
+
+    def help_options(self):
+        width = max(map(len, self.ug_options))
+        format  = '  %-' + str(width) + 's = %s'
+        hformat = '  %-' + str(width) + 's   %s'
+        print hformat % ('OPTION', 'DEFAULT')
+        print '-'*(width + 20)
+        for k in self.ug_options:
+            print format % (k, self.ug_defaults[k])
+
+    def help_all(self):
+        for k in self.ug_options:
+            self.help_ug_option(k)
+
+    def help_ug_option(self, option):
+        help = ''
+        m = re.search(r'^( '+option+'.*?)\s*^ {,2}\S',
+
urlgrabber.grabber.__doc__, re.M|re.S) + if m: + print m.group(1) + else: + print ' %s: no help found for this option' % option + print '' + +class ugclient: + def __init__(self): + op = client_options() + self.op = op + if op.verbose >= 2 and op.ugops: + print "Module Options:" + width = max(map(len, op.ugops.keys())) + format = " %-" + str(width) + "s = %s" + for k, v in op.ugops.items(): + print format % (k, repr(v)) + + if op.debug: + self.set_debug_logger(op.debug) + if hasattr(urlgrabber.grabber, '_log_package_state'): + urlgrabber.grabber._log_package_state() + + kwargs = dict(op.ugops) + if op.progress: + from urlgrabber.progress import text_progress_meter + kwargs['progress_obj'] = text_progress_meter() + + self.g = URLGrabber(**kwargs) + + def run(self): + for url in self.op.args: + if self.op.verbose: print 'grabbing: %s' % url + try: + for i in range(0, self.op.repeat): + f = self.g.urlgrab(url, self.op.outputfile) + if self.op.localfile: print f + except URLGrabError, e: + print e + + def set_debug_logger(self, dbspec): + try: + dbinfo = dbspec.split(',') + import logging + level = logging._levelNames.get(dbinfo[0], None) + if level is None: level = int(dbinfo[0]) + if level < 1: raise ValueError() + + formatter = logging.Formatter('%(asctime)s %(message)s') + if len(dbinfo) > 1: filename = dbinfo[1] + else: filename = '' + if filename == '': handler = logging.StreamHandler(sys.stderr) + elif filename == '-': handler = logging.StreamHandler(sys.stdout) + else: handler = logging.FileHandler(filename) + handler.setFormatter(formatter) + DBOBJ = logging.getLogger('urlgrabber') + DBOBJ.addHandler(handler) + DBOBJ.setLevel(level) + except (KeyError, ImportError, ValueError): + DBOBJ = None + urlgrabber.grabber.set_logger(DBOBJ) + +if __name__ == '__main__': + ugc = ugclient() + if ugc.op.profile: + import profile + import pstats + prof = profile.Profile() + prof.run('ugc.run()') + pstats.Stats(prof).strip_dirs().sort_stats('cumulative').print_stats() + else: + ugc.run() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..d0b87b8 --- /dev/null +++ b/setup.py @@ -0,0 +1,45 @@ +# urlgrabber distutils setup +import re as _re +import urlgrabber as _urlgrabber + +name = "urlgrabber" +description = "A high-level cross-protocol url-grabber" +long_description = _urlgrabber.__doc__ +license = "LGPL" +version = _urlgrabber.__version__ +_authors = _re.split(r',\s+', _urlgrabber.__author__) +author = ', '.join([_re.sub(r'\s+<.*', r'', _) for _ in _authors]) +author_email = ', '.join([_re.sub(r'(^.*<)|(>.*$)', r'', _) for _ in _authors]) +url = _urlgrabber.__url__ + +packages = ['urlgrabber'] +package_dir = {'urlgrabber':'urlgrabber'} +scripts = ['scripts/urlgrabber'] +data_files = [('share/doc/' + name + '-' + version, + ['README','LICENSE', 'TODO', 'ChangeLog'])] +options = { 'clean' : { 'all' : 1 } } +classifiers = [ + 'Development Status :: 4 - Beta', + 'Environment :: Console', + 'Environment :: Web Environment', + 'Intended Audience :: Developers', + 'Intended Audience :: System Administrators', + 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)', + 'Operating System :: POSIX', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python', + 'Topic :: Internet :: File Transfer Protocol (FTP)', + 'Topic :: Internet :: WWW/HTTP', + 'Topic :: Software Development :: Libraries :: Python Modules' + ] + +# load up distutils +if __name__ == '__main__': + config = globals().copy() + keys = config.keys() + for k in keys: + #print '%-20s 
-> %s' % (k, config[k]) + if k.startswith('_'): del config[k] + + from distutils.core import setup + setup(**config) diff --git a/test/base_test_code.py b/test/base_test_code.py new file mode 100644 index 0000000..50c6348 --- /dev/null +++ b/test/base_test_code.py @@ -0,0 +1,33 @@ +from munittest import * + +base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/' +base_ftp = 'ftp://localhost/test/' + +# set to a proftp server only. we're working around a couple of +# bugs in their implementation in byterange.py. +base_proftp = 'ftp://localhost/test/' + +reference_data = ''.join( [str(i)+'\n' for i in range(20000) ] ) +ref_http = base_http + 'reference' +ref_ftp = base_ftp + 'reference' +ref_proftp = base_proftp + 'reference' +short_reference_data = ' '.join( [str(i) for i in range(10) ] ) +short_ref_http = base_http + 'short_reference' +short_ref_ftp = base_ftp + 'short_reference' + +ref_200 = ref_http +ref_404 = base_http + 'nonexistent_file' +ref_403 = base_http + 'mirror/broken/' + +base_mirror_url = base_http + 'mirror/' +good_mirrors = ['m1', 'm2', 'm3'] +mirror_files = ['test1.txt', 'test2.txt'] +bad_mirrors = ['broken'] +bad_mirror_files = ['broken.txt'] + +proxy_proto = 'http' +proxy_host = 'localhost' +proxy_port = 8888 +proxy_user = 'proxyuser' +good_proxy_pass = 'proxypass' +bad_proxy_pass = 'badproxypass' diff --git a/test/grabberperf.py b/test/grabberperf.py new file mode 100644 index 0000000..820da2c --- /dev/null +++ b/test/grabberperf.py @@ -0,0 +1,137 @@ +#!/usr/bin/python -t + +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko + +import sys +import os +from os.path import dirname, join as joinpath +import tempfile +import time + +import urlgrabber.grabber as grabber +from urlgrabber.grabber import URLGrabber, urlgrab, urlopen, urlread +from urlgrabber.progress import text_progress_meter + +tempsrc = '/tmp/ug-test-src' +tempdst = '/tmp/ug-test-dst' + +# this isn't used but forces a proxy handler to be +# added when creating the urllib2 opener. +proxies = { 'http' : 'http://localhost' } +DEBUG=0 + +def main(): + speedtest(1024) # 1KB + speedtest(10 * 1024) # 10 KB + speedtest(100 * 1024) # 100 KB + speedtest(1000 * 1024) # 1,000 KB (almost 1MB) + #speedtest(10000 * 1024) # 10,000 KB (almost 10MB) + # remove temp files + os.unlink(tempsrc) + os.unlink(tempdst) + +def setuptemp(size): + if DEBUG: print 'writing %d KB to temporary file (%s).' 
% (size / 1024, tempsrc)
+    file = open(tempsrc, 'w', 1024)
+    chars = '0123456789'
+    for i in range(size):
+        file.write(chars[i % 10])
+    file.flush()
+    file.close()
+
+def speedtest(size):
+    setuptemp(size)
+    full_times = []
+    raw_times = []
+    none_times = []
+    throttle = 2**40 # throttle to 1 TB/s   :)
+
+    try:
+        from urlgrabber.progress import text_progress_meter
+    except ImportError, e:
+        tpm = None
+        print 'not using progress meter'
+    else:
+        tpm = text_progress_meter(fo=open('/dev/null', 'w'))
+
+    # to address concerns that the overhead from the progress meter
+    # and throttling slow things down, we do this little test.
+    #
+    # using this test, you get the FULL overhead of the progress
+    # meter and throttling, without the benefit: the meter is directed
+    # to /dev/null and the throttle bandwidth is set EXTREMELY high.
+    #
+    # note: it _is_ even slower to direct the progress meter to a real
+    # tty or file, but I'm just interested in the overhead from _this_
+    # module.
+
+    # get it nicely cached before we start comparing
+    if DEBUG: print 'pre-caching'
+    for i in range(100):
+        urlgrab(tempsrc, tempdst, copy_local=1, throttle=None, proxies=proxies)
+
+    if DEBUG: print 'running speed test.'
+    reps = 500
+    for i in range(reps):
+        if DEBUG:
+            print '\r%4i/%-4i' % (i+1, reps),
+            sys.stdout.flush()
+        t = time.time()
+        urlgrab(tempsrc, tempdst,
+                copy_local=1, progress_obj=tpm,
+                throttle=throttle, proxies=proxies)
+        full_times.append(1000 * (time.time() - t))
+
+        t = time.time()
+        urlgrab(tempsrc, tempdst,
+                copy_local=1, progress_obj=None,
+                throttle=None, proxies=proxies)
+        raw_times.append(1000 * (time.time() - t))
+
+        t = time.time()
+        in_fo = open(tempsrc)
+        out_fo = open(tempdst, 'wb')
+        while 1:
+            s = in_fo.read(1024 * 8)
+            if not s: break
+            out_fo.write(s)
+        in_fo.close()
+        out_fo.close()
+        none_times.append(1000 * (time.time() - t))
+
+    if DEBUG: print '\r'
+
+    print "%d KB Results:" % (size / 1024)
+    print_result('full', full_times)
+    print_result('raw', raw_times)
+    print_result('none', none_times)
+
+def print_result(label, result_list):
+    format = '[%4s] mean: %6.3f ms, median: %6.3f ms, ' \
+             'min: %6.3f ms, max: %6.3f ms'
+    result_list.sort()
+    mean = 0.0
+    for i in result_list: mean += i
+    mean = mean/len(result_list)
+    median = result_list[int(len(result_list)/2)]
+    print format % (label, mean, median, result_list[0], result_list[-1])
+
+if __name__ == '__main__':
+    main()
diff --git a/test/munittest.py b/test/munittest.py
new file mode 100644
index 0000000..96230b8
--- /dev/null
+++ b/test/munittest.py
@@ -0,0 +1,934 @@
+#!/usr/bin/env python
+"""
+This is a modified version of the unittest module; it was modified by
+Michael D. Stenner from Steve Purcell's version (revision 1.46, as
+distributed with python 2.3.3) in the following ways:
+
+ * the text formatting has been made much prettier by printing "nested"
+   test suites
+ * the test result "skip" has been added for skipping tests.  A test
+   can call any of the .skip(), .skipUnless(<test>), or .skipIf(<test>)
+   methods from within the test method or the setUp method.
+ * all attributes originally named with leading "__" have been changed
+   to a single "_".  This makes subclassing much easier.
+
+COMPATIBILITY
+
+  It should be possible to drop this in as a replacement for the
+  standard unittest module simply by doing:
+
+    import munittest as unittest
+
+  In fact, the reverse is ALMOST true.  Test code written for this
+  module very nearly runs perfectly with the standard unittest module.
+  Exceptions are:
+
+   * The .skip() methods will obviously not work on the standard
+     unittest.  However, they will ERROR out and the error message will
+     complain about missing .skip() attributes, so it will be obvious and
+     will have the same effect as skipping.
+
+   * the .setDescription method (or description argument) for
+     TestSuite will not work.  However, setting the .description
+     attribute on a standard TestSuite instance does no harm, so if
+     you need to set them manually (you're not satisfied with the
+     doc-string route) and you WANT to be compatible both ways, do
+     that :)
+
+DESCRIPTIONS
+
+  Names for suites in the pretty formatting are (like the test
+  functions) slurped from the doc-strings of the corresponding object,
+  or taken from the names of those objects.  This applies to both
+  TestCase-derived classes, and modules.  Also, the TestSuite class
+  description can be set manually in a number of ways (all of which
+  achieve the same result):
+
+    suite = TestSuite(test_list, 'this is the description')
+    suite.setDescription('this is the description')
+    suite.description = 'this is the description'
+
+Michael D. Stenner <mstenner@linux.duke.edu>
+2004/03/18
+v0.1
+===========================================================================
+The original doc-string for this module follows:
+===========================================================================
+Python unit testing framework, based on Erich Gamma's JUnit and Kent Beck's
+Smalltalk testing framework.
+
+This module contains the core framework classes that form the basis of
+specific test cases and suites (TestCase, TestSuite etc.), and also a
+text-based utility class for running the tests and reporting the results
+ (TextTestRunner).
+
+Simple usage:
+
+    import unittest
+
+    class IntegerArithmenticTestCase(unittest.TestCase):
+        def testAdd(self):  ## test method names begin 'test*'
+            self.assertEquals((1 + 2), 3)
+            self.assertEquals(0 + 1, 1)
+        def testMultiply(self):
+            self.assertEquals((0 * 10), 0)
+            self.assertEquals((5 * 8), 40)
+
+    if __name__ == '__main__':
+        unittest.main()
+
+Further information is available in the bundled documentation, and from
+
+  http://pyunit.sourceforge.net/
+
+Copyright (c) 1999, 2000, 2001 Steve Purcell
+This module is free software, and you may redistribute it and/or modify
+it under the same terms as Python itself, so long as this copyright message
+and disclaimer are retained in their original form.
+
+IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
+SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF
+THIS CODE, EVEN IF THE AUTHOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
+
+THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE.  THE CODE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
+AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
+SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+""" + +# $Id: munittest.py,v 1.2 2004/03/31 01:27:24 mstenner Exp $ + +import time +import sys +import traceback +import string +import os +import types + +############################################################################## +# Exported classes and functions +############################################################################## +__all__ = ['TestResult', 'TestCase', 'TestSuite', 'TextTestRunner', + 'TestLoader', 'FunctionTestCase', 'main', 'defaultTestLoader'] + +# Expose obsolete functions for backwards compatability +__all__.extend(['getTestCaseNames', 'makeSuite', 'findTestCases']) + + +############################################################################## +# Test framework core +############################################################################## + +# All classes defined herein are 'new-style' classes, allowing use of 'super()' +__metaclass__ = type + +def _strclass(cls): + return "%s.%s" % (cls.__module__, cls.__name__) + +class TestResult: + """Holder for test result information. + + Test results are automatically managed by the TestCase and TestSuite + classes, and do not need to be explicitly manipulated by writers of tests. + + Each instance holds the total number of tests run, and collections of + failures and errors that occurred among those test runs. The collections + contain tuples of (testcase, exceptioninfo), where exceptioninfo is the + formatted traceback of the error that occurred. + """ + def __init__(self): + self.failures = [] + self.errors = [] + self.skipped = [] + self.testsRun = 0 + self.shouldStop = 0 + + def startTest(self, test): + "Called when the given test is about to be run" + self.testsRun = self.testsRun + 1 + + def stopTest(self, test): + "Called when the given test has been run" + pass + + def startSuite(self, suite): + "Called when the given suite is about to be run" + pass + + def stopSuit(self, suite): + "Called when the tiven suite has been run" + pass + + def addError(self, test, err): + """Called when an error has occurred. 'err' is a tuple of values as + returned by sys.exc_info(). + """ + self.errors.append((test, self._exc_info_to_string(err))) + + def addFailure(self, test, err): + """Called when an error has occurred. 'err' is a tuple of values as + returned by sys.exc_info().""" + self.failures.append((test, self._exc_info_to_string(err))) + + def addSuccess(self, test): + "Called when a test has completed successfully" + pass + + def addSkip(self, test, err): + "Called when the test has been skipped" + self.skipped.append((test, self._exc_info_to_string(err))) + + def wasSuccessful(self): + "Tells whether or not this result was a success" + return len(self.failures) == len(self.errors) == 0 + + def stop(self): + "Indicates that the tests should be aborted" + self.shouldStop = 1 + + def _exc_info_to_string(self, err): + """Converts a sys.exc_info()-style tuple of values into a string.""" + return string.join(traceback.format_exception(*err), '') + + def __repr__(self): + return "<%s run=%i errors=%i failures=%i>" % \ + (_strclass(self.__class__), self.testsRun, len(self.errors), + len(self.failures)) + + +class TestCase: + """A class whose instances are single test cases. + + By default, the test code itself should be placed in a method named + 'runTest'. + + If the fixture may be used for many test cases, create as + many test methods as are needed. When instantiating such a TestCase + subclass, specify in the constructor arguments the name of the test method + that the instance is to execute. 
+
+    Test authors should subclass TestCase for their own tests. Construction
+    and deconstruction of the test's environment ('fixture') can be
+    implemented by overriding the 'setUp' and 'tearDown' methods respectively.
+
+    If it is necessary to override the __init__ method, the base class
+    __init__ method must always be called. It is important that subclasses
+    should not change the signature of their __init__ method, since instances
+    of the classes are instantiated automatically by parts of the framework
+    in order to be run.
+    """
+
+    # This attribute determines which exception will be raised when
+    # the instance's assertion methods fail; test methods raising this
+    # exception will be deemed to have 'failed' rather than 'errored'
+
+    failureException = AssertionError
+
+    # test methods raising the following exception will be considered
+    # skipped - this is neither pass, fail, nor error.  it should be
+    # used when some resource needed to perform the test isn't available,
+    # or when a lengthy test is deliberately skipped for time.
+
+    class skipException(Exception): pass
+
+    # whether receiving KeyboardInterrupt during setUp or the test causes
+    # the test to be interpreted as skipped.  The default is no.  It's
+    # probably best to do:
+    #   except KeyboardInterrupt: self.skip()
+    # inside the test method
+
+    interrupt_skips = 0
+
+    def __init__(self, methodName='runTest'):
+        """Create an instance of the class that will use the named test
+        method when executed. Raises a ValueError if the instance does
+        not have a method with the specified name.
+        """
+        try:
+            self._testMethodName = methodName
+            testMethod = getattr(self, methodName)
+            self._testMethodDoc = testMethod.__doc__
+        except AttributeError:
+            raise ValueError, "no such test method in %s: %s" % \
+                  (self.__class__, methodName)
+
+    def setUp(self):
+        "Hook method for setting up the test fixture before exercising it."
+        pass
+
+    def tearDown(self):
+        "Hook method for deconstructing the test fixture after testing it."
+        pass
+
+    def countTestCases(self):
+        return 1
+
+    def defaultTestResult(self):
+        return TestResult()
+
+    def shortDescription(self):
+        """Returns a one-line description of the test, or None if no
+        description has been provided.
+
+        The default implementation of this method returns the first line of
+        the specified test method's docstring.
+ """ + doc = self._testMethodDoc + return doc and string.strip(string.split(doc, "\n")[0]) or None + + def id(self): + return "%s.%s" % (_strclass(self.__class__), self._testMethodName) + + def __str__(self): + return "%s (%s)" % (self._testMethodName, _strclass(self.__class__)) + + def __repr__(self): + return "<%s testMethod=%s>" % \ + (_strclass(self.__class__), self._testMethodName) + + def run(self, result=None): + return self(result) + + def __call__(self, result=None): + if result is None: result = self.defaultTestResult() + result.startTest(self) + testMethod = getattr(self, self._testMethodName) + try: + try: + self.setUp() + except KeyboardInterrupt: + if self.interrupt_skips: + result.addSkip(self, self._exc_info()) + return + else: + raise + except self.skipException: + result.addSkip(self, self._exc_info()) + return + except: + result.addError(self, self._exc_info()) + return + + ok = 0 + try: + testMethod() + ok = 1 + except self.failureException: + result.addFailure(self, self._exc_info()) + except KeyboardInterrupt: + if self.interrupt_skips: + result.addSkip(self, self._exc_info()) + return + else: + raise + except self.skipException: + result.addSkip(self, self._exc_info()) + return + except: + result.addError(self, self._exc_info()) + + try: + self.tearDown() + except KeyboardInterrupt: + raise + except: + result.addError(self, self._exc_info()) + ok = 0 + if ok: result.addSuccess(self) + finally: + result.stopTest(self) + + def debug(self): + """Run the test without collecting errors in a TestResult""" + self.setUp() + getattr(self, self._testMethodName)() + self.tearDown() + + def _exc_info(self): + """Return a version of sys.exc_info() with the traceback frame + minimised; usually the top level of the traceback frame is not + needed. + """ + exctype, excvalue, tb = sys.exc_info() + if sys.platform[:4] == 'java': ## tracebacks look different in Jython + return (exctype, excvalue, tb) + newtb = tb.tb_next + if newtb is None: + return (exctype, excvalue, tb) + return (exctype, excvalue, newtb) + + def fail(self, msg=None): + """Fail immediately, with the given message.""" + raise self.failureException, msg + + def failIf(self, expr, msg=None): + "Fail the test if the expression is true." + if expr: raise self.failureException, msg + + def failUnless(self, expr, msg=None): + """Fail the test unless the expression is true.""" + if not expr: raise self.failureException, msg + + def failUnlessRaises(self, excClass, callableObj, *args, **kwargs): + """Fail unless an exception of class excClass is thrown + by callableObj when invoked with arguments args and keyword + arguments kwargs. If a different type of exception is + thrown, it will not be caught, and the test case will be + deemed to have suffered an error, exactly as for an + unexpected exception. + """ + try: + callableObj(*args, **kwargs) + except excClass: + return + else: + if hasattr(excClass,'__name__'): excName = excClass.__name__ + else: excName = str(excClass) + raise self.failureException, excName + + def failUnlessEqual(self, first, second, msg=None): + """Fail if the two objects are unequal as determined by the '==' + operator. + """ + if not first == second: + raise self.failureException, \ + (msg or '%s != %s' % (`first`, `second`)) + + def failIfEqual(self, first, second, msg=None): + """Fail if the two objects are equal as determined by the '==' + operator. 
+ """ + if first == second: + raise self.failureException, \ + (msg or '%s == %s' % (`first`, `second`)) + + def failUnlessAlmostEqual(self, first, second, places=7, msg=None): + """Fail if the two objects are unequal as determined by their + difference rounded to the given number of decimal places + (default 7) and comparing to zero. + + Note that decimal places (from zero) is usually not the same + as significant digits (measured from the most signficant digit). + """ + if round(second-first, places) != 0: + raise self.failureException, \ + (msg or '%s != %s within %s places' % (`first`, `second`, `places` )) + + def failIfAlmostEqual(self, first, second, places=7, msg=None): + """Fail if the two objects are equal as determined by their + difference rounded to the given number of decimal places + (default 7) and comparing to zero. + + Note that decimal places (from zero) is usually not the same + as significant digits (measured from the most signficant digit). + """ + if round(second-first, places) == 0: + raise self.failureException, \ + (msg or '%s == %s within %s places' % (`first`, `second`, `places`)) + + assertEqual = assertEquals = failUnlessEqual + + assertNotEqual = assertNotEquals = failIfEqual + + assertAlmostEqual = assertAlmostEquals = failUnlessAlmostEqual + + assertNotAlmostEqual = assertNotAlmostEquals = failIfAlmostEqual + + assertRaises = failUnlessRaises + + assert_ = failUnless + + def skip(self, msg=None): + """Skip the test""" + raise self.skipException, msg + + def skipIf(self, expr, msg=None): + "Skip the test if the expression is true." + if expr: raise self.skipException, msg + + def skipUnless(self, expr, msg=None): + """Skip the test unless the expression is true.""" + if not expr: raise self.skipException, msg + + + +class TestSuite: + """A test suite is a composite test consisting of a number of TestCases. + + For use, create an instance of TestSuite, then add test case instances. + When all tests have been added, the suite can be passed to a test + runner, such as TextTestRunner. It will run the individual test cases + in the order in which they were added, aggregating the results. When + subclassing, do not forget to call the base class constructor. + """ + def __init__(self, tests=(), description=None): + self._tests = [] + self.addTests(tests) + self.description = description or '(no description)' + + def __repr__(self): + return "<%s tests=%s>" % (_strclass(self.__class__), self._tests) + + __str__ = __repr__ + + def shortDescription(self): + return self.description + + def setDescription(self, description): + self.description = description + + def countTestCases(self): + cases = 0 + for test in self._tests: + cases = cases + test.countTestCases() + return cases + + def addTest(self, test): + self._tests.append(test) + + def addTests(self, tests): + for test in tests: + self.addTest(test) + + def run(self, result): + return self(result) + + def __call__(self, result): + try: result.startSuite(self) + except AttributeError: pass + + for test in self._tests: + if result.shouldStop: + break + test(result) + + try: result.endSuite(self) + except AttributeError: pass + + return result + + def debug(self): + """Run the tests without collecting errors in a TestResult""" + for test in self._tests: test.debug() + + +class FunctionTestCase(TestCase): + """A test case that wraps a test function. + + This is useful for slipping pre-existing test functions into the + PyUnit framework. Optionally, set-up and tidy-up functions can be + supplied. 
As with TestCase, the tidy-up ('tearDown') function will
+    always be called if the set-up ('setUp') function ran successfully.
+    """
+
+    def __init__(self, testFunc, setUp=None, tearDown=None,
+                 description=None):
+        TestCase.__init__(self)
+        self._setUpFunc = setUp
+        self._tearDownFunc = tearDown
+        self._testFunc = testFunc
+        self._description = description
+
+    def setUp(self):
+        if self._setUpFunc is not None:
+            self._setUpFunc()
+
+    def tearDown(self):
+        if self._tearDownFunc is not None:
+            self._tearDownFunc()
+
+    def runTest(self):
+        self._testFunc()
+
+    def id(self):
+        return self._testFunc.__name__
+
+    def __str__(self):
+        return "%s (%s)" % (_strclass(self.__class__), self._testFunc.__name__)
+
+    def __repr__(self):
+        return "<%s testFunc=%s>" % (_strclass(self.__class__), self._testFunc)
+
+    def shortDescription(self):
+        if self._description is not None: return self._description
+        doc = self._testFunc.__doc__
+        return doc and string.strip(string.split(doc, "\n")[0]) or None
+
+
+
+##############################################################################
+# Locating and loading tests
+##############################################################################
+
+class TestLoader:
+    """This class is responsible for loading tests according to various
+    criteria and returning them wrapped in a Test
+    """
+    testMethodPrefix = 'test'
+    sortTestMethodsUsing = cmp
+    suiteClass = TestSuite
+
+    def loadTestsFromTestCase(self, testCaseClass):
+        """Return a suite of all test cases contained in testCaseClass"""
+        name_list = self.getTestCaseNames(testCaseClass)
+        instance_list = map(testCaseClass, name_list)
+        description = getattr(testCaseClass, '__doc__') \
+                      or testCaseClass.__name__
+        description = (description.splitlines()[0]).strip()
+        suite = self.suiteClass(instance_list, description)
+        return suite
+
+    def loadTestsFromModule(self, module):
+        """Return a suite of all test cases contained in the given module"""
+        tests = []
+        for name in dir(module):
+            obj = getattr(module, name)
+            if (isinstance(obj, (type, types.ClassType)) and
+                issubclass(obj, TestCase) and
+                not obj in [TestCase, FunctionTestCase]):
+                tests.append(self.loadTestsFromTestCase(obj))
+        description = getattr(module, '__doc__') \
+                      or module.__name__
+        description = (description.splitlines()[0]).strip()
+        return self.suiteClass(tests, description)
+
+    def loadTestsFromName(self, name, module=None):
+        """Return a suite of all test cases given a string specifier.
+
+        The name may resolve either to a module, a test case class, a
+        test method within a test case class, or a callable object which
+        returns a TestCase or TestSuite instance.
+
+        The method optionally resolves the names relative to a given module.
+ """ + parts = string.split(name, '.') + if module is None: + if not parts: + raise ValueError, "incomplete test name: %s" % name + else: + parts_copy = parts[:] + while parts_copy: + try: + module = __import__(string.join(parts_copy,'.')) + break + except ImportError: + del parts_copy[-1] + if not parts_copy: raise + parts = parts[1:] + obj = module + for part in parts: + obj = getattr(obj, part) + + import unittest + if type(obj) == types.ModuleType: + return self.loadTestsFromModule(obj) + elif (isinstance(obj, (type, types.ClassType)) and + issubclass(obj, unittest.TestCase)): + return self.loadTestsFromTestCase(obj) + elif type(obj) == types.UnboundMethodType: + return obj.im_class(obj.__name__) + elif callable(obj): + test = obj() + if not isinstance(test, unittest.TestCase) and \ + not isinstance(test, unittest.TestSuite): + raise ValueError, \ + "calling %s returned %s, not a test" % (obj,test) + return test + else: + raise ValueError, "don't know how to make test from: %s" % obj + + def loadTestsFromNames(self, names, module=None): + """Return a suite of all tests cases found using the given sequence + of string specifiers. See 'loadTestsFromName()'. + """ + suites = [] + for name in names: + suites.append(self.loadTestsFromName(name, module)) + return self.suiteClass(suites) + + def getTestCaseNames(self, testCaseClass): + """Return a sorted sequence of method names found within testCaseClass + """ + testFnNames = filter(lambda n,p=self.testMethodPrefix: n[:len(p)] == p, + dir(testCaseClass)) + for baseclass in testCaseClass.__bases__: + for testFnName in self.getTestCaseNames(baseclass): + if testFnName not in testFnNames: # handle overridden methods + testFnNames.append(testFnName) + if self.sortTestMethodsUsing: + testFnNames.sort(self.sortTestMethodsUsing) + return testFnNames + + + +defaultTestLoader = TestLoader() + + +############################################################################## +# Patches for old functions: these functions should be considered obsolete +############################################################################## + +def _makeLoader(prefix, sortUsing, suiteClass=None): + loader = TestLoader() + loader.sortTestMethodsUsing = sortUsing + loader.testMethodPrefix = prefix + if suiteClass: loader.suiteClass = suiteClass + return loader + +def getTestCaseNames(testCaseClass, prefix, sortUsing=cmp): + return _makeLoader(prefix, sortUsing).getTestCaseNames(testCaseClass) + +def makeSuite(testCaseClass, prefix='test', sortUsing=cmp, suiteClass=TestSuite): + return _makeLoader(prefix, sortUsing, suiteClass).loadTestsFromTestCase(testCaseClass) + +def findTestCases(module, prefix='test', sortUsing=cmp, suiteClass=TestSuite): + return _makeLoader(prefix, sortUsing, suiteClass).loadTestsFromModule(module) + + +############################################################################## +# Text UI +############################################################################## + +class _WritelnDecorator: + """Used to decorate file-like objects with a handy 'writeln' method""" + def __init__(self,stream): + self.stream = stream + + def __getattr__(self, attr): + return getattr(self.stream,attr) + + def write(self, arg): + self.stream.write(arg) + self.stream.flush() + + def writeln(self, arg=None): + if arg: self.write(arg) + self.write('\n') # text-mode streams translate to \r\n if needed + + +class _TextTestResult(TestResult): + """A test result class that can print formatted text results to a stream. + + Used by TextTestRunner. 
+ """ + separator1 = '=' * 79 + separator2 = '-' * 79 + + def __init__(self, stream, descriptions, verbosity): + TestResult.__init__(self) + self.stream = stream + self.showAll = verbosity > 1 + self.dots = verbosity == 1 + self.descriptions = descriptions + if descriptions: self.indent = ' ' + else: self.indent = '' + self.depth = 0 + self.width = 80 + + def getDescription(self, test): + if self.descriptions: + return test.shortDescription() or str(test) + else: + return str(test) + + def startSuite(self, suite): + if self.showAll and self.descriptions: + self.stream.write(self.indent * self.depth) + try: desc = self.getDescription(suite) + except AttributeError: desc = '(no description)' + self.stream.writeln(desc) + self.depth += 1 + + def startTest(self, test): + TestResult.startTest(self, test) + if self.showAll: + self.stream.write(self.indent * self.depth) + d = self.getDescription(test) + dwidth = self.width - len(self.indent) * self.depth - 11 + format = "%%-%is" % dwidth + self.stream.write(format % d) + self.stream.write(" ... ") + + def addSuccess(self, test): + TestResult.addSuccess(self, test) + if self.showAll: + self.stream.writeln("ok") + elif self.dots: + self.stream.write('.') + + def addError(self, test, err): + TestResult.addError(self, test, err) + if self.showAll: + self.stream.writeln("ERROR") + elif self.dots: + self.stream.write('E') + + def addFailure(self, test, err): + TestResult.addFailure(self, test, err) + if self.showAll: + self.stream.writeln("FAIL") + elif self.dots: + self.stream.write('F') + + def addSkip(self, test, err): + TestResult.addSkip(self, test, err) + if self.showAll: + self.stream.writeln("skip") + elif self.dots: + self.stream.write('s') + + def endSuite(self, suite): + self.depth -= 1 + + def printErrors(self): + if self.dots or self.showAll: + self.stream.writeln() + self.printErrorList('ERROR', self.errors) + self.printErrorList('FAIL', self.failures) + + def printErrorList(self, flavour, errors): + for test, err in errors: + self.stream.writeln(self.separator1) + self.stream.writeln("%s: %s" % (flavour,self.getDescription(test))) + self.stream.writeln(self.separator2) + self.stream.writeln("%s" % err) + + +class TextTestRunner: + """A test runner class that displays results in textual form. + + It prints out the names of tests as they are run, errors as they + occur, and a summary of the results at the end of the test run. + """ + def __init__(self, stream=sys.stderr, descriptions=1, verbosity=1): + self.stream = _WritelnDecorator(stream) + self.descriptions = descriptions + self.verbosity = verbosity + + def _makeResult(self): + return _TextTestResult(self.stream, self.descriptions, self.verbosity) + + def run(self, test): + "Run the given test case or test suite." 
+ result = self._makeResult() + startTime = time.time() + test(result) + stopTime = time.time() + timeTaken = float(stopTime - startTime) + result.printErrors() + self.stream.writeln(result.separator2) + run = result.testsRun + self.stream.writeln("Ran %d test%s in %.3fs" % + (run, run != 1 and "s" or "", timeTaken)) + self.stream.writeln() + if not result.wasSuccessful(): + self.stream.write("FAILED (") + failed, errored, skipped = map(len, \ + (result.failures, result.errors, result.skipped)) + if failed: + self.stream.write("failures=%d" % failed) + if errored: + if failed: self.stream.write(", ") + self.stream.write("errors=%d" % errored) + if skipped: + self.stream.write(", skipped=%d" % skipped) + self.stream.writeln(")") + else: + if result.skipped: + self.stream.writeln("OK (skipped=%d)" % len(result.skipped)) + else: + self.stream.writeln("OK") + return result + + + +############################################################################## +# Facilities for running tests from the command line +############################################################################## + +class TestProgram: + """A command-line program that runs a set of tests; this is primarily + for making test modules conveniently executable. + """ + USAGE = """\ +Usage: %(progName)s [options] [test] [...] + +Options: + -h, --help Show this message + -v, --verbose Verbose output + -q, --quiet Minimal output + +Examples: + %(progName)s - run default set of tests + %(progName)s MyTestSuite - run suite 'MyTestSuite' + %(progName)s MyTestCase.testSomething - run MyTestCase.testSomething + %(progName)s MyTestCase - run all 'test*' test methods + in MyTestCase +""" + def __init__(self, module='__main__', defaultTest=None, + argv=None, testRunner=None, testLoader=defaultTestLoader): + if type(module) == type(''): + self.module = __import__(module) + for part in string.split(module,'.')[1:]: + self.module = getattr(self.module, part) + else: + self.module = module + if argv is None: + argv = sys.argv + self.verbosity = 1 + self.defaultTest = defaultTest + self.testRunner = testRunner + self.testLoader = testLoader + self.progName = os.path.basename(argv[0]) + self.parseArgs(argv) + self.runTests() + + def usageExit(self, msg=None): + if msg: print msg + print self.USAGE % self.__dict__ + sys.exit(2) + + def parseArgs(self, argv): + import getopt + try: + options, args = getopt.getopt(argv[1:], 'hHvq', + ['help','verbose','quiet']) + for opt, value in options: + if opt in ('-h','-H','--help'): + self.usageExit() + if opt in ('-q','--quiet'): + self.verbosity = 0 + if opt in ('-v','--verbose'): + self.verbosity = 2 + if len(args) == 0 and self.defaultTest is None: + self.test = self.testLoader.loadTestsFromModule(self.module) + return + if len(args) > 0: + self.testNames = args + else: + self.testNames = (self.defaultTest,) + self.createTests() + except getopt.error, msg: + self.usageExit(msg) + + def createTests(self): + self.test = self.testLoader.loadTestsFromNames(self.testNames, + self.module) + + def runTests(self): + if self.testRunner is None: + self.testRunner = TextTestRunner(verbosity=self.verbosity) + result = self.testRunner.run(self.test) + sys.exit(not result.wasSuccessful()) + +main = TestProgram + + +############################################################################## +# Executing this module from the command line +############################################################################## + +if __name__ == "__main__": + main(module=None) diff --git a/test/runtests.py 
b/test/runtests.py new file mode 100644 index 0000000..c48bd1d --- /dev/null +++ b/test/runtests.py @@ -0,0 +1,60 @@ +#!/usr/bin/python + +"""Usage: python runtests.py [OPTIONS] +Quick script to run all unit tests from source directory +(e.g. without having to install.) + +OPTIONS: + + -d, --descriptions=NUM Set to 0 to turn off printing + test doc strings as descriptions. + -v, --verbosity=NUM Output verbosity level. Defaults to + 2 which is one line of info per test. Set + to 1 to get one char of info per test + or 0 to disable status output completely. +""" + +# $Id: runtests.py,v 1.7 2004/03/31 17:02:00 mstenner Exp $ + +import sys +from os.path import dirname, join as joinpath +from getopt import getopt +from base_test_code import * + +def main(): + # setup sys.path so that we can run this from the source + # directory. + (descriptions, verbosity) = parse_args() + dn = dirname(sys.argv[0]) + sys.path.insert(0, joinpath(dn,'..')) + sys.path.insert(0, dn) + # it's okay to import now that sys.path is setup. + import test_grabber, test_byterange, test_mirror + suite = TestSuite( (test_grabber.suite(), + test_byterange.suite(), + test_mirror.suite()) ) + suite.description = 'urlgrabber tests' + runner = TextTestRunner(stream=sys.stdout, + descriptions=descriptions, + verbosity=verbosity) + runner.run(suite) + +def parse_args(): + descriptions = 1 + verbosity = 2 + opts, args = getopt(sys.argv[1:],'hd:v:',['descriptions=','help','verbosity=']) + for o,a in opts: + if o in ('-h', '--help'): + usage() + sys.exit(0) + elif o in ('-d', '--descriptions'): + descriptions = int(a) + elif o in ('-v', '--verbosity'): + verbosity = int(a) + return (descriptions,verbosity) + +def usage(): + print __doc__ + +if __name__ == '__main__': + main() diff --git a/test/test_byterange.py b/test/test_byterange.py new file mode 100644 index 0000000..96f1573 --- /dev/null +++ b/test/test_byterange.py @@ -0,0 +1,162 @@ +#!/usr/bin/python -t + +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2004 Michael D. 
Stenner, Ryan Tomayko
+
+"""byterange.py tests"""
+
+# $Id: test_byterange.py,v 1.6 2004/03/31 17:02:00 mstenner Exp $
+
+import sys
+
+from StringIO import StringIO
+from urlgrabber.byterange import RangeableFileObject
+
+from base_test_code import *
+
+class RangeableFileObjectTestCase(TestCase):
+    """Test range.RangeableFileObject class"""
+
+    def setUp(self):
+        #         0         1         2         3         4         5          6         7         8         9
+        #         0123456789012345678901234567890123456789012345678901234567 890123456789012345678901234567890
+        self.test = 'Why cannot we write the entire 24 volumes of Encyclopaedia\nBrittanica on the head of a pin?\n'
+        self.fo = StringIO(self.test)
+        self.rfo = RangeableFileObject(self.fo, (20,69))
+
+    def tearDown(self):
+        pass
+
+    def test_seek(self):
+        """RangeableFileObject.seek()"""
+        self.rfo.seek(11)
+        self.assertEquals('24', self.rfo.read(2))
+        self.rfo.seek(14)
+        self.assertEquals('volumes', self.rfo.read(7))
+        self.rfo.seek(1,1)
+        self.assertEquals('of', self.rfo.read(2))
+
+    def test_poor_mans_seek(self):
+        """RangeableFileObject.seek() poor man's version.
+
+        We just delete the seek method from StringIO so we can
+        exercise RangeableFileObject when the file object supplied
+        doesn't support seek.
+        """
+        seek = StringIO.seek
+        del(StringIO.seek)
+        self.test_seek()
+        StringIO.seek = seek
+
+    def test_read(self):
+        """RangeableFileObject.read()"""
+        self.assertEquals('the', self.rfo.read(3))
+        self.assertEquals(' entire 24 volumes of ', self.rfo.read(22))
+        self.assertEquals('Encyclopaedia\nBrittanica', self.rfo.read(50))
+        self.assertEquals('', self.rfo.read())
+
+    def test_readall(self):
+        """RangeableFileObject.read(): to end of file."""
+        rfo = RangeableFileObject(StringIO(self.test),(11,))
+        self.assertEquals(self.test[11:],rfo.read())
+
+    def test_readline(self):
+        """RangeableFileObject.readline()"""
+        self.assertEquals('the entire 24 volumes of Encyclopaedia\n', self.rfo.readline())
+        self.assertEquals('Brittanica', self.rfo.readline())
+        self.assertEquals('', self.rfo.readline())
+
+    def test_tell(self):
+        """RangeableFileObject.tell()"""
+        self.assertEquals(0,self.rfo.tell())
+        self.rfo.read(5)
+        self.assertEquals(5,self.rfo.tell())
+        self.rfo.readline()
+        self.assertEquals(39,self.rfo.tell())
+
+class RangeModuleTestCase(TestCase):
+    """Test module level functions defined in range.py"""
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        pass
+
+    def test_range_tuple_normalize(self):
+        """byterange.range_tuple_normalize()"""
+        from urlgrabber.byterange import range_tuple_normalize
+        from urlgrabber.byterange import RangeError
+        tests = (
+            ((None,50), (0,50)),
+            ((500,600), (500,600)),
+            ((500,), (500,'')),
+            ((500,None), (500,'')),
+            (('',''), None),
+            ((0,), None),
+            (None, None)
+            )
+        for test, ex in tests:
+            self.assertEquals( range_tuple_normalize(test), ex )
+
+        try: range_tuple_normalize( (10,8) )
+        except RangeError: pass
+        else: self.fail("range_tuple_normalize( (10,8) ) should have raised RangeError")
+
+    def test_range_header_to_tuple(self):
+        """byterange.range_header_to_tuple()"""
+        from urlgrabber.byterange import range_header_to_tuple
+        tests = (
+            ('bytes=500-600', (500,601)),
+            ('bytes=500-', (500,'')),
+            ('bla bla', ()),
+            (None, None)
+            )
+        for test, ex in tests:
+            self.assertEquals( range_header_to_tuple(test), ex )
+
+    def test_range_tuple_to_header(self):
+        """byterange.range_tuple_to_header()"""
+        from urlgrabber.byterange import range_tuple_to_header
+        tests = (
+            ((500,600), 'bytes=500-599'),
+            ((500,''), 'bytes=500-'),
+            ((500,), 'bytes=500-'),
+            ((None,500),
'bytes=0-499'),
+            (('',500), 'bytes=0-499'),
+            (None, None),
+            )
+        for test, ex in tests:
+            self.assertEquals( range_tuple_to_header(test), ex )
+
+        try: range_tuple_to_header( ('not an int',500) )
+        except ValueError: pass
+        else: self.fail("range_tuple_to_header( ('not an int',500) ) should have raised ValueError")
+
+        try: range_tuple_to_header( (0,'not an int') )
+        except ValueError: pass
+        else: self.fail("range_tuple_to_header( (0, 'not an int') ) should have raised ValueError")
+
+def suite():
+    tl = TestLoader()
+    return tl.loadTestsFromModule(sys.modules[__name__])
+
+if __name__ == '__main__':
+    runner = TextTestRunner(stream=sys.stdout,descriptions=1,verbosity=2)
+    runner.run(suite())
+
diff --git a/test/test_grabber.py b/test/test_grabber.py
new file mode 100644
index 0000000..eecdbcf
--- /dev/null
+++ b/test/test_grabber.py
@@ -0,0 +1,607 @@
+#!/usr/bin/python -t
+
+#   This library is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU Lesser General Public
+#   License as published by the Free Software Foundation; either
+#   version 2.1 of the License, or (at your option) any later version.
+#
+#   This library is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#   Lesser General Public License for more details.
+#
+#   You should have received a copy of the GNU Lesser General Public
+#   License along with this library; if not, write to the
+#      Free Software Foundation, Inc.,
+#      59 Temple Place, Suite 330,
+#      Boston, MA  02111-1307  USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+
+"""grabber.py tests"""
+
+# $Id: test_grabber.py,v 1.31 2006/12/08 00:14:16 mstenner Exp $
+
+import sys
+import os
+import string, tempfile, random, cStringIO, os
+import urllib2
+import socket
+
+from base_test_code import *
+
+import urlgrabber
+import urlgrabber.grabber as grabber
+from urlgrabber.grabber import URLGrabber, URLGrabError, CallbackObject, \
+     URLParser
+from urlgrabber.progress import text_progress_meter
+
+class FileObjectTests(TestCase):
+
+    def setUp(self):
+        self.filename = tempfile.mktemp()
+        fo = file(self.filename, 'wb')
+        fo.write(reference_data)
+        fo.close()
+
+        self.fo_input = cStringIO.StringIO(reference_data)
+        self.fo_output = cStringIO.StringIO()
+        (url, parts) = grabber.default_grabber.opts.urlparser.parse(
+            self.filename, grabber.default_grabber.opts)
+        self.wrapper = grabber.PyCurlFileObject(
+            url, self.fo_output, grabber.default_grabber.opts)
+
+    def tearDown(self):
+        self.wrapper.close()
+        os.unlink(self.filename)
+
+    def test_readall(self):
+        "PyCurlFileObject .read() method"
+        s = self.wrapper.read()
+        self.fo_output.write(s)
+        self.assert_(reference_data == self.fo_output.getvalue())
+
+    def test_readline(self):
+        "PyCurlFileObject .readline() method"
+        while 1:
+            s = self.wrapper.readline()
+            self.fo_output.write(s)
+            if not s: break
+        self.assert_(reference_data == self.fo_output.getvalue())
+
+    def test_readlines(self):
+        "PyCurlFileObject .readlines() method"
+        li = self.wrapper.readlines()
+        self.fo_output.write(string.join(li, ''))
+        self.assert_(reference_data == self.fo_output.getvalue())
+
+    def test_smallread(self):
+        "PyCurlFileObject .read(N) with small N"
+        while 1:
+            s = self.wrapper.read(23)
+            self.fo_output.write(s)
+            if not s: break
+        self.assert_(reference_data == self.fo_output.getvalue())
+
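
The test cases below exercise urlgrabber's module-level convenience API; for
orientation, a minimal sketch of that usage follows (example.org is a
placeholder URL, not one of the test URLs).

    import urlgrabber

    # grab to a local file; returns the filename actually written
    local = urlgrabber.urlgrab('http://example.org/', filename='index.html')

    # read an entire document into a string
    text = urlgrabber.urlread('http://example.org/')

    # or get a file-like object and read incrementally
    fo = urlgrabber.urlopen('http://example.org/')
    head = fo.read(1024)
    fo.close()
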
+class HTTPTests(TestCase):
+    def test_reference_file(self):
+        "download reference file via HTTP"
+        filename = tempfile.mktemp()
+        grabber.urlgrab(ref_http, filename)
+
+        fo = file(filename, 'rb')
+        contents = fo.read()
+        fo.close()
+
+        self.assert_(contents == reference_data)
+
+    def test_post(self):
+        "do an HTTP post"
+        headers = (('Content-type', 'text/plain'),)
+        ret = grabber.urlread(base_http + 'test_post.php',
+                              data=short_reference_data,
+                              http_headers=headers)
+
+        self.assertEqual(ret, short_reference_data)
+
+class URLGrabberModuleTestCase(TestCase):
+    """Test module level functions defined in grabber.py"""
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        pass
+
+    def test_urlopen(self):
+        "module-level urlopen() function"
+        fo = urlgrabber.urlopen('http://www.python.org')
+        fo.close()
+
+    def test_urlgrab(self):
+        "module-level urlgrab() function"
+        outfile = tempfile.mktemp()
+        filename = urlgrabber.urlgrab('http://www.python.org',
+                                      filename=outfile)
+        os.unlink(outfile)
+
+    def test_urlread(self):
+        "module-level urlread() function"
+        s = urlgrabber.urlread('http://www.python.org')
+
+
+class URLGrabberTestCase(TestCase):
+    """Test grabber.URLGrabber class"""
+
+    def setUp(self):
+
+        self.meter = text_progress_meter( fo=cStringIO.StringIO() )
+        pass
+
+    def tearDown(self):
+        pass
+
+    def testKeywordArgs(self):
+        """grabber.URLGrabber.__init__() **kwargs handling.
+
+        This is a simple test that just passes some arbitrary
+        values into the URLGrabber constructor and checks that
+        they've been set properly.
+        """
+        opener = urllib2.OpenerDirector()
+        g = URLGrabber( progress_obj=self.meter,
+                        throttle=0.9,
+                        bandwidth=20,
+                        retry=20,
+                        retrycodes=[5,6,7],
+                        copy_local=1,
+                        close_connection=1,
+                        user_agent='test ua/1.0',
+                        proxies={'http' : 'http://www.proxy.com:9090'},
+                        opener=opener )
+        opts = g.opts
+        self.assertEquals( opts.progress_obj, self.meter )
+        self.assertEquals( opts.throttle, 0.9 )
+        self.assertEquals( opts.bandwidth, 20 )
+        self.assertEquals( opts.retry, 20 )
+        self.assertEquals( opts.retrycodes, [5,6,7] )
+        self.assertEquals( opts.copy_local, 1 )
+        self.assertEquals( opts.close_connection, 1 )
+        self.assertEquals( opts.user_agent, 'test ua/1.0' )
+        self.assertEquals( opts.proxies, {'http' : 'http://www.proxy.com:9090'} )
+        self.assertEquals( opts.opener, opener )
+
+        nopts = grabber.URLGrabberOptions(delegate=opts, throttle=0.5,
+                                          copy_local=0)
+        self.assertEquals( nopts.progress_obj, self.meter )
+        self.assertEquals( nopts.throttle, 0.5 )
+        self.assertEquals( nopts.bandwidth, 20 )
+        self.assertEquals( nopts.retry, 20 )
+        self.assertEquals( nopts.retrycodes, [5,6,7] )
+        self.assertEquals( nopts.copy_local, 0 )
+        self.assertEquals( nopts.close_connection, 1 )
+        self.assertEquals( nopts.user_agent, 'test ua/1.0' )
+        self.assertEquals( nopts.proxies, {'http' : 'http://www.proxy.com:9090'} )
+        nopts.opener = None
+        self.assertEquals( nopts.opener, None )
+
+    def test_make_callback(self):
+        """grabber.URLGrabber._make_callback() tests"""
+        def cb(e): pass
+        tup_cb = (cb, ('stuff'), {'some': 'dict'})
+        g = URLGrabber()
+        self.assertEquals(g._make_callback(cb), (cb, (), {}))
+        self.assertEquals(g._make_callback(tup_cb), tup_cb)
+
+class URLParserTestCase(TestCase):
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        pass
+
+    def test_parse_url_with_prefix(self):
+        """grabber.URLParser.parse() with opts.prefix"""
+        base = 'http://foo.com/dir'
+        bases = [base, base+'/']
+        filename = 'bar/baz'
+        target = base + '/' + filename
+
+        for b in bases:
+            g =
URLGrabber(prefix=b) + (url, parts) = g.opts.urlparser.parse(filename, g.opts) + self.assertEquals(url, target) + + def _test_url(self, urllist): + g = URLGrabber() + try: quote = urllist[3] + except IndexError: quote = None + g.opts.quote = quote + (url, parts) = g.opts.urlparser.parse(urllist[0], g.opts) + + if 1: + self.assertEquals(url, urllist[1]) + self.assertEquals(parts, urllist[2]) + else: + if url == urllist[1] and parts == urllist[2]: + print 'OK: %s' % urllist[0] + else: + print 'ERROR: %s' % urllist[0] + print ' ' + urllist[1] + print ' ' + url + print ' ' + urllist[2] + print ' ' + parts + + + url_tests_all = ( + ['http://host.com/path/basename.ext?arg1=val1&arg2=val2#hash', + 'http://host.com/path/basename.ext?arg1=val1&arg2=val2#hash', + ('http', 'host.com', '/path/basename.ext', '', + 'arg1=val1&arg2=val2', 'hash')], + ['http://host.com/Path With Spaces/', + 'http://host.com/Path%20With%20Spaces/', + ('http', 'host.com', '/Path%20With%20Spaces/', '', '', '')], + ['http://host.com/Already%20Quoted', + 'http://host.com/Already%20Quoted', + ('http', 'host.com', '/Already%20Quoted', '', '', '')], + ['http://host.com/Should Be Quoted', + 'http://host.com/Should Be Quoted', + ('http', 'host.com', '/Should Be Quoted', '', '', ''), 0], + ['http://host.com/Should%20Not', + 'http://host.com/Should%2520Not', + ('http', 'host.com', '/Should%2520Not', '', '', ''), 1], + ) + + url_tests_posix = ( + ['/etc/passwd', + 'file:///etc/passwd', + ('file', '', '/etc/passwd', '', '', '')], + ) + + url_tests_nt = ( + [r'\\foo.com\path\file.ext', + 'file://foo.com/path/file.ext', + ('file', '', '//foo.com/path/file.ext', '', '', '')], + [r'C:\path\file.ext', + 'file:///C|/path/file.ext', + ('file', '', '/C|/path/file.ext', '', '', '')], + ) + + def test_url_parser_all_os(self): + """test url parsing common to all OSs""" + for f in self.url_tests_all: + self._test_url(f) + + def test_url_parser_posix(self): + """test url parsing on posix systems""" + if not os.name == 'posix': + self.skip() + for f in self.url_tests_posix: + self._test_url(f) + + def test_url_parser_nt(self): + """test url parsing on windows systems""" + if not os.name == 'nt': + self.skip() + for f in self.url_tests_nt: + self._test_url(f) + + +class FailureTestCase(TestCase): + """Test failure behavior""" + + def _failure_callback(self, obj, *args, **kwargs): + self.failure_callback_called = 1 + self.obj = obj + self.args = args + self.kwargs = kwargs + + def test_failure_callback_called(self): + "failure callback is called on retry" + self.failure_callback_called = 0 + g = grabber.URLGrabber(retry=2, retrycodes=[14], + failure_callback=self._failure_callback) + try: g.urlgrab(ref_404) + except URLGrabError: pass + self.assertEquals(self.failure_callback_called, 1) + + def test_failure_callback_args(self): + "failure callback is called with the proper args" + fc = (self._failure_callback, ('foo',), {'bar': 'baz'}) + g = grabber.URLGrabber(retry=2, retrycodes=[14], + failure_callback=fc) + try: g.urlgrab(ref_404) + except URLGrabError: pass + self.assert_(hasattr(self, 'obj')) + self.assert_(hasattr(self, 'args')) + self.assert_(hasattr(self, 'kwargs')) + self.assertEquals(self.args, ('foo',)) + self.assertEquals(self.kwargs, {'bar': 'baz'}) + self.assert_(isinstance(self.obj, CallbackObject)) + self.assertEquals(self.obj.url, ref_404) + self.assert_(isinstance(self.obj.exception, URLGrabError)) + del self.obj + +class InterruptTestCase(TestCase): + """Test interrupt callback behavior""" + + class InterruptProgress: + def 
start(self, *args, **kwargs): pass + def update(self, *args, **kwargs): raise KeyboardInterrupt + def end(self, *args, **kwargs): pass + + class TestException(Exception): pass + + def _interrupt_callback(self, obj, *args, **kwargs): + self.interrupt_callback_called = 1 + self.obj = obj + self.args = args + self.kwargs = kwargs + if kwargs.get('exception', None): + raise kwargs['exception'] + + def test_interrupt_callback_called(self): + "interrupt callback is called on retry" + self.interrupt_callback_called = 0 + ic = (self._interrupt_callback, (), {}) + g = grabber.URLGrabber(progress_obj=self.InterruptProgress(), + interrupt_callback=ic) + try: g.urlgrab(ref_http) + except KeyboardInterrupt: pass + self.assertEquals(self.interrupt_callback_called, 1) + + def test_interrupt_callback_raises(self): + "interrupt callback raises an exception" + ic = (self._interrupt_callback, (), + {'exception': self.TestException()}) + g = grabber.URLGrabber(progress_obj=self.InterruptProgress(), + interrupt_callback=ic) + self.assertRaises(self.TestException, g.urlgrab, ref_http) + +class CheckfuncTestCase(TestCase): + """Test checkfunc behavior""" + + def setUp(self): + cf = (self._checkfunc, ('foo',), {'bar': 'baz'}) + self.g = grabber.URLGrabber(checkfunc=cf) + self.filename = tempfile.mktemp() + self.data = short_reference_data + + def tearDown(self): + try: os.unlink(self.filename) + except: pass + if hasattr(self, 'obj'): del self.obj + + def _checkfunc(self, obj, *args, **kwargs): + self.obj = obj + self.args = args + self.kwargs = kwargs + + if hasattr(obj, 'filename'): + # we used urlgrab + fo = file(obj.filename) + data = fo.read() + fo.close() + else: + # we used urlread + data = obj.data + + if data == self.data: return + else: raise URLGrabError(-2, "data doesn't match") + + def _check_common_args(self): + "check the args that are common to both urlgrab and urlread" + self.assert_(hasattr(self, 'obj')) + self.assert_(hasattr(self, 'args')) + self.assert_(hasattr(self, 'kwargs')) + self.assertEquals(self.args, ('foo',)) + self.assertEquals(self.kwargs, {'bar': 'baz'}) + self.assert_(isinstance(self.obj, CallbackObject)) + self.assertEquals(self.obj.url, short_ref_http) + + def test_checkfunc_urlgrab_args(self): + "check for proper args when used with urlgrab" + self.g.urlgrab(short_ref_http, self.filename) + self._check_common_args() + self.assertEquals(self.obj.filename, self.filename) + + def test_checkfunc_urlread_args(self): + "check for proper args when used with urlread" + self.g.urlread(short_ref_http) + self._check_common_args() + self.assertEquals(self.obj.data, short_reference_data) + + def test_checkfunc_urlgrab_success(self): + "check success with urlgrab checkfunc" + self.data = short_reference_data + self.g.urlgrab(short_ref_http, self.filename) + + def test_checkfunc_urlread_success(self): + "check success with urlread checkfunc" + self.data = short_reference_data + self.g.urlread(short_ref_http) + + def test_checkfunc_urlgrab_failure(self): + "check failure with urlgrab checkfunc" + self.data = 'other data' + self.assertRaises(URLGrabError, self.g.urlgrab, + short_ref_http, self.filename) + + def test_checkfunc_urlread_failure(self): + "check failure with urlread checkfunc" + self.data = 'other data' + self.assertRaises(URLGrabError, self.g.urlread, + short_ref_http) + +class RegetTestBase: + def setUp(self): + self.ref = short_reference_data + self.grabber = grabber.URLGrabber(reget='check_timestamp') + self.filename = tempfile.mktemp() + self.hl = len(self.ref) / 2 + self.url 
= 'OVERRIDE THIS' + + def tearDown(self): + try: os.unlink(self.filename) + except: pass + + def _make_half_zero_file(self): + fo = file(self.filename, 'wb') + fo.write('0'*self.hl) + fo.close() + + def _read_file(self): + fo = file(self.filename, 'rb') + data = fo.read() + fo.close() + return data + +class CommonRegetTests(RegetTestBase, TestCase): + def test_bad_reget_type(self): + "exception raised for illegal reget mode" + self.assertRaises(URLGrabError, self.grabber.urlgrab, + self.url, self.filename, reget='junk') + +class FTPRegetTests(RegetTestBase, TestCase): + def setUp(self): + RegetTestBase.setUp(self) + self.url = short_ref_ftp + # this tests to see if the server is available. If it's not, + # then these tests will be skipped + try: + fo = urllib2.urlopen(self.url).close() + except IOError: + self.skip() + + def test_basic_reget(self): + 'simple (forced) reget' + self._make_half_zero_file() + self.grabber.urlgrab(self.url, self.filename, reget='simple') + data = self._read_file() + + self.assertEquals(data[:self.hl], '0'*self.hl) + self.assertEquals(data[self.hl:], self.ref[self.hl:]) + +class HTTPRegetTests(FTPRegetTests): + def setUp(self): + RegetTestBase.setUp(self) + self.url = short_ref_http + + def test_older_check_timestamp(self): + try: + # define this here rather than in the FTP tests because currently, + # we get no timestamp information back from ftp servers. + self._make_half_zero_file() + ts = 1600000000 # set local timestamp to 2020 + os.utime(self.filename, (ts, ts)) + self.grabber.urlgrab(self.url, self.filename, reget='check_timestamp') + data = self._read_file() + + self.assertEquals(data[:self.hl], '0'*self.hl) + self.assertEquals(data[self.hl:], self.ref[self.hl:]) + except NotImplementedError: + self.skip() + + def test_newer_check_timestamp(self): + try: + # define this here rather than in the FTP tests because currently, + # we get no timestamp information back from ftp servers. 
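+            # With the local mtime pushed into the future below,
+            # 'check_timestamp' sees the local file as at least as new as
+            # the server copy, so the transfer resumes instead of restarting.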
+ self._make_half_zero_file() + ts = 1 # set local timestamp to 1969 + os.utime(self.filename, (ts, ts)) + self.grabber.urlgrab(self.url, self.filename, reget='check_timestamp') + data = self._read_file() + + self.assertEquals(data, self.ref) + except: + self.skip() + +class FileRegetTests(HTTPRegetTests): + def setUp(self): + self.ref = short_reference_data + tmp = tempfile.mktemp() + tmpfo = file(tmp, 'wb') + tmpfo.write(self.ref) + tmpfo.close() + self.tmp = tmp + + (url, parts) = grabber.default_grabber.opts.urlparser.parse( + tmp, grabber.default_grabber.opts) + self.url = url + + self.grabber = grabber.URLGrabber(reget='check_timestamp', + copy_local=1) + self.filename = tempfile.mktemp() + self.hl = len(self.ref) / 2 + + def tearDown(self): + try: os.unlink(self.filename) + except: pass + try: os.unlink(self.tmp) + except: pass + +class ProFTPDSucksTests(TestCase): + def setUp(self): + self.url = ref_proftp + try: + fo = urllib2.urlopen(self.url).close() + except IOError: + self.skip() + + def test_restart_workaround(self): + inst = grabber.URLGrabber() + rslt = inst.urlread(self.url, range=(500, 1000)) + +class BaseProxyTests(TestCase): + good_p = '%s://%s:%s@%s:%i' % (proxy_proto, proxy_user, + good_proxy_pass, proxy_host, proxy_port) + bad_p = '%s://%s:%s@%s:%i' % (proxy_proto, proxy_user, + bad_proxy_pass, proxy_host, proxy_port) + good_proxies = {'ftp': good_p, 'http': good_p} + bad_proxies = {'ftp': bad_p, 'http': bad_p} + + def have_proxy(self): + have_proxy = 1 + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.connect((proxy_host, proxy_port)) + s.close() + except socket.error: + have_proxy = 0 + return have_proxy + + +class ProxyHTTPAuthTests(BaseProxyTests): + def setUp(self): + self.url = ref_http + if not self.have_proxy(): + self.skip() + self.g = URLGrabber() + + def test_good_password(self): + self.g.urlopen(self.url, proxies=self.good_proxies) + + def test_bad_password(self): + self.assertRaises(URLGrabError, self.g.urlopen, + self.url, proxies=self.bad_proxies) + +class ProxyFTPAuthTests(ProxyHTTPAuthTests): + def setUp(self): + self.url = ref_ftp + if not self.have_proxy(): + self.skip() + try: + fo = urllib2.urlopen(self.url).close() + except IOError: + self.skip() + self.g = URLGrabber() + +def suite(): + tl = TestLoader() + return tl.loadTestsFromModule(sys.modules[__name__]) + +if __name__ == '__main__': + grabber.DEBUG = 0 + runner = TextTestRunner(stream=sys.stdout,descriptions=1,verbosity=2) + runner.run(suite()) + diff --git a/test/test_mirror.py b/test/test_mirror.py new file mode 100644 index 0000000..70fe069 --- /dev/null +++ b/test/test_mirror.py @@ -0,0 +1,275 @@ +#!/usr/bin/python -t + +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2004 Michael D. 
Stenner, Ryan Tomayko
+
+"""mirror.py tests"""
+
+# $Id: test_mirror.py,v 1.12 2005/10/22 21:57:27 mstenner Exp $
+
+import sys
+import os
+import string, tempfile, random, cStringIO
+
+import urlgrabber.grabber
+from urlgrabber.grabber import URLGrabber, URLGrabError
+import urlgrabber.mirror
+from urlgrabber.mirror import MirrorGroup, MGRandomStart, MGRandomOrder
+
+from base_test_code import *
+
+class FakeLogger:
+    def __init__(self):
+        self.logs = []
+    def debug(self, msg, *args):
+        self.logs.append(msg % args)
+    warn = warning = info = error = debug
+
+class BasicTests(TestCase):
+    def setUp(self):
+        self.g  = URLGrabber()
+        fullmirrors = [base_mirror_url + m + '/' for m in good_mirrors]
+        self.mg = MirrorGroup(self.g, fullmirrors)
+
+    def test_urlgrab(self):
+        """MirrorGroup.urlgrab"""
+        filename = tempfile.mktemp()
+        url = 'short_reference'
+        self.mg.urlgrab(url, filename)
+
+        fo = open(filename)
+        data = fo.read()
+        fo.close()
+
+        self.assertEqual(data, short_reference_data)
+
+    def test_urlread(self):
+        """MirrorGroup.urlread"""
+        url = 'short_reference'
+        data = self.mg.urlread(url)
+
+        self.assertEqual(data, short_reference_data)
+
+    def test_urlopen(self):
+        """MirrorGroup.urlopen"""
+        url = 'short_reference'
+        fo = self.mg.urlopen(url)
+        data = fo.read()
+        fo.close()
+
+        self.assertEqual(data, short_reference_data)
+
+class SubclassTests(TestCase):
+    def setUp(self):
+        self.g = URLGrabber()
+        self.fullmirrors = [base_mirror_url + m + '/' for m in good_mirrors]
+
+    def fetchwith(self, mgclass):
+        self.mg = mgclass(self.g, self.fullmirrors)
+
+        filename = tempfile.mktemp()
+        url = 'short_reference'
+        self.mg.urlgrab(url, filename)
+
+        fo = open(filename)
+        data = fo.read()
+        fo.close()
+
+        self.assertEqual(data, short_reference_data)
+
+    def test_MGRandomStart(self):
+        "MGRandomStart.urlgrab"
+        self.fetchwith(MGRandomStart)
+
+    def test_MGRandomOrder(self):
+        "MGRandomOrder.urlgrab"
+        self.fetchwith(MGRandomOrder)
+
+class CallbackTests(TestCase):
+    def setUp(self):
+        self.g = URLGrabber()
+        fullmirrors = [base_mirror_url + m + '/' for m in \
+                       (bad_mirrors + good_mirrors)]
+        self.mg = MirrorGroup(self.g, fullmirrors)
+
+    def test_failure_callback(self):
+        "test that MG executes the failure callback correctly"
+        tricky_list = []
+        def failure_callback(cb_obj, tl):
+            tl.append(str(cb_obj.exception))
+        self.mg.failure_callback = failure_callback, (tricky_list, ), {}
+        data = self.mg.urlread('reference')
+        self.assert_(data == reference_data)
+        self.assertEquals(tricky_list[0][:25],
+                          '[Errno 14] HTTP Error 403')
+
+    def test_callback_reraise(self):
+        "test that the callback can correctly re-raise the exception"
+        def failure_callback(cb_obj): raise cb_obj.exception
+        self.mg.failure_callback = failure_callback
+        self.assertRaises(URLGrabError, self.mg.urlread, 'reference')
+
+class BadMirrorTests(TestCase):
+    def setUp(self):
+        self.g = URLGrabber()
+        fullmirrors = [base_mirror_url + m + '/' for m in bad_mirrors]
+        self.mg = MirrorGroup(self.g, fullmirrors)
+
+    def test_simple_grab(self):
+        """test that a bad mirror raises URLGrabError"""
+        filename = tempfile.mktemp()
+        url = 'reference'
+        self.assertRaises(URLGrabError, self.mg.urlgrab, url, filename)
+
+class FailoverTests(TestCase):
+    def setUp(self):
+        self.g = URLGrabber()
+        fullmirrors = [base_mirror_url + m + '/' for m in \
+                       (bad_mirrors + good_mirrors)]
+        self.mg = MirrorGroup(self.g, fullmirrors)
+
+    def test_simple_grab(self):
+        """test that the MG fails over past a bad mirror"""
+        filename = tempfile.mktemp()
+
url = 'reference' + elist = [] + def cb(e, elist=elist): elist.append(e) + self.mg.urlgrab(url, filename, failure_callback=cb) + + fo = open(filename) + contents = fo.read() + fo.close() + + # first be sure that the first mirror failed and that the + # callback was called + self.assertEqual(len(elist), 1) + # now be sure that the second mirror succeeded and the correct + # data was returned + self.assertEqual(contents, reference_data) + +class FakeGrabber: + def __init__(self, resultlist=None): + self.resultlist = resultlist or [] + self.index = 0 + self.calls = [] + + def urlgrab(self, url, filename=None, **kwargs): + self.calls.append( (url, filename) ) + res = self.resultlist[self.index] + self.index += 1 + if isinstance(res, Exception): raise res + else: return res + +class ActionTests(TestCase): + def setUp(self): + self.snarfed_logs = [] + self.db = urlgrabber.mirror.DEBUG + urlgrabber.mirror.DEBUG = FakeLogger() + self.mirrors = ['a', 'b', 'c', 'd', 'e', 'f'] + self.g = FakeGrabber([URLGrabError(3), URLGrabError(3), 'filename']) + self.mg = MirrorGroup(self.g, self.mirrors) + + def tearDown(self): + urlgrabber.mirror.DEBUG = self.db + + def test_defaults(self): + 'test default action policy' + self.mg.urlgrab('somefile') + expected_calls = [ (m + '/' + 'somefile', None) \ + for m in self.mirrors[:3] ] + expected_logs = \ + ['MIRROR: trying somefile -> a/somefile', + 'MIRROR: failed', + 'GR mirrors: [b c d e f] 0', + 'MAIN mirrors: [a b c d e f] 1', + 'MIRROR: trying somefile -> b/somefile', + 'MIRROR: failed', + 'GR mirrors: [c d e f] 0', + 'MAIN mirrors: [a b c d e f] 2', + 'MIRROR: trying somefile -> c/somefile'] + + self.assertEquals(self.g.calls, expected_calls) + self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs) + + def test_instance_action(self): + 'test the effects of passed-in default_action' + self.mg.default_action = {'remove_master': 1} + self.mg.urlgrab('somefile') + expected_calls = [ (m + '/' + 'somefile', None) \ + for m in self.mirrors[:3] ] + expected_logs = \ + ['MIRROR: trying somefile -> a/somefile', + 'MIRROR: failed', + 'GR mirrors: [b c d e f] 0', + 'MAIN mirrors: [b c d e f] 0', + 'MIRROR: trying somefile -> b/somefile', + 'MIRROR: failed', + 'GR mirrors: [c d e f] 0', + 'MAIN mirrors: [c d e f] 0', + 'MIRROR: trying somefile -> c/somefile'] + + self.assertEquals(self.g.calls, expected_calls) + self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs) + + def test_method_action(self): + 'test the effects of method-level default_action' + self.mg.urlgrab('somefile', default_action={'remove_master': 1}) + expected_calls = [ (m + '/' + 'somefile', None) \ + for m in self.mirrors[:3] ] + expected_logs = \ + ['MIRROR: trying somefile -> a/somefile', + 'MIRROR: failed', + 'GR mirrors: [b c d e f] 0', + 'MAIN mirrors: [b c d e f] 0', + 'MIRROR: trying somefile -> b/somefile', + 'MIRROR: failed', + 'GR mirrors: [c d e f] 0', + 'MAIN mirrors: [c d e f] 0', + 'MIRROR: trying somefile -> c/somefile'] + + self.assertEquals(self.g.calls, expected_calls) + self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs) + + + def callback(self, e): return {'fail': 1} + + def test_callback_action(self): + 'test the effects of a callback-returned action' + self.assertRaises(URLGrabError, self.mg.urlgrab, 'somefile', + failure_callback=self.callback) + expected_calls = [ (m + '/' + 'somefile', None) \ + for m in self.mirrors[:1] ] + expected_logs = \ + ['MIRROR: trying somefile -> a/somefile', + 'MIRROR: failed', + 'GR mirrors: [b c d e f] 0', + 'MAIN mirrors: 
[a b c d e f] 1']
+
+        self.assertEquals(self.g.calls, expected_calls)
+        self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
+
+
+def suite():
+    tl = TestLoader()
+    return tl.loadTestsFromModule(sys.modules[__name__])
+
+if __name__ == '__main__':
+    runner = TextTestRunner(stream=sys.stdout,descriptions=1,verbosity=2)
+    runner.run(suite())
+
diff --git a/test/threading/batchgrabber.py b/test/threading/batchgrabber.py
new file mode 100644
index 0000000..076b7ef
--- /dev/null
+++ b/test/threading/batchgrabber.py
@@ -0,0 +1,110 @@
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+#      Free Software Foundation, Inc.,
+#      59 Temple Place, Suite 330,
+#      Boston, MA  02111-1307  USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+
+"""Module for testing urlgrabber under multiple threads.
+
+This module can be used from the command line.  Each argument is
+a URL to grab.
+
+The BatchURLGrabber class has an interface similar to URLGrabber
+but instead of pulling files when urlgrab is called, the request
+is queued.  Calling BatchURLGrabber.batchgrab causes all files to
+be pulled in multiple threads.
+
+"""
+
+import os.path, sys
+if __name__ == '__main__':
+    print os.path.dirname(sys.argv[0])
+    sys.path.insert(0, (os.path.dirname(sys.argv[0]) or '.') + '/../..')
+
+from threading import Thread, Semaphore
+from urlgrabber.grabber import URLGrabber, URLGrabError
+from urlgrabber.progress import MultiFileMeter, TextMultiFileMeter
+from time import sleep, time
+
+DEBUG=0
+
+class BatchURLGrabber:
+    def __init__(self, maxthreads=5, **kwargs):
+        # honor the caller's thread limit (this used to be hardcoded to 5)
+        self.maxthreads = maxthreads
+        self.grabber = URLGrabber(**kwargs)
+        self.queue = []
+        self.threads = []
+        self.sem = Semaphore()
+
+    def urlgrab(self, url, filename=None, **kwargs):
+        self.queue.append( (url, filename, kwargs) )
+
+    def batchgrab(self):
+        if hasattr(self.grabber.opts.progress_obj, 'start'):
+            self.grabber.opts.progress_obj.start(len(self.queue))
+        while self.queue or self.threads:
+            if self.queue and (len(self.threads) < self.maxthreads):
+                url, filename, kwargs = self.queue[0]
+                del self.queue[0]
+                thread = Worker(self, url, filename, kwargs)
+                self.threads.append(thread)
+                if DEBUG: print "starting worker: " + url
+                thread.start()
+            else:
+                for t in self.threads:
+                    if not t.isAlive():
+                        if DEBUG: print "cleaning up worker: " + t.url
+                        self.threads.remove(t)
+                #if len(self.threads) == self.maxthreads:
+                #    sleep(0.2)
+                sleep(0.2)
+
+class Worker(Thread):
+    def __init__(self, parent, url, filename, kwargs):
+        Thread.__init__(self)
+        self.parent = parent
+        self.url = url
+        self.filename = filename
+        self.kwargs = kwargs
+
+    def run(self):
+        if DEBUG: print "worker thread started."
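+        # The shared progress object may be a MultiFileMeter; handing each
+        # worker its own sub-meter via newMeter() below lets every download
+        # report progress independently of the other threads.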
+ grabber = self.parent.grabber + progress_obj = grabber.opts.progress_obj + if isinstance(progress_obj, MultiFileMeter): + self.kwargs['progress_obj'] = progress_obj.newMeter() + try: + rslt = self.parent.grabber.urlgrab(self.url, self.filename, **self.kwargs) + except URLGrabError, e: + print '%s, %s' % (e, self.url) + +def main(): + progress_obj = None + # uncomment to play with BatchProgressMeter (doesn't work right now) + # progress_obj = TextMultiFileMeter() + g = BatchURLGrabber(keepalive=1, progress_obj=progress_obj) + for arg in sys.argv[1:]: + g.urlgrab(arg) + if DEBUG: print "before batchgrab" + try: + g.batchgrab() + except KeyboardInterrupt: + sys.exit(1) + + if DEBUG: print "after batchgrab" + +if __name__ == '__main__': + main() diff --git a/urlgrabber/__init__.py b/urlgrabber/__init__.py new file mode 100644 index 0000000..ddd5204 --- /dev/null +++ b/urlgrabber/__init__.py @@ -0,0 +1,54 @@ +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko +# Copyright 2009 Red Hat, Inc - pycurl support added by Seth Vidal + + +"""A high-level cross-protocol url-grabber. + +Using urlgrabber, data can be fetched in three basic ways: + + urlgrab(url) copy the file to the local filesystem + urlopen(url) open the remote file and return a file object + (like urllib2.urlopen) + urlread(url) return the contents of the file as a string + +When using these functions (or methods), urlgrabber supports the +following features: + + * identical behavior for http://, ftp://, and file:// urls + * http keepalive - faster downloads of many files by using + only a single connection + * byte ranges - fetch only a portion of the file + * reget - for a urlgrab, resume a partial download + * progress meters - the ability to report download progress + automatically, even when using urlopen! + * throttling - restrict bandwidth usage + * retries - automatically retry a download if it fails. The + number of retries and failure types are configurable. + * authenticated server access for http and ftp + * proxy support - support for authenticated http and ftp proxies + * mirror groups - treat a list of mirrors as a single source, + automatically switching mirrors if there is a failure. +""" + +__version__ = '3.9.1' +__date__ = '2009/09/25' +__author__ = 'Michael D. 
Stenner <mstenner@linux.duke.edu>, ' \
+             'Ryan Tomayko <rtomayko@naeblis.cx>, ' \
+             'Seth Vidal <skvidal@fedoraproject.org>'
+__url__ = 'http://linux.duke.edu/projects/urlgrabber/'
+
+from grabber import urlgrab, urlopen, urlread
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
new file mode 100644
index 0000000..3e5f3b7
--- /dev/null
+++ b/urlgrabber/byterange.py
@@ -0,0 +1,463 @@
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+#      Free Software Foundation, Inc.,
+#      59 Temple Place, Suite 330,
+#      Boston, MA  02111-1307  USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+
+
+import os
+import stat
+import urllib
+import urllib2
+import rfc822
+
+DEBUG = None
+
+try:
+    from cStringIO import StringIO
+except ImportError, msg:
+    from StringIO import StringIO
+
+class RangeError(IOError):
+    """Error raised when an unsatisfiable range is requested."""
+    pass
+
+class HTTPRangeHandler(urllib2.BaseHandler):
+    """Handler that enables HTTP Range headers.
+
+    This was extremely simple.  The Range header is an HTTP feature to
+    begin with so all this class does is tell urllib2 that the
+    "206 Partial Content" response from the HTTP server is what we
+    expected.
+
+    Example:
+        import urllib2
+        import byterange
+
+        range_handler = byterange.HTTPRangeHandler()
+        opener = urllib2.build_opener(range_handler)
+
+        # install it
+        urllib2.install_opener(opener)
+
+        # create Request and set Range header
+        req = urllib2.Request('http://www.python.org/')
+        req.add_header('Range', 'bytes=30-50')
+        f = urllib2.urlopen(req)
+    """
+
+    def http_error_206(self, req, fp, code, msg, hdrs):
+        # 206 Partial Content Response
+        r = urllib.addinfourl(fp, hdrs, req.get_full_url())
+        r.code = code
+        r.msg = msg
+        return r
+
+    def http_error_416(self, req, fp, code, msg, hdrs):
+        # HTTP's Range Not Satisfiable error
+        raise RangeError('Requested Range Not Satisfiable')
+
+class HTTPSRangeHandler(HTTPRangeHandler):
+    """ Range Header support for HTTPS. """
+
+    def https_error_206(self, req, fp, code, msg, hdrs):
+        return self.http_error_206(req, fp, code, msg, hdrs)
+
+    def https_error_416(self, req, fp, code, msg, hdrs):
+        # delegate to the HTTP handler (this used to recurse into itself)
+        return self.http_error_416(req, fp, code, msg, hdrs)
+
+class RangeableFileObject:
+    """File object wrapper to enable raw range handling.
+    This was implemented primarily for handling range
+    specifications for file:// urls. This object effectively makes
+    a file object look like it consists only of a range of bytes in
+    the stream.
+
+    Examples:
+        # expose 10 bytes, starting at byte position 20, from
+        # /etc/passwd.
+        >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30))
+        # seek seeks within the range (to position 23 in this case)
+        >>> fo.seek(3)
+        # tell tells where you're at _within the range_ (position 3 in
+        # this case)
+        >>> fo.tell()
+        # read EOFs if an attempt is made to read past the last
+        # byte in the range. 
the following will return only 7 bytes. + >>> fo.read(30) + """ + + def __init__(self, fo, rangetup): + """Create a RangeableFileObject. + fo -- a file like object. only the read() method need be + supported but supporting an optimized seek() is + preferable. + rangetup -- a (firstbyte,lastbyte) tuple specifying the range + to work over. + The file object provided is assumed to be at byte offset 0. + """ + self.fo = fo + (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup) + self.realpos = 0 + self._do_seek(self.firstbyte) + + def __getattr__(self, name): + """This effectively allows us to wrap at the instance level. + Any attribute not found in _this_ object will be searched for + in self.fo. This includes methods.""" + if hasattr(self.fo, name): + return getattr(self.fo, name) + raise AttributeError, name + + def tell(self): + """Return the position within the range. + This is different from fo.seek in that position 0 is the + first byte position of the range tuple. For example, if + this object was created with a range tuple of (500,899), + tell() will return 0 when at byte position 500 of the file. + """ + return (self.realpos - self.firstbyte) + + def seek(self,offset,whence=0): + """Seek within the byte range. + Positioning is identical to that described under tell(). + """ + assert whence in (0, 1, 2) + if whence == 0: # absolute seek + realoffset = self.firstbyte + offset + elif whence == 1: # relative seek + realoffset = self.realpos + offset + elif whence == 2: # absolute from end of file + # XXX: are we raising the right Error here? + raise IOError('seek from end of file not supported.') + + # do not allow seek past lastbyte in range + if self.lastbyte and (realoffset >= self.lastbyte): + realoffset = self.lastbyte + + self._do_seek(realoffset - self.realpos) + + def read(self, size=-1): + """Read within the range. + This method will limit the size read based on the range. + """ + size = self._calc_read_size(size) + rslt = self.fo.read(size) + self.realpos += len(rslt) + return rslt + + def readline(self, size=-1): + """Read lines within the range. + This method will limit the size read based on the range. + """ + size = self._calc_read_size(size) + rslt = self.fo.readline(size) + self.realpos += len(rslt) + return rslt + + def _calc_read_size(self, size): + """Handles calculating the amount of data to read based on + the range. + """ + if self.lastbyte: + if size > -1: + if ((self.realpos + size) >= self.lastbyte): + size = (self.lastbyte - self.realpos) + else: + size = (self.lastbyte - self.realpos) + return size + + def _do_seek(self,offset): + """Seek based on whether wrapped object supports seek(). + offset is relative to the current position (self.realpos). + """ + assert offset >= 0 + if not hasattr(self.fo, 'seek'): + self._poor_mans_seek(offset) + else: + self.fo.seek(self.realpos + offset) + self.realpos+= offset + + def _poor_mans_seek(self,offset): + """Seek by calling the wrapped file objects read() method. + This is used for file like objects that do not have native + seek support. The wrapped objects read() method is called + to manually seek to the desired position. + offset -- read this number of bytes from the wrapped + file object. + raise RangeError if we encounter EOF before reaching the + specified offset. 
+ """ + pos = 0 + bufsize = 1024 + while pos < offset: + if (pos + bufsize) > offset: + bufsize = offset - pos + buf = self.fo.read(bufsize) + if len(buf) != bufsize: + raise RangeError('Requested Range Not Satisfiable') + pos+= bufsize + +class FileRangeHandler(urllib2.FileHandler): + """FileHandler subclass that adds Range support. + This class handles Range headers exactly like an HTTP + server would. + """ + def open_local_file(self, req): + import mimetypes + import mimetools + host = req.get_host() + file = req.get_selector() + localfile = urllib.url2pathname(file) + stats = os.stat(localfile) + size = stats[stat.ST_SIZE] + modified = rfc822.formatdate(stats[stat.ST_MTIME]) + mtype = mimetypes.guess_type(file)[0] + if host: + host, port = urllib.splitport(host) + if port or socket.gethostbyname(host) not in self.get_names(): + raise urllib2.URLError('file not on local host') + fo = open(localfile,'rb') + brange = req.headers.get('Range',None) + brange = range_header_to_tuple(brange) + assert brange != () + if brange: + (fb,lb) = brange + if lb == '': lb = size + if fb < 0 or fb > size or lb > size: + raise RangeError('Requested Range Not Satisfiable') + size = (lb - fb) + fo = RangeableFileObject(fo, (fb,lb)) + headers = mimetools.Message(StringIO( + 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % + (mtype or 'text/plain', size, modified))) + return urllib.addinfourl(fo, headers, 'file:'+file) + + +# FTP Range Support +# Unfortunately, a large amount of base FTP code had to be copied +# from urllib and urllib2 in order to insert the FTP REST command. +# Code modifications for range support have been commented as +# follows: +# -- range support modifications start/end here + +from urllib import splitport, splituser, splitpasswd, splitattr, \ + unquote, addclosehook, addinfourl +import ftplib +import socket +import sys +import mimetypes +import mimetools + +class FTPRangeHandler(urllib2.FTPHandler): + def ftp_open(self, req): + host = req.get_host() + if not host: + raise IOError, ('ftp error', 'no host given') + host, port = splitport(host) + if port is None: + port = ftplib.FTP_PORT + else: + port = int(port) + + # username/password handling + user, host = splituser(host) + if user: + user, passwd = splitpasswd(user) + else: + passwd = None + host = unquote(host) + user = unquote(user or '') + passwd = unquote(passwd or '') + + try: + host = socket.gethostbyname(host) + except socket.error, msg: + raise urllib2.URLError(msg) + path, attrs = splitattr(req.get_selector()) + dirs = path.split('/') + dirs = map(unquote, dirs) + dirs, file = dirs[:-1], dirs[-1] + if dirs and not dirs[0]: + dirs = dirs[1:] + try: + fw = self.connect_ftp(user, passwd, host, port, dirs) + type = file and 'I' or 'D' + for attr in attrs: + attr, value = splitattr(attr) + if attr.lower() == 'type' and \ + value in ('a', 'A', 'i', 'I', 'd', 'D'): + type = value.upper() + + # -- range support modifications start here + rest = None + range_tup = range_header_to_tuple(req.headers.get('Range',None)) + assert range_tup != () + if range_tup: + (fb,lb) = range_tup + if fb > 0: rest = fb + # -- range support modifications end here + + fp, retrlen = fw.retrfile(file, type, rest) + + # -- range support modifications start here + if range_tup: + (fb,lb) = range_tup + if lb == '': + if retrlen is None or retrlen == 0: + raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.') + lb = retrlen + retrlen = lb - fb + if retrlen < 0: + # beginning of range is larger than file + raise 
RangeError('Requested Range Not Satisfiable') + else: + retrlen = lb - fb + fp = RangeableFileObject(fp, (0,retrlen)) + # -- range support modifications end here + + headers = "" + mtype = mimetypes.guess_type(req.get_full_url())[0] + if mtype: + headers += "Content-Type: %s\n" % mtype + if retrlen is not None and retrlen >= 0: + headers += "Content-Length: %d\n" % retrlen + sf = StringIO(headers) + headers = mimetools.Message(sf) + return addinfourl(fp, headers, req.get_full_url()) + except ftplib.all_errors, msg: + raise IOError, ('ftp error', msg), sys.exc_info()[2] + + def connect_ftp(self, user, passwd, host, port, dirs): + fw = ftpwrapper(user, passwd, host, port, dirs) + return fw + +class ftpwrapper(urllib.ftpwrapper): + # range support note: + # this ftpwrapper code is copied directly from + # urllib. The only enhancement is to add the rest + # argument and pass it on to ftp.ntransfercmd + def retrfile(self, file, type, rest=None): + self.endtransfer() + if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 + else: cmd = 'TYPE ' + type; isdir = 0 + try: + self.ftp.voidcmd(cmd) + except ftplib.all_errors: + self.init() + self.ftp.voidcmd(cmd) + conn = None + if file and not isdir: + # Use nlst to see if the file exists at all + try: + self.ftp.nlst(file) + except ftplib.error_perm, reason: + raise IOError, ('ftp error', reason), sys.exc_info()[2] + # Restore the transfer mode! + self.ftp.voidcmd(cmd) + # Try to retrieve as a file + try: + cmd = 'RETR ' + file + conn = self.ftp.ntransfercmd(cmd, rest) + except ftplib.error_perm, reason: + if str(reason)[:3] == '501': + # workaround for REST not supported error + fp, retrlen = self.retrfile(file, type) + fp = RangeableFileObject(fp, (rest,'')) + return (fp, retrlen) + elif str(reason)[:3] != '550': + raise IOError, ('ftp error', reason), sys.exc_info()[2] + if not conn: + # Set transfer mode to ASCII! + self.ftp.voidcmd('TYPE A') + # Try a directory listing + if file: cmd = 'LIST ' + file + else: cmd = 'LIST' + conn = self.ftp.ntransfercmd(cmd) + self.busy = 1 + # Pass back both a suitably decorated object and a retrieval length + return (addclosehook(conn[0].makefile('rb'), + self.endtransfer), conn[1]) + + +#################################################################### +# Range Tuple Functions +# XXX: These range tuple functions might go better in a class. + +_rangere = None +def range_header_to_tuple(range_header): + """Get a (firstbyte,lastbyte) tuple from a Range header value. + + Range headers have the form "bytes=<firstbyte>-<lastbyte>". This + function pulls the firstbyte and lastbyte values and returns + a (firstbyte,lastbyte) tuple. If lastbyte is not specified in + the header value, it is returned as an empty string in the + tuple. + + Return None if range_header is None + Return () if range_header does not conform to the range spec + pattern. + + """ + global _rangere + if range_header is None: return None + if _rangere is None: + import re + _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)') + match = _rangere.match(range_header) + if match: + tup = range_tuple_normalize(match.group(1,2)) + if tup and tup[1]: + tup = (tup[0],tup[1]+1) + return tup + return () + +def range_tuple_to_header(range_tup): + """Convert a range tuple to a Range header value. + Return a string of the form "bytes=<firstbyte>-<lastbyte>" or None + if no range is needed. 
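+
+    Examples (illustrative; note the header's last byte is inclusive,
+    while the tuple's is exclusive):
+      range_tuple_to_header((500, 600))  =>  'bytes=500-599'
+      range_tuple_to_header((500, ''))   =>  'bytes=500-'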
+ """ + if range_tup is None: return None + range_tup = range_tuple_normalize(range_tup) + if range_tup: + if range_tup[1]: + range_tup = (range_tup[0],range_tup[1] - 1) + return 'bytes=%s-%s' % range_tup + +def range_tuple_normalize(range_tup): + """Normalize a (first_byte,last_byte) range tuple. + Return a tuple whose first element is guaranteed to be an int + and whose second element will be '' (meaning: the last byte) or + an int. Finally, return None if the normalized tuple == (0,'') + as that is equivelant to retrieving the entire file. + """ + if range_tup is None: return None + # handle first byte + fb = range_tup[0] + if fb in (None,''): fb = 0 + else: fb = int(fb) + # handle last byte + try: lb = range_tup[1] + except IndexError: lb = '' + else: + if lb is None: lb = '' + elif lb != '': lb = int(lb) + # check if range is over the entire file + if (fb,lb) == (0,''): return None + # check that the range is valid + if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb)) + return (fb,lb) + diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py new file mode 100644 index 0000000..e090e90 --- /dev/null +++ b/urlgrabber/grabber.py @@ -0,0 +1,1730 @@ +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko +# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal + +"""A high-level cross-protocol url-grabber. + +GENERAL ARGUMENTS (kwargs) + + Where possible, the module-level default is indicated, and legal + values are provided. + + copy_local = 0 [0|1] + + ignored except for file:// urls, in which case it specifies + whether urlgrab should still make a copy of the file, or simply + point to the existing copy. The module level default for this + option is 0. + + close_connection = 0 [0|1] + + tells URLGrabber to close the connection after a file has been + transfered. This is ignored unless the download happens with the + http keepalive handler (keepalive=1). Otherwise, the connection + is left open for further use. The module level default for this + option is 0 (keepalive connections will not be closed). + + keepalive = 1 [0|1] + + specifies whether keepalive should be used for HTTP/1.1 servers + that support it. The module level default for this option is 1 + (keepalive is enabled). + + progress_obj = None + + a class instance that supports the following methods: + po.start(filename, url, basename, length, text) + # length will be None if unknown + po.update(read) # read == bytes read so far + po.end() + + text = None + + specifies alternative text to be passed to the progress meter + object. If not given, the default progress meter will use the + basename of the file. + + throttle = 1.0 + + a number - if it's an int, it's the bytes/second throttle limit. 
+ If it's a float, it is first multiplied by bandwidth. If throttle + == 0, throttling is disabled. If None, the module-level default + (which can be set on default_grabber.throttle) is used. See + BANDWIDTH THROTTLING for more information. + + timeout = None + + a positive float expressing the number of seconds to wait for socket + operations. If the value is None or 0.0, socket operations will block + forever. Setting this option causes urlgrabber to call the settimeout + method on the Socket object used for the request. See the Python + documentation on settimeout for more information. + http://www.python.org/doc/current/lib/socket-objects.html + + bandwidth = 0 + + the nominal max bandwidth in bytes/second. If throttle is a float + and bandwidth == 0, throttling is disabled. If None, the + module-level default (which can be set on + default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for + more information. + + range = None + + a tuple of the form (first_byte, last_byte) describing a byte + range to retrieve. Either or both of the values may set to + None. If first_byte is None, byte offset 0 is assumed. If + last_byte is None, the last byte available is assumed. Note that + the range specification is python-like in that (0,10) will yeild + the first 10 bytes of the file. + + If set to None, no range will be used. + + reget = None [None|'simple'|'check_timestamp'] + + whether to attempt to reget a partially-downloaded file. Reget + only applies to .urlgrab and (obviously) only if there is a + partially downloaded file. Reget has two modes: + + 'simple' -- the local file will always be trusted. If there + are 100 bytes in the local file, then the download will always + begin 100 bytes into the requested file. + + 'check_timestamp' -- the timestamp of the server file will be + compared to the timestamp of the local file. ONLY if the + local file is newer than or the same age as the server file + will reget be used. If the server file is newer, or the + timestamp is not returned, the entire file will be fetched. + + NOTE: urlgrabber can do very little to verify that the partial + file on disk is identical to the beginning of the remote file. + You may want to either employ a custom "checkfunc" or simply avoid + using reget in situations where corruption is a concern. + + user_agent = 'urlgrabber/VERSION' + + a string, usually of the form 'AGENT/VERSION' that is provided to + HTTP servers in the User-agent header. The module level default + for this option is "urlgrabber/VERSION". + + http_headers = None + + a tuple of 2-tuples, each containing a header and value. These + will be used for http and https requests only. For example, you + can do + http_headers = (('Pragma', 'no-cache'),) + + ftp_headers = None + + this is just like http_headers, but will be used for ftp requests. + + proxies = None + + a dictionary that maps protocol schemes to proxy hosts. For + example, to use a proxy server on host "foo" port 3128 for http + and https URLs: + proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' } + note that proxy authentication information may be provided using + normal URL constructs: + proxies={ 'http' : 'http://user:host@foo:3128' } + Lastly, if proxies is None, the default environment settings will + be used. + + prefix = None + + a url prefix that will be prepended to all requested urls. 
For + example: + g = URLGrabber(prefix='http://foo.com/mirror/') + g.urlgrab('some/file.txt') + ## this will fetch 'http://foo.com/mirror/some/file.txt' + This option exists primarily to allow identical behavior to + MirrorGroup (and derived) instances. Note: a '/' will be inserted + if necessary, so you cannot specify a prefix that ends with a + partial file or directory name. + + opener = None + No-op when using the curl backend (default) + + cache_openers = True + No-op when using the curl backend (default) + + data = None + + Only relevant for the HTTP family (and ignored for other + protocols), this allows HTTP POSTs. When the data kwarg is + present (and not None), an HTTP request will automatically become + a POST rather than GET. This is done by direct passthrough to + urllib2. If you use this, you may also want to set the + 'Content-length' and 'Content-type' headers with the http_headers + option. Note that python 2.2 handles the case of these + badly and if you do not use the proper case (shown here), your + values will be overridden with the defaults. + + urlparser = URLParser() + + The URLParser class handles pre-processing of URLs, including + auth-handling for user/pass encoded in http urls, file handing + (that is, filenames not sent as a URL), and URL quoting. If you + want to override any of this behavior, you can pass in a + replacement instance. See also the 'quote' option. + + quote = None + + Whether or not to quote the path portion of a url. + quote = 1 -> quote the URLs (they're not quoted yet) + quote = 0 -> do not quote them (they're already quoted) + quote = None -> guess what to do + + This option only affects proper urls like 'file:///etc/passwd'; it + does not affect 'raw' filenames like '/etc/passwd'. The latter + will always be quoted as they are converted to URLs. Also, only + the path part of a url is quoted. If you need more fine-grained + control, you should probably subclass URLParser and pass it in via + the 'urlparser' option. + + ssl_ca_cert = None + + this option can be used if M2Crypto is available and will be + ignored otherwise. If provided, it will be used to create an SSL + context. If both ssl_ca_cert and ssl_context are provided, then + ssl_context will be ignored and a new context will be created from + ssl_ca_cert. + + ssl_context = None + + No-op when using the curl backend (default) + + + self.ssl_verify_peer = True + + Check the server's certificate to make sure it is valid with what our CA validates + + self.ssl_verify_host = True + + Check the server's hostname to make sure it matches the certificate DN + + self.ssl_key = None + + Path to the key the client should use to connect/authenticate with + + self.ssl_key_type = 'PEM' + + PEM or DER - format of key + + self.ssl_cert = None + + Path to the ssl certificate the client should use to to authenticate with + + self.ssl_cert_type = 'PEM' + + PEM or DER - format of certificate + + self.ssl_key_pass = None + + password to access the ssl_key + + self.size = None + + size (in bytes) or Maximum size of the thing being downloaded. + This is mostly to keep us from exploding with an endless datastream + + self.max_header_size = 2097152 + + Maximum size (in bytes) of the headers. + + +RETRY RELATED ARGUMENTS + + retry = None + + the number of times to retry the grab before bailing. If this is + zero, it will retry forever. This was intentional... really, it + was :). If this value is not supplied or is supplied but is None + retrying does not occur. 
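+
+    For example, this sketch (the URL is illustrative) makes up to
+    three attempts before the final URLGrabError is allowed to
+    propagate:
+
+      from urlgrabber.grabber import URLGrabber
+      g = URLGrabber(retry=3)
+      g.urlgrab('http://foo.com/some/file.txt', '/tmp/file.txt')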
+ + retrycodes = [-1,2,4,5,6,7] + + a sequence of errorcodes (values of e.errno) for which it should + retry. See the doc on URLGrabError for more details on this. You + might consider modifying a copy of the default codes rather than + building yours from scratch so that if the list is extended in the + future (or one code is split into two) you can still enjoy the + benefits of the default list. You can do that with something like + this: + + retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes + if 12 not in retrycodes: + retrycodes.append(12) + + checkfunc = None + + a function to do additional checks. This defaults to None, which + means no additional checking. The function should simply return + on a successful check. It should raise URLGrabError on an + unsuccessful check. Raising of any other exception will be + considered immediate failure and no retries will occur. + + If it raises URLGrabError, the error code will determine the retry + behavior. Negative error numbers are reserved for use by these + passed in functions, so you can use many negative numbers for + different types of failure. By default, -1 results in a retry, + but this can be customized with retrycodes. + + If you simply pass in a function, it will be given exactly one + argument: a CallbackObject instance with the .url attribute + defined and either .filename (for urlgrab) or .data (for urlread). + For urlgrab, .filename is the name of the local file. For + urlread, .data is the actual string data. If you need other + arguments passed to the callback (program state of some sort), you + can do so like this: + + checkfunc=(function, ('arg1', 2), {'kwarg': 3}) + + if the downloaded file has filename /tmp/stuff, then this will + result in this call (for urlgrab): + + function(obj, 'arg1', 2, kwarg=3) + # obj.filename = '/tmp/stuff' + # obj.url = 'http://foo.com/stuff' + + NOTE: both the "args" tuple and "kwargs" dict must be present if + you use this syntax, but either (or both) can be empty. + + failure_callback = None + + The callback that gets called during retries when an attempt to + fetch a file fails. The syntax for specifying the callback is + identical to checkfunc, except for the attributes defined in the + CallbackObject instance. The attributes for failure_callback are: + + exception = the raised exception + url = the url we're trying to fetch + tries = the number of tries so far (including this one) + retry = the value of the retry option + + The callback is present primarily to inform the calling program of + the failure, but if it raises an exception (including the one it's + passed) that exception will NOT be caught and will therefore cause + future retries to be aborted. + + The callback is called for EVERY failure, including the last one. + On the last try, the callback can raise an alternate exception, + but it cannot (without severe trickiness) prevent the exception + from being raised. + + interrupt_callback = None + + This callback is called if KeyboardInterrupt is received at any + point in the transfer. Basically, this callback can have three + impacts on the fetch process based on the way it exits: + + 1) raise no exception: the current fetch will be aborted, but + any further retries will still take place + + 2) raise a URLGrabError: if you're using a MirrorGroup, then + this will prompt a failover to the next mirror according to + the behavior of the MirrorGroup subclass. It is recommended + that you raise URLGrabError with code 15, 'user abort'. 
If + you are NOT using a MirrorGroup subclass, then this is the + same as (3). + + 3) raise some other exception (such as KeyboardInterrupt), which + will not be caught at either the grabber or mirror levels. + That is, it will be raised up all the way to the caller. + + This callback is very similar to failure_callback. They are + passed the same arguments, so you could use the same function for + both. + +BANDWIDTH THROTTLING + + urlgrabber supports throttling via two values: throttle and + bandwidth Between the two, you can either specify and absolute + throttle threshold or specify a theshold as a fraction of maximum + available bandwidth. + + throttle is a number - if it's an int, it's the bytes/second + throttle limit. If it's a float, it is first multiplied by + bandwidth. If throttle == 0, throttling is disabled. If None, the + module-level default (which can be set with set_throttle) is used. + + bandwidth is the nominal max bandwidth in bytes/second. If throttle + is a float and bandwidth == 0, throttling is disabled. If None, the + module-level default (which can be set with set_bandwidth) is used. + + THROTTLING EXAMPLES: + + Lets say you have a 100 Mbps connection. This is (about) 10^8 bits + per second, or 12,500,000 Bytes per second. You have a number of + throttling options: + + *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float + + This will limit urlgrab to use half of your available bandwidth. + + *) set_throttle(6250000) # throttle is an int + + This will also limit urlgrab to use half of your available + bandwidth, regardless of what bandwidth is set to. + + *) set_throttle(6250000); set_throttle(1.0) # float + + Use half your bandwidth + + *) set_throttle(6250000); set_throttle(2.0) # float + + Use up to 12,500,000 Bytes per second (your nominal max bandwidth) + + *) set_throttle(6250000); set_throttle(0) # throttle = 0 + + Disable throttling - this is more efficient than a very large + throttle setting. + + *) set_throttle(0); set_throttle(1.0) # throttle is float, bandwidth = 0 + + Disable throttling - this is the default when the module is loaded. + + SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING) + + While this is flexible, it's not extremely obvious to the user. I + suggest you implement a float throttle as a percent to make the + distinction between absolute and relative throttling very explicit. + + Also, you may want to convert the units to something more convenient + than bytes/second, such as kbps or kB/s, etc. + +""" + + + +import os +import sys +import urlparse +import time +import string +import urllib +import urllib2 +import mimetools +import thread +import types +import stat +import pycurl +from ftplib import parse150 +from StringIO import StringIO +from httplib import HTTPException +import socket +from byterange import range_tuple_normalize, range_tuple_to_header, RangeError + +######################################################################## +# MODULE INITIALIZATION +######################################################################## +try: + exec('from ' + (__name__.split('.'))[0] + ' import __version__') +except: + __version__ = '???' + +######################################################################## +# functions for debugging output. These functions are here because they +# are also part of the module initialization. +DEBUG = None +def set_logger(DBOBJ): + """Set the DEBUG object. 
This is called by _init_default_logger when + the environment variable URLGRABBER_DEBUG is set, but can also be + called by a calling program. Basically, if the calling program uses + the logging module and would like to incorporate urlgrabber logging, + then it can do so this way. It's probably not necessary as most + internal logging is only for debugging purposes. + + The passed-in object should be a logging.Logger instance. It will + be pushed into the keepalive and byterange modules if they're + being used. The mirror module pulls this object in on import, so + you will need to manually push into it. In fact, you may find it + tidier to simply push your logging object (or objects) into each + of these modules independently. + """ + + global DEBUG + DEBUG = DBOBJ + +def _init_default_logger(logspec=None): + '''Examines the environment variable URLGRABBER_DEBUG and creates + a logging object (logging.logger) based on the contents. It takes + the form + + URLGRABBER_DEBUG=level,filename + + where "level" can be either an integer or a log level from the + logging module (DEBUG, INFO, etc). If the integer is zero or + less, logging will be disabled. Filename is the filename where + logs will be sent. If it is "-", then stdout will be used. If + the filename is empty or missing, stderr will be used. If the + variable cannot be processed or the logging module cannot be + imported (python < 2.3) then logging will be disabled. Here are + some examples: + + URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt + URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout + URLGRABBER_DEBUG=INFO # log info and higher to stderr + + This funtion is called during module initialization. It is not + intended to be called from outside. The only reason it is a + function at all is to keep the module-level namespace tidy and to + collect the code into a nice block.''' + + try: + if logspec is None: + logspec = os.environ['URLGRABBER_DEBUG'] + dbinfo = logspec.split(',') + import logging + level = logging._levelNames.get(dbinfo[0], None) + if level is None: level = int(dbinfo[0]) + if level < 1: raise ValueError() + + formatter = logging.Formatter('%(asctime)s %(message)s') + if len(dbinfo) > 1: filename = dbinfo[1] + else: filename = '' + if filename == '': handler = logging.StreamHandler(sys.stderr) + elif filename == '-': handler = logging.StreamHandler(sys.stdout) + else: handler = logging.FileHandler(filename) + handler.setFormatter(formatter) + DBOBJ = logging.getLogger('urlgrabber') + DBOBJ.addHandler(handler) + DBOBJ.setLevel(level) + except (KeyError, ImportError, ValueError): + DBOBJ = None + set_logger(DBOBJ) + +def _log_package_state(): + if not DEBUG: return + DEBUG.info('urlgrabber version = %s' % __version__) + DEBUG.info('trans function "_" = %s' % _) + +_init_default_logger() +_log_package_state() + + +# normally this would be from i18n or something like it ... 
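+# A minimal sketch of how a caller could swap in real gettext (the domain
+# name 'urlgrabber' below is an illustrative assumption, not something this
+# module sets up):
+#
+#     import gettext
+#     t = gettext.translation('urlgrabber', fallback=True)
+#     _ = t.gettext
+#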
+def _(st): + return st + +######################################################################## +# END MODULE INITIALIZATION +######################################################################## + + + +class URLGrabError(IOError): + """ + URLGrabError error codes: + + URLGrabber error codes (0 -- 255) + 0 - everything looks good (you should never see this) + 1 - malformed url + 2 - local file doesn't exist + 3 - request for non-file local file (dir, etc) + 4 - IOError on fetch + 5 - OSError on fetch + 6 - no content length header when we expected one + 7 - HTTPException + 8 - Exceeded read limit (for urlread) + 9 - Requested byte range not satisfiable. + 10 - Byte range requested, but range support unavailable + 11 - Illegal reget mode + 12 - Socket timeout + 13 - malformed proxy url + 14 - HTTPError (includes .code and .exception attributes) + 15 - user abort + 16 - error writing to local file + + MirrorGroup error codes (256 -- 511) + 256 - No more mirrors left to try + + Custom (non-builtin) classes derived from MirrorGroup (512 -- 767) + [ this range reserved for application-specific error codes ] + + Retry codes (< 0) + -1 - retry the download, unknown reason + + Note: to test which group a code is in, you can simply do integer + division by 256: e.errno / 256 + + Negative codes are reserved for use by functions passed in to + retrygrab with checkfunc. The value -1 is built in as a generic + retry code and is already included in the retrycodes list. + Therefore, you can create a custom check function that simply + returns -1 and the fetch will be re-tried. For more customized + retries, you can use other negative number and include them in + retry-codes. This is nice for outputting useful messages about + what failed. + + You can use these error codes like so: + try: urlgrab(url) + except URLGrabError, e: + if e.errno == 3: ... + # or + print e.strerror + # or simply + print e #### print '[Errno %i] %s' % (e.errno, e.strerror) + """ + def __init__(self, *args): + IOError.__init__(self, *args) + self.url = "No url specified" + +class CallbackObject: + """Container for returned callback data. + + This is currently a dummy class into which urlgrabber can stuff + information for passing to callbacks. This way, the prototype for + all callbacks is the same, regardless of the data that will be + passed back. Any function that accepts a callback function as an + argument SHOULD document what it will define in this object. + + It is possible that this class will have some greater + functionality in the future. + """ + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + +def urlgrab(url, filename=None, **kwargs): + """grab the file at <url> and make a local copy at <filename> + If filename is none, the basename of the url is used. + urlgrab returns the filename of the local file, which may be different + from the passed-in filename if the copy_local kwarg == 0. + + See module documentation for a description of possible kwargs. + """ + return default_grabber.urlgrab(url, filename, **kwargs) + +def urlopen(url, **kwargs): + """open the url and return a file object + If a progress object or throttle specifications exist, then + a special file object will be returned that supports them. + The file object can be treated like any other file object. + + See module documentation for a description of possible kwargs. 
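+
+    A short usage sketch (the url is illustrative):
+
+        fo = urlopen('http://example.com/file.txt')
+        data = fo.read()
+        fo.close()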
+ """ + return default_grabber.urlopen(url, **kwargs) + +def urlread(url, limit=None, **kwargs): + """read the url into a string, up to 'limit' bytes + If the limit is exceeded, an exception will be thrown. Note that urlread + is NOT intended to be used as a way of saying "I want the first N bytes" + but rather 'read the whole file into memory, but don't use too much' + + See module documentation for a description of possible kwargs. + """ + return default_grabber.urlread(url, limit, **kwargs) + + +class URLParser: + """Process the URLs before passing them to urllib2. + + This class does several things: + + * add any prefix + * translate a "raw" file to a proper file: url + * handle any http or https auth that's encoded within the url + * quote the url + + Only the "parse" method is called directly, and it calls sub-methods. + + An instance of this class is held in the options object, which + means that it's easy to change the behavior by sub-classing and + passing the replacement in. It need only have a method like: + + url, parts = urlparser.parse(url, opts) + """ + + def parse(self, url, opts): + """parse the url and return the (modified) url and its parts + + Note: a raw file WILL be quoted when it's converted to a URL. + However, other urls (ones which come with a proper scheme) may + or may not be quoted according to opts.quote + + opts.quote = 1 --> quote it + opts.quote = 0 --> do not quote it + opts.quote = None --> guess + """ + quote = opts.quote + + if opts.prefix: + url = self.add_prefix(url, opts.prefix) + + parts = urlparse.urlparse(url) + (scheme, host, path, parm, query, frag) = parts + + if not scheme or (len(scheme) == 1 and scheme in string.letters): + # if a scheme isn't specified, we guess that it's "file:" + if url[0] not in '/\\': url = os.path.abspath(url) + url = 'file:' + urllib.pathname2url(url) + parts = urlparse.urlparse(url) + quote = 0 # pathname2url quotes, so we won't do it again + + if scheme in ['http', 'https']: + parts = self.process_http(parts, url) + + if quote is None: + quote = self.guess_should_quote(parts) + if quote: + parts = self.quote(parts) + + url = urlparse.urlunparse(parts) + return url, parts + + def add_prefix(self, url, prefix): + if prefix[-1] == '/' or url[0] == '/': + url = prefix + url + else: + url = prefix + '/' + url + return url + + def process_http(self, parts, url): + (scheme, host, path, parm, query, frag) = parts + # TODO: auth-parsing here, maybe? pycurl doesn't really need it + return (scheme, host, path, parm, query, frag) + + def quote(self, parts): + """quote the URL + + This method quotes ONLY the path part. If you need to quote + other parts, you should override this and pass in your derived + class. The other alternative is to quote other parts before + passing into urlgrabber. + """ + (scheme, host, path, parm, query, frag) = parts + path = urllib.quote(path) + return (scheme, host, path, parm, query, frag) + + hexvals = '0123456789ABCDEF' + def guess_should_quote(self, parts): + """ + Guess whether we should quote a path. This amounts to + guessing whether it's already quoted. 
+ + find ' ' -> 1 + find '%' -> 1 + find '%XX' -> 0 + else -> 1 + """ + (scheme, host, path, parm, query, frag) = parts + if ' ' in path: + return 1 + ind = string.find(path, '%') + if ind > -1: + while ind > -1: + if len(path) < ind+3: + return 1 + code = path[ind+1:ind+3].upper() + if code[0] not in self.hexvals or \ + code[1] not in self.hexvals: + return 1 + ind = string.find(path, '%', ind+1) + return 0 + return 1 + +class URLGrabberOptions: + """Class to ease kwargs handling.""" + + def __init__(self, delegate=None, **kwargs): + """Initialize URLGrabberOptions object. + Set default values for all options and then update options specified + in kwargs. + """ + self.delegate = delegate + if delegate is None: + self._set_defaults() + self._set_attributes(**kwargs) + + def __getattr__(self, name): + if self.delegate and hasattr(self.delegate, name): + return getattr(self.delegate, name) + raise AttributeError, name + + def raw_throttle(self): + """Calculate raw throttle value from throttle and bandwidth + values. + """ + if self.throttle <= 0: + return 0 + elif type(self.throttle) == type(0): + return float(self.throttle) + else: # throttle is a float + return self.bandwidth * self.throttle + + def derive(self, **kwargs): + """Create a derived URLGrabberOptions instance. + This method creates a new instance and overrides the + options specified in kwargs. + """ + return URLGrabberOptions(delegate=self, **kwargs) + + def _set_attributes(self, **kwargs): + """Update object attributes with those provided in kwargs.""" + self.__dict__.update(kwargs) + if kwargs.has_key('range'): + # normalize the supplied range value + self.range = range_tuple_normalize(self.range) + if not self.reget in [None, 'simple', 'check_timestamp']: + raise URLGrabError(11, _('Illegal reget mode: %s') \ + % (self.reget, )) + + def _set_defaults(self): + """Set all options to their default values. + When adding new options, make sure a default is + provided here. + """ + self.progress_obj = None + self.throttle = 1.0 + self.bandwidth = 0 + self.retry = None + self.retrycodes = [-1,2,4,5,6,7] + self.checkfunc = None + self.copy_local = 0 + self.close_connection = 0 + self.range = None + self.user_agent = 'urlgrabber/%s' % __version__ + self.keepalive = 1 + self.proxies = None + self.reget = None + self.failure_callback = None + self.interrupt_callback = None + self.prefix = None + self.opener = None + self.cache_openers = True + self.timeout = None + self.text = None + self.http_headers = None + self.ftp_headers = None + self.data = None + self.urlparser = URLParser() + self.quote = None + self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb + self.ssl_context = None # no-op in pycurl + self.ssl_verify_peer = True # check peer's cert for authenticityb + self.ssl_verify_host = True # make sure who they are and who the cert is for matches + self.ssl_key = None # client key + self.ssl_key_type = 'PEM' #(or DER) + self.ssl_cert = None # client cert + self.ssl_cert_type = 'PEM' # (or DER) + self.ssl_key_pass = None # password to access the key + self.size = None # if we know how big the thing we're getting is going + # to be. 
this is ultimately a MAXIMUM size for the file + self.max_header_size = 2097152 #2mb seems reasonable for maximum header size + + def __repr__(self): + return self.format() + + def format(self, indent=' '): + keys = self.__dict__.keys() + if self.delegate is not None: + keys.remove('delegate') + keys.sort() + s = '{\n' + for k in keys: + s = s + indent + '%-15s: %s,\n' % \ + (repr(k), repr(self.__dict__[k])) + if self.delegate: + df = self.delegate.format(indent + ' ') + s = s + indent + '%-15s: %s\n' % ("'delegate'", df) + s = s + indent + '}' + return s + +class URLGrabber: + """Provides easy opening of URLs with a variety of options. + + All options are specified as kwargs. Options may be specified when + the class is created and may be overridden on a per request basis. + + New objects inherit default values from default_grabber. + """ + + def __init__(self, **kwargs): + self.opts = URLGrabberOptions(**kwargs) + + def _retry(self, opts, func, *args): + tries = 0 + while 1: + # there are only two ways out of this loop. The second has + # several "sub-ways" + # 1) via the return in the "try" block + # 2) by some exception being raised + # a) an excepton is raised that we don't "except" + # b) a callback raises ANY exception + # c) we're not retry-ing or have run out of retries + # d) the URLGrabError code is not in retrycodes + # beware of infinite loops :) + tries = tries + 1 + exception = None + retrycode = None + callback = None + if DEBUG: DEBUG.info('attempt %i/%s: %s', + tries, opts.retry, args[0]) + try: + r = apply(func, (opts,) + args, {}) + if DEBUG: DEBUG.info('success') + return r + except URLGrabError, e: + exception = e + callback = opts.failure_callback + retrycode = e.errno + except KeyboardInterrupt, e: + exception = e + callback = opts.interrupt_callback + + if DEBUG: DEBUG.info('exception: %s', exception) + if callback: + if DEBUG: DEBUG.info('calling callback: %s', callback) + cb_func, cb_args, cb_kwargs = self._make_callback(callback) + obj = CallbackObject(exception=exception, url=args[0], + tries=tries, retry=opts.retry) + cb_func(obj, *cb_args, **cb_kwargs) + + if (opts.retry is None) or (tries == opts.retry): + if DEBUG: DEBUG.info('retries exceeded, re-raising') + raise + + if (retrycode is not None) and (retrycode not in opts.retrycodes): + if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising', + retrycode, opts.retrycodes) + raise + + def urlopen(self, url, **kwargs): + """open the url and return a file object + If a progress object or throttle value specified when this + object was created, then a special file object will be + returned that supports them. The file object can be treated + like any other file object. + """ + opts = self.opts.derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) + (url,parts) = opts.urlparser.parse(url, opts) + def retryfunc(opts, url): + return PyCurlFileObject(url, filename=None, opts=opts) + return self._retry(opts, retryfunc, url) + + def urlgrab(self, url, filename=None, **kwargs): + """grab the file at <url> and make a local copy at <filename> + If filename is none, the basename of the url is used. + urlgrab returns the filename of the local file, which may be + different from the passed-in filename if copy_local == 0. 
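+
+        A short usage sketch (url, filename and retry count are
+        illustrative):
+
+            g = URLGrabber(retry=3)
+            local = g.urlgrab('http://example.com/foo.gz', '/tmp/foo.gz')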
+ """ + opts = self.opts.derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) + (url,parts) = opts.urlparser.parse(url, opts) + (scheme, host, path, parm, query, frag) = parts + if filename is None: + filename = os.path.basename( urllib.unquote(path) ) + if scheme == 'file' and not opts.copy_local: + # just return the name of the local file - don't make a + # copy currently + path = urllib.url2pathname(path) + if host: + path = os.path.normpath('//' + host + path) + if not os.path.exists(path): + err = URLGrabError(2, + _('Local file does not exist: %s') % (path, )) + err.url = url + raise err + elif not os.path.isfile(path): + err = URLGrabError(3, + _('Not a normal file: %s') % (path, )) + err.url = url + raise err + + elif not opts.range: + if not opts.checkfunc is None: + cb_func, cb_args, cb_kwargs = \ + self._make_callback(opts.checkfunc) + obj = CallbackObject() + obj.filename = path + obj.url = url + apply(cb_func, (obj, )+cb_args, cb_kwargs) + return path + + def retryfunc(opts, url, filename): + fo = PyCurlFileObject(url, filename, opts) + try: + fo._do_grab() + if not opts.checkfunc is None: + cb_func, cb_args, cb_kwargs = \ + self._make_callback(opts.checkfunc) + obj = CallbackObject() + obj.filename = filename + obj.url = url + apply(cb_func, (obj, )+cb_args, cb_kwargs) + finally: + fo.close() + return filename + + return self._retry(opts, retryfunc, url, filename) + + def urlread(self, url, limit=None, **kwargs): + """read the url into a string, up to 'limit' bytes + If the limit is exceeded, an exception will be thrown. Note + that urlread is NOT intended to be used as a way of saying + "I want the first N bytes" but rather 'read the whole file + into memory, but don't use too much' + """ + opts = self.opts.derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) + (url,parts) = opts.urlparser.parse(url, opts) + if limit is not None: + limit = limit + 1 + + def retryfunc(opts, url, limit): + fo = PyCurlFileObject(url, filename=None, opts=opts) + s = '' + try: + # this is an unfortunate thing. Some file-like objects + # have a default "limit" of None, while the built-in (real) + # file objects have -1. They each break the other, so for + # now, we just force the default if necessary. + if limit is None: s = fo.read() + else: s = fo.read(limit) + + if not opts.checkfunc is None: + cb_func, cb_args, cb_kwargs = \ + self._make_callback(opts.checkfunc) + obj = CallbackObject() + obj.data = s + obj.url = url + apply(cb_func, (obj, )+cb_args, cb_kwargs) + finally: + fo.close() + return s + + s = self._retry(opts, retryfunc, url, limit) + if limit and len(s) > limit: + err = URLGrabError(8, + _('Exceeded limit (%i): %s') % (limit, url)) + err.url = url + raise err + + return s + + def _make_callback(self, callback_obj): + if callable(callback_obj): + return callback_obj, (), {} + else: + return callback_obj + +# create the default URLGrabber used by urlXXX functions. +# NOTE: actual defaults are set in URLGrabberOptions +default_grabber = URLGrabber() + + +class PyCurlFileObject(): + def __init__(self, url, filename, opts): + self.fo = None + self._hdr_dump = '' + self._parsed_hdr = None + self.url = url + self.scheme = urlparse.urlsplit(self.url)[0] + self.filename = filename + self.append = False + self.reget_time = None + self.opts = opts + if self.opts.reget == 'check_timestamp': + raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this." 
+ self._complete = False + self._rbuf = '' + self._rbufsize = 1024*8 + self._ttime = time.time() + self._tsize = 0 + self._amount_read = 0 + self._reget_length = 0 + self._prog_running = False + self._error = (None, None) + self.size = None + self._do_open() + + + def __getattr__(self, name): + """This effectively allows us to wrap at the instance level. + Any attribute not found in _this_ object will be searched for + in self.fo. This includes methods.""" + + if hasattr(self.fo, name): + return getattr(self.fo, name) + raise AttributeError, name + + def _retrieve(self, buf): + try: + if not self._prog_running: + if self.opts.progress_obj: + size = self.size + self._reget_length + self.opts.progress_obj.start(self._prog_reportname, + urllib.unquote(self.url), + self._prog_basename, + size=size, + text=self.opts.text) + self._prog_running = True + self.opts.progress_obj.update(self._amount_read) + + self._amount_read += len(buf) + self.fo.write(buf) + return len(buf) + except KeyboardInterrupt: + return -1 + + def _hdr_retrieve(self, buf): + if self._over_max_size(cur=len(self._hdr_dump), + max_size=self.opts.max_header_size): + return -1 + try: + self._hdr_dump += buf + # we have to get the size before we do the progress obj start + # but we can't do that w/o making it do 2 connects, which sucks + # so we cheat and stuff it in here in the hdr_retrieve + if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1: + length = buf.split(':')[1] + self.size = int(length) + elif self.scheme in ['ftp']: + s = None + if buf.startswith('213 '): + s = buf[3:].strip() + elif buf.startswith('150 '): + s = parse150(buf) + if s: + self.size = int(s) + + return len(buf) + except KeyboardInterrupt: + return pycurl.READFUNC_ABORT + + def _return_hdr_obj(self): + if self._parsed_hdr: + return self._parsed_hdr + statusend = self._hdr_dump.find('\n') + hdrfp = StringIO() + hdrfp.write(self._hdr_dump[statusend:]) + self._parsed_hdr = mimetools.Message(hdrfp) + return self._parsed_hdr + + hdr = property(_return_hdr_obj) + http_code = property(fget= + lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE)) + + def _set_opts(self, opts={}): + # XXX + if not opts: + opts = self.opts + + + # defaults we're always going to set + self.curl_obj.setopt(pycurl.NOPROGRESS, False) + self.curl_obj.setopt(pycurl.NOSIGNAL, True) + self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve) + self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve) + self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) + self.curl_obj.setopt(pycurl.FAILONERROR, True) + self.curl_obj.setopt(pycurl.OPT_FILETIME, True) + + if DEBUG: + self.curl_obj.setopt(pycurl.VERBOSE, True) + if opts.user_agent: + self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent) + + # maybe to be options later + self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) + self.curl_obj.setopt(pycurl.MAXREDIRS, 5) + + # timeouts + timeout = 300 + if opts.timeout: + timeout = int(opts.timeout) + self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout) + + # ssl options + if self.scheme == 'https': + if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs + self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert) + self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert) + self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer) + self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host) + if opts.ssl_key: + self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key) + if opts.ssl_key_type: + 
self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type) + if opts.ssl_cert: + self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert) + if opts.ssl_cert_type: + self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type) + if opts.ssl_key_pass: + self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass) + + #headers: + if opts.http_headers and self.scheme in ('http', 'https'): + headers = [] + for (tag, content) in opts.http_headers: + headers.append('%s:%s' % (tag, content)) + self.curl_obj.setopt(pycurl.HTTPHEADER, headers) + + # ranges: + if opts.range or opts.reget: + range_str = self._build_range() + if range_str: + self.curl_obj.setopt(pycurl.RANGE, range_str) + + # throttle/bandwidth + if hasattr(opts, 'raw_throttle') and opts.raw_throttle(): + self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle())) + + # proxy settings + if opts.proxies: + for (scheme, proxy) in opts.proxies.items(): + if self.scheme in ('ftp'): # only set the ftp proxy for ftp items + if scheme not in ('ftp'): + continue + else: + if proxy == '_none_': proxy = "" + self.curl_obj.setopt(pycurl.PROXY, proxy) + elif self.scheme in ('http', 'https'): + if scheme not in ('http', 'https'): + continue + else: + if proxy == '_none_': proxy = "" + self.curl_obj.setopt(pycurl.PROXY, proxy) + + # FIXME username/password/auth settings + + #posts - simple - expects the fields as they are + if opts.data: + self.curl_obj.setopt(pycurl.POST, True) + self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data)) + + # our url + self.curl_obj.setopt(pycurl.URL, self.url) + + + def _do_perform(self): + if self._complete: + return + + try: + self.curl_obj.perform() + except pycurl.error, e: + # XXX - break some of these out a bit more clearly + # to other URLGrabErrors from + # http://curl.haxx.se/libcurl/c/libcurl-errors.html + # this covers e.args[0] == 22 pretty well - which will be common + + code = self.http_code + errcode = e.args[0] + if self._error[0]: + errcode = self._error[0] + + if errcode == 23 and code >= 200 and code < 299: + err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) + err.url = self.url + + # this is probably wrong but ultimately this is what happens + # we have a legit http code and a pycurl 'writer failed' code + # which almost always means something aborted it from outside + # since we cannot know what it is -I'm banking on it being + # a ctrl-c. XXXX - if there's a way of going back two raises to + # figure out what aborted the pycurl process FIXME + raise KeyboardInterrupt + + elif errcode == 28: + err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) + err.url = self.url + raise err + elif errcode == 35: + msg = _("problem making ssl connection") + err = URLGrabError(14, msg) + err.url = self.url + raise err + elif errcode == 37: + msg = _("Could not open/read %s") % (self.url) + err = URLGrabError(14, msg) + err.url = self.url + raise err + + elif errcode == 42: + err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) + err.url = self.url + # this is probably wrong but ultimately this is what happens + # we have a legit http code and a pycurl 'writer failed' code + # which almost always means something aborted it from outside + # since we cannot know what it is -I'm banking on it being + # a ctrl-c. 
XXXX - if there's a way of going back two raises to + # figure out what aborted the pycurl process FIXME + raise KeyboardInterrupt + + elif errcode == 58: + msg = _("problem with the local client certificate") + err = URLGrabError(14, msg) + err.url = self.url + raise err + + elif errcode == 60: + msg = _("client cert cannot be verified or client cert incorrect") + err = URLGrabError(14, msg) + err.url = self.url + raise err + + elif errcode == 63: + if self._error[1]: + msg = self._error[1] + else: + msg = _("Max download size exceeded on %s") % (self.url) + err = URLGrabError(14, msg) + err.url = self.url + raise err + + elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it + msg = 'HTTP Error %s : %s ' % (self.http_code, self.url) + else: + msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1])) + code = errcode + err = URLGrabError(14, msg) + err.code = code + err.exception = e + raise err + + def _do_open(self): + self.curl_obj = _curl_cache + self.curl_obj.reset() # reset all old settings away, just in case + # setup any ranges + self._set_opts() + self._do_grab() + return self.fo + + def _add_headers(self): + pass + + def _build_range(self): + reget_length = 0 + rt = None + if self.opts.reget and type(self.filename) in types.StringTypes: + # we have reget turned on and we're dumping to a file + try: + s = os.stat(self.filename) + except OSError: + pass + else: + self.reget_time = s[stat.ST_MTIME] + reget_length = s[stat.ST_SIZE] + + # Set initial length when regetting + self._amount_read = reget_length + self._reget_length = reget_length # set where we started from, too + + rt = reget_length, '' + self.append = 1 + + if self.opts.range: + rt = self.opts.range + if rt[0]: rt = (rt[0] + reget_length, rt[1]) + + if rt: + header = range_tuple_to_header(rt) + if header: + return header.split('=')[1] + + + + def _make_request(self, req, opener): + #XXXX + # This doesn't do anything really, but we could use this + # instead of do_open() to catch a lot of crap errors as + # mstenner did before here + return (self.fo, self.hdr) + + try: + if self.opts.timeout: + old_to = socket.getdefaulttimeout() + socket.setdefaulttimeout(self.opts.timeout) + try: + fo = opener.open(req) + finally: + socket.setdefaulttimeout(old_to) + else: + fo = opener.open(req) + hdr = fo.info() + except ValueError, e: + err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, )) + err.url = self.url + raise err + + except RangeError, e: + err = URLGrabError(9, _('%s on %s') % (e, self.url)) + err.url = self.url + raise err + except urllib2.HTTPError, e: + new_e = URLGrabError(14, _('%s on %s') % (e, self.url)) + new_e.code = e.code + new_e.exception = e + new_e.url = self.url + raise new_e + except IOError, e: + if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout): + err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) + err.url = self.url + raise err + else: + err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e)) + err.url = self.url + raise err + + except OSError, e: + err = URLGrabError(5, _('%s on %s') % (e, self.url)) + err.url = self.url + raise err + + except HTTPException, e: + err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \ + (e.__class__.__name__, self.url, e)) + err.url = self.url + raise err + + else: + return (fo, hdr) + + def _do_grab(self): + """dump the file to a filename or StringIO buffer""" + + if self._complete: + return + _was_filename = False + if type(self.filename) in types.StringTypes and self.filename: + _was_filename = 
True
+            self._prog_reportname = str(self.filename)
+            self._prog_basename = os.path.basename(self.filename)
+
+            if self.append: mode = 'ab'
+            else: mode = 'wb'
+
+            if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+                (self.filename, mode))
+            try:
+                self.fo = open(self.filename, mode)
+            except IOError, e:
+                err = URLGrabError(16, _(\
+                  'error opening local file from %s, IOError: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
+        else:
+            self._prog_reportname = 'MEMORY'
+            self._prog_basename = 'MEMORY'
+
+            self.fo = StringIO()
+            # if this is to be a tempfile instead....
+            # it just makes crap in the tempdir
+            #fh, self._temp_name = mkstemp()
+            #self.fo = open(self._temp_name, 'wb')
+
+        self._do_perform()
+
+        if _was_filename:
+            # close it up
+            self.fo.flush()
+            self.fo.close()
+            # set the modification time of the local file to match the server
+            mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+            if mod_time != -1:
+                os.utime(self.filename, (mod_time, mod_time))
+            # re-open the finished download for reading
+            self.fo = open(self.filename, 'r')
+        else:
+            #self.fo = open(self._temp_name, 'r')
+            self.fo.seek(0)
+
+        self._complete = True
+
+    def _fill_buffer(self, amt=None):
+        """fill the buffer to contain at least 'amt' bytes by reading
+        from the underlying file object.  If amt is None, then it will
+        read until it gets nothing more.  It updates the progress meter
+        and throttles after every self._rbufsize bytes."""
+        # the _rbuf test is only in this first 'if' for speed.  It's not
+        # logically necessary
+        if self._rbuf and not amt is None:
+            L = len(self._rbuf)
+            if amt > L:
+                amt = amt - L
+            else:
+                return
+
+        # if we've made it here, then we don't have enough in the buffer
+        # and we need to read more.
+
+        if not self._complete: self._do_grab() #XXX cheater - change on ranges
+
+        buf = [self._rbuf]
+        bufsize = len(self._rbuf)
+        while amt is None or amt:
+            # first, delay if necessary for throttling reasons
+            if self.opts.raw_throttle():
+                diff = self._tsize/self.opts.raw_throttle() - \
+                       (time.time() - self._ttime)
+                if diff > 0: time.sleep(diff)
+                self._ttime = time.time()
+
+            # now read some data, up to self._rbufsize
+            if amt is None: readamount = self._rbufsize
+            else:           readamount = min(amt, self._rbufsize)
+            try:
+                new = self.fo.read(readamount)
+            except socket.error, e:
+                err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
+            except socket.timeout, e:
+                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
+            except IOError, e:
+                err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
+            newsize = len(new)
+            if not newsize: break # no more to read
+
+            if amt: amt = amt - newsize
+            buf.append(new)
+            bufsize = bufsize + newsize
+            self._tsize = newsize
+            self._amount_read = self._amount_read + newsize
+            #if self.opts.progress_obj:
+            #    self.opts.progress_obj.update(self._amount_read)
+
+        self._rbuf = string.join(buf, '')
+        return
+
+    def _progress_update(self, download_total, downloaded, upload_total, uploaded):
+        if self._over_max_size(cur=self._amount_read-self._reget_length):
+            return -1
+
+        try:
+            if self._prog_running:
+                downloaded += self._reget_length
+                self.opts.progress_obj.update(downloaded)
+        except KeyboardInterrupt:
+            return -1
+
+    def _over_max_size(self, cur, max_size=None):
+        if not max_size:
+            max_size = self.size
+        if self.opts.size: # if we set an opts size use that, no matter what
+            max_size = self.opts.size
+        if not max_size: return False # if we have None for all of the Max then this is
dumb + if cur > max_size + max_size*.10: + + msg = _("Downloaded more than max size for %s: %s > %s") \ + % (self.url, cur, max_size) + self._error = (pycurl.E_FILESIZE_EXCEEDED, msg) + return True + return False + + def _to_utf8(self, obj, errors='replace'): + '''convert 'unicode' to an encoded utf-8 byte string ''' + # stolen from yum.i18n + if isinstance(obj, unicode): + obj = obj.encode('utf-8', errors) + return obj + + def read(self, amt=None): + self._fill_buffer(amt) + if amt is None: + s, self._rbuf = self._rbuf, '' + else: + s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:] + return s + + def readline(self, limit=-1): + if not self._complete: self._do_grab() + return self.fo.readline() + + i = string.find(self._rbuf, '\n') + while i < 0 and not (0 < limit <= len(self._rbuf)): + L = len(self._rbuf) + self._fill_buffer(L + self._rbufsize) + if not len(self._rbuf) > L: break + i = string.find(self._rbuf, '\n', L) + + if i < 0: i = len(self._rbuf) + else: i = i+1 + if 0 <= limit < len(self._rbuf): i = limit + + s, self._rbuf = self._rbuf[:i], self._rbuf[i:] + return s + + def close(self): + if self._prog_running: + self.opts.progress_obj.end(self._amount_read) + self.fo.close() + + +_curl_cache = pycurl.Curl() # make one and reuse it over and over and over + + +##################################################################### +# DEPRECATED FUNCTIONS +def set_throttle(new_throttle): + """Deprecated. Use: default_grabber.throttle = new_throttle""" + default_grabber.throttle = new_throttle + +def set_bandwidth(new_bandwidth): + """Deprecated. Use: default_grabber.bandwidth = new_bandwidth""" + default_grabber.bandwidth = new_bandwidth + +def set_progress_obj(new_progress_obj): + """Deprecated. Use: default_grabber.progress_obj = new_progress_obj""" + default_grabber.progress_obj = new_progress_obj + +def set_user_agent(new_user_agent): + """Deprecated. Use: default_grabber.user_agent = new_user_agent""" + default_grabber.user_agent = new_user_agent + +def retrygrab(url, filename=None, copy_local=0, close_connection=0, + progress_obj=None, throttle=None, bandwidth=None, + numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None): + """Deprecated. 
Use: urlgrab() with the retry arg instead""" + kwargs = {'copy_local' : copy_local, + 'close_connection' : close_connection, + 'progress_obj' : progress_obj, + 'throttle' : throttle, + 'bandwidth' : bandwidth, + 'retry' : numtries, + 'retrycodes' : retrycodes, + 'checkfunc' : checkfunc + } + return urlgrab(url, filename, **kwargs) + + +##################################################################### +# TESTING +def _main_test(): + try: url, filename = sys.argv[1:3] + except ValueError: + print 'usage:', sys.argv[0], \ + '<url> <filename> [copy_local=0|1] [close_connection=0|1]' + sys.exit() + + kwargs = {} + for a in sys.argv[3:]: + k, v = string.split(a, '=', 1) + kwargs[k] = int(v) + + set_throttle(1.0) + set_bandwidth(32 * 1024) + print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle, + default_grabber.bandwidth) + + try: from progress import text_progress_meter + except ImportError, e: pass + else: kwargs['progress_obj'] = text_progress_meter() + + try: name = apply(urlgrab, (url, filename), kwargs) + except URLGrabError, e: print e + else: print 'LOCAL FILE:', name + + +def _retry_test(): + try: url, filename = sys.argv[1:3] + except ValueError: + print 'usage:', sys.argv[0], \ + '<url> <filename> [copy_local=0|1] [close_connection=0|1]' + sys.exit() + + kwargs = {} + for a in sys.argv[3:]: + k, v = string.split(a, '=', 1) + kwargs[k] = int(v) + + try: from progress import text_progress_meter + except ImportError, e: pass + else: kwargs['progress_obj'] = text_progress_meter() + + def cfunc(filename, hello, there='foo'): + print hello, there + import random + rnum = random.random() + if rnum < .5: + print 'forcing retry' + raise URLGrabError(-1, 'forcing retry') + if rnum < .75: + print 'forcing failure' + raise URLGrabError(-2, 'forcing immediate failure') + print 'success' + return + + kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'}) + try: name = apply(retrygrab, (url, filename), kwargs) + except URLGrabError, e: print e + else: print 'LOCAL FILE:', name + +def _file_object_test(filename=None): + import cStringIO + if filename is None: + filename = __file__ + print 'using file "%s" for comparisons' % filename + fo = open(filename) + s_input = fo.read() + fo.close() + + for testfunc in [_test_file_object_smallread, + _test_file_object_readall, + _test_file_object_readline, + _test_file_object_readlines]: + fo_input = cStringIO.StringIO(s_input) + fo_output = cStringIO.StringIO() + wrapper = PyCurlFileObject(fo_input, None, 0) + print 'testing %-30s ' % testfunc.__name__, + testfunc(wrapper, fo_output) + s_output = fo_output.getvalue() + if s_output == s_input: print 'passed' + else: print 'FAILED' + +def _test_file_object_smallread(wrapper, fo_output): + while 1: + s = wrapper.read(23) + fo_output.write(s) + if not s: return + +def _test_file_object_readall(wrapper, fo_output): + s = wrapper.read() + fo_output.write(s) + +def _test_file_object_readline(wrapper, fo_output): + while 1: + s = wrapper.readline() + fo_output.write(s) + if not s: return + +def _test_file_object_readlines(wrapper, fo_output): + li = wrapper.readlines() + fo_output.write(string.join(li, '')) + +if __name__ == '__main__': + _main_test() + _retry_test() + _file_object_test('test') diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py new file mode 100644 index 0000000..dad410b --- /dev/null +++ b/urlgrabber/mirror.py @@ -0,0 +1,455 @@ +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# 
License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+#      Free Software Foundation, Inc.,
+#      59 Temple Place, Suite 330,
+#      Boston, MA  02111-1307  USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+
+"""Module for downloading files from a pool of mirrors
+
+DESCRIPTION
+
+  This module provides support for downloading files from a pool of
+  mirrors with configurable failover policies.  To a large extent, the
+  failover policy is chosen by using different classes derived from
+  the main class, MirrorGroup.
+
+  Instances of MirrorGroup (and cousins) act very much like URLGrabber
+  instances in that they have urlread, urlgrab, and urlopen methods.
+  They can, therefore, be used in very similar ways.
+
+    from urlgrabber.grabber import URLGrabber
+    from urlgrabber.mirror import MirrorGroup
+    gr = URLGrabber()
+    mg = MirrorGroup(gr, ['http://foo.com/some/directory/',
+                          'http://bar.org/maybe/somewhere/else/',
+                          'ftp://baz.net/some/other/place/entirely/'])
+    mg.urlgrab('relative/path.zip')
+
+  The assumption is that all mirrors are identical AFTER the base urls
+  specified, so that any mirror can be used to fetch any file.
+
+FAILOVER
+
+  The failover mechanism is designed to be customized by subclassing
+  from MirrorGroup to change the details of the behavior.  In general,
+  the classes maintain a master mirror list and a "current mirror"
+  index.  When a download is initiated, a copy of this list and index
+  is created for that download only.  The specific failover policy
+  depends on the class used, and so is documented in the class
+  documentation.  Note that ANY behavior of the class can be
+  overridden, so any failover policy at all is possible (although
+  you may need to change the interface in extreme cases).
+
+CUSTOMIZATION
+
+  Most customization of a MirrorGroup object is done at instantiation
+  time (or via subclassing).  There are four major types of
+  customization:
+
+    1) Pass in a custom urlgrabber - The passed in urlgrabber will be
+       used (by default... see #2) for the grabs, so options to it
+       apply for the url-fetching
+
+    2) Custom mirror list - Mirror lists can simply be a list of
+       mirror strings (as shown in the example above) but each can
+       also be a dict, allowing for more options.  For example, the
+       first mirror in the list above could also have been:
+
+         {'mirror': 'http://foo.com/some/directory/',
+          'grabber': <a custom grabber to be used for this mirror>,
+          'kwargs': { <a dict of arguments passed to the grabber> }}
+
+       All mirrors are converted to this format internally.  If
+       'grabber' is omitted, the default grabber will be used.  If
+       kwargs are omitted, then (duh) they will not be used.
+
+    3) Pass keyword arguments when instantiating the mirror group.
+       See, for example, the failure_callback argument.
+
+    4) Finally, any kwargs passed in for the specific file (to the
+       urlgrab method, for example) will be folded in.  The options
+       passed into the grabber's urlXXX methods will override any
+       options specified in a custom mirror dict, as sketched after
+       this list.
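+
+  As a sketch of (2) and (4) combined (urls and option values are
+  illustrative):
+
+    mg = MirrorGroup(gr, [{'mirror': 'http://foo.com/some/directory/',
+                           'kwargs': {'retry': 3}}])
+    mg.urlgrab('relative/path.zip', retry=5)  # per-call retry wins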
+ +""" + + +import random +import thread # needed for locking to make this threadsafe + +from grabber import URLGrabError, CallbackObject, DEBUG + +def _(st): + return st + +class GrabRequest: + """This is a dummy class used to hold information about the specific + request. For example, a single file. By maintaining this information + separately, we can accomplish two things: + + 1) make it a little easier to be threadsafe + 2) have request-specific parameters + """ + pass + +class MirrorGroup: + """Base Mirror class + + Instances of this class are built with a grabber object and a list + of mirrors. Then all calls to urlXXX should be passed relative urls. + The requested file will be searched for on the first mirror. If the + grabber raises an exception (possibly after some retries) then that + mirror will be removed from the list, and the next will be attempted. + If all mirrors are exhausted, then an exception will be raised. + + MirrorGroup has the following failover policy: + + * downloads begin with the first mirror + + * by default (see default_action below) a failure (after retries) + causes it to increment the local AND master indices. Also, + the current mirror is removed from the local list (but NOT the + master list - the mirror can potentially be used for other + files) + + * if the local list is ever exhausted, a URLGrabError will be + raised (errno=256, no more mirrors) + + OPTIONS + + In addition to the required arguments "grabber" and "mirrors", + MirrorGroup also takes the following optional arguments: + + default_action + + A dict that describes the actions to be taken upon failure + (after retries). default_action can contain any of the + following keys (shown here with their default values): + + default_action = {'increment': 1, + 'increment_master': 1, + 'remove': 1, + 'remove_master': 0, + 'fail': 0} + + In this context, 'increment' means "use the next mirror" and + 'remove' means "never use this mirror again". The two + 'master' values refer to the instance-level mirror list (used + for all files), whereas the non-master values refer to the + current download only. + + The 'fail' option will cause immediate failure by re-raising + the exception and no further attempts to get the current + download. + + This dict can be set at instantiation time, + mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) + at method-execution time (only applies to current fetch), + filename = mg.urlgrab(url, default_action={'increment': 0}) + or by returning an action dict from the failure_callback + return {'fail':0} + in increasing precedence. + + If all three of these were done, the net result would be: + {'increment': 0, # set in method + 'increment_master': 1, # class default + 'remove': 1, # class default + 'remove_master': 0, # class default + 'fail': 0} # set at instantiation, reset + # from callback + + failure_callback + + this is a callback that will be called when a mirror "fails", + meaning the grabber raises some URLGrabError. If this is a + tuple, it is interpreted to be of the form (cb, args, kwargs) + where cb is the actual callable object (function, method, + etc). Otherwise, it is assumed to be the callable object + itself. The callback will be passed a grabber.CallbackObject + instance along with args and kwargs (if present). 
The following + attributes are defined withing the instance: + + obj.exception = < exception that was raised > + obj.mirror = < the mirror that was tried > + obj.relative_url = < url relative to the mirror > + obj.url = < full url that failed > + # .url is just the combination of .mirror + # and .relative_url + + The failure callback can return an action dict, as described + above. + + Like default_action, the failure_callback can be set at + instantiation time or when the urlXXX method is called. In + the latter case, it applies only for that fetch. + + The callback can re-raise the exception quite easily. For + example, this is a perfectly adequate callback function: + + def callback(obj): raise obj.exception + + WARNING: do not save the exception object (or the + CallbackObject instance). As they contain stack frame + references, they can lead to circular references. + + Notes: + * The behavior can be customized by deriving and overriding the + 'CONFIGURATION METHODS' + * The 'grabber' instance is kept as a reference, not copied. + Therefore, the grabber instance can be modified externally + and changes will take effect immediately. + """ + + # notes on thread-safety: + + # A GrabRequest should never be shared by multiple threads because + # it's never saved inside the MG object and never returned outside it. + # therefore, it should be safe to access/modify grabrequest data + # without a lock. However, accessing the mirrors and _next attributes + # of the MG itself must be done when locked to prevent (for example) + # removal of the wrong mirror. + + ############################################################## + # CONFIGURATION METHODS - intended to be overridden to + # customize behavior + def __init__(self, grabber, mirrors, **kwargs): + """Initialize the MirrorGroup object. + + REQUIRED ARGUMENTS + + grabber - URLGrabber instance + mirrors - a list of mirrors + + OPTIONAL ARGUMENTS + + failure_callback - callback to be used when a mirror fails + default_action - dict of failure actions + + See the module-level and class level documentation for more + details. + """ + + # OVERRIDE IDEAS: + # shuffle the list to randomize order + self.grabber = grabber + self.mirrors = self._parse_mirrors(mirrors) + self._next = 0 + self._lock = thread.allocate_lock() + self.default_action = None + self._process_kwargs(kwargs) + + # if these values are found in **kwargs passed to one of the urlXXX + # methods, they will be stripped before getting passed on to the + # grabber + options = ['default_action', 'failure_callback'] + + def _process_kwargs(self, kwargs): + self.failure_callback = kwargs.get('failure_callback') + self.default_action = kwargs.get('default_action') + + def _parse_mirrors(self, mirrors): + parsed_mirrors = [] + for m in mirrors: + if type(m) == type(''): m = {'mirror': m} + parsed_mirrors.append(m) + return parsed_mirrors + + def _load_gr(self, gr): + # OVERRIDE IDEAS: + # shuffle gr list + self._lock.acquire() + gr.mirrors = list(self.mirrors) + gr._next = self._next + self._lock.release() + + def _get_mirror(self, gr): + # OVERRIDE IDEAS: + # return a random mirror so that multiple mirrors get used + # even without failures. + if not gr.mirrors: + raise URLGrabError(256, _('No more mirrors to try.')) + return gr.mirrors[gr._next] + + def _failure(self, gr, cb_obj): + # OVERRIDE IDEAS: + # inspect the error - remove=1 for 404, remove=2 for connection + # refused, etc. 
(this can also be done via + # the callback) + cb = gr.kw.get('failure_callback') or self.failure_callback + if cb: + if type(cb) == type( () ): + cb, args, kwargs = cb + else: + args, kwargs = (), {} + action = cb(cb_obj, *args, **kwargs) or {} + else: + action = {} + # XXXX - decide - there are two ways to do this + # the first is action-overriding as a whole - use the entire action + # or fall back on module level defaults + #action = action or gr.kw.get('default_action') or self.default_action + # the other is to fall through for each element in the action dict + a = dict(self.default_action or {}) + a.update(gr.kw.get('default_action', {})) + a.update(action) + action = a + self.increment_mirror(gr, action) + if action and action.get('fail', 0): raise + + def increment_mirror(self, gr, action={}): + """Tell the mirror object increment the mirror index + + This increments the mirror index, which amounts to telling the + mirror object to use a different mirror (for this and future + downloads). + + This is a SEMI-public method. It will be called internally, + and you may never need to call it. However, it is provided + (and is made public) so that the calling program can increment + the mirror choice for methods like urlopen. For example, with + urlopen, there's no good way for the mirror group to know that + an error occurs mid-download (it's already returned and given + you the file object). + + remove --- can have several values + 0 do not remove the mirror from the list + 1 remove the mirror for this download only + 2 remove the mirror permanently + + beware of remove=0 as it can lead to infinite loops + """ + badmirror = gr.mirrors[gr._next] + + self._lock.acquire() + try: + ind = self.mirrors.index(badmirror) + except ValueError: + pass + else: + if action.get('remove_master', 0): + del self.mirrors[ind] + elif self._next == ind and action.get('increment_master', 1): + self._next += 1 + if self._next >= len(self.mirrors): self._next = 0 + self._lock.release() + + if action.get('remove', 1): + del gr.mirrors[gr._next] + elif action.get('increment', 1): + gr._next += 1 + if gr._next >= len(gr.mirrors): gr._next = 0 + + if DEBUG: + grm = [m['mirror'] for m in gr.mirrors] + DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next) + selfm = [m['mirror'] for m in self.mirrors] + DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next) + + ##################################################################### + # NON-CONFIGURATION METHODS + # these methods are designed to be largely workhorse methods that + # are not intended to be overridden. 
That doesn't mean you can't;
+    # if you want to, feel free, but most things can be done
+    # by overriding the configuration methods :)
+
+    def _join_url(self, base_url, rel_url):
+        if base_url.endswith('/') or rel_url.startswith('/'):
+            return base_url + rel_url
+        else:
+            return base_url + '/' + rel_url
+
+    def _mirror_try(self, func, url, kw):
+        gr = GrabRequest()
+        gr.func = func
+        gr.url  = url
+        gr.kw   = dict(kw)
+        self._load_gr(gr)
+
+        for k in self.options:
+            try: del kw[k]
+            except KeyError: pass
+
+        while 1:
+            mirrorchoice = self._get_mirror(gr)
+            fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
+            kwargs = dict(mirrorchoice.get('kwargs', {}))
+            kwargs.update(kw)
+            grabber = mirrorchoice.get('grabber') or self.grabber
+            func_ref = getattr(grabber, func)
+            if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
+            try:
+                return func_ref( *(fullurl,), **kwargs )
+            except URLGrabError, e:
+                if DEBUG: DEBUG.info('MIRROR: failed')
+                obj = CallbackObject()
+                obj.exception = e
+                obj.mirror = mirrorchoice['mirror']
+                obj.relative_url = gr.url
+                obj.url = fullurl
+                self._failure(gr, obj)
+
+    def urlgrab(self, url, filename=None, **kwargs):
+        kw = dict(kwargs)
+        kw['filename'] = filename
+        func = 'urlgrab'
+        return self._mirror_try(func, url, kw)
+
+    def urlopen(self, url, **kwargs):
+        kw = dict(kwargs)
+        func = 'urlopen'
+        return self._mirror_try(func, url, kw)
+
+    def urlread(self, url, limit=None, **kwargs):
+        kw = dict(kwargs)
+        kw['limit'] = limit
+        func = 'urlread'
+        return self._mirror_try(func, url, kw)
+
+
+class MGRandomStart(MirrorGroup):
+    """A mirror group that starts at a random mirror in the list.
+
+    The behavior of this class is identical to MirrorGroup, except that
+    it starts at a random location in the mirror list.
+    """
+
+    def __init__(self, grabber, mirrors, **kwargs):
+        """Initialize the object
+
+        The arguments for initialization are the same as for MirrorGroup
+        """
+        MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
+        self._next = random.randrange(len(mirrors))
+
+class MGRandomOrder(MirrorGroup):
+    """A mirror group that uses mirrors in a random order.
+
+    The behavior of this class is identical to MirrorGroup, except that
+    it uses the mirrors in a random order.  Note that the order is set at
+    initialization time and fixed thereafter.  That is, it does not pick a
+    random mirror after each failure.
+    """
+
+    def __init__(self, grabber, mirrors, **kwargs):
+        """Initialize the object
+
+        The arguments for initialization are the same as for MirrorGroup
+        """
+        MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
+        random.shuffle(self.mirrors)
+
+if __name__ == '__main__':
+    pass
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
new file mode 100644
index 0000000..dd07c6a
--- /dev/null
+++ b/urlgrabber/progress.py
@@ -0,0 +1,755 @@
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko + + +import sys +import time +import math +import thread +import fcntl +import struct +import termios + +# Code from http://mail.python.org/pipermail/python-list/2000-May/033365.html +def terminal_width(fd=1): + """ Get the real terminal width """ + try: + buf = 'abcdefgh' + buf = fcntl.ioctl(fd, termios.TIOCGWINSZ, buf) + ret = struct.unpack('hhhh', buf)[1] + if ret == 0: + return 80 + # Add minimum too? + return ret + except: # IOError + return 80 + +_term_width_val = None +_term_width_last = None +def terminal_width_cached(fd=1, cache_timeout=1.000): + """ Get the real terminal width, but cache it for a bit. """ + global _term_width_val + global _term_width_last + + now = time.time() + if _term_width_val is None or (now - _term_width_last) > cache_timeout: + _term_width_val = terminal_width(fd) + _term_width_last = now + return _term_width_val + +class TerminalLine: + """ Help create dynamic progress bars, uses terminal_width_cached(). """ + + def __init__(self, min_rest=0, beg_len=None, fd=1, cache_timeout=1.000): + if beg_len is None: + beg_len = min_rest + self._min_len = min_rest + self._llen = terminal_width_cached(fd, cache_timeout) + if self._llen < beg_len: + self._llen = beg_len + self._fin = False + + def __len__(self): + """ Usable length for elements. """ + return self._llen - self._min_len + + def rest_split(self, fixed, elements=2): + """ After a fixed length, split the rest of the line length among + a number of different elements (default=2). """ + if self._llen < fixed: + return 0 + return (self._llen - fixed) / elements + + def add(self, element, full_len=None): + """ If there is room left in the line, above min_len, add element. + Note that as soon as one add fails all the rest will fail too. """ + + if full_len is None: + full_len = len(element) + if len(self) < full_len: + self._fin = True + if self._fin: + return '' + + self._llen -= len(element) + return element + + def rest(self): + """ Current rest of line, same as .rest_split(fixed=0, elements=1). 
""" + return self._llen + +class BaseMeter: + def __init__(self): + self.update_period = 0.3 # seconds + + self.filename = None + self.url = None + self.basename = None + self.text = None + self.size = None + self.start_time = None + self.last_amount_read = 0 + self.last_update_time = None + self.re = RateEstimator() + + def start(self, filename=None, url=None, basename=None, + size=None, now=None, text=None): + self.filename = filename + self.url = url + self.basename = basename + self.text = text + + #size = None ######### TESTING + self.size = size + if not size is None: self.fsize = format_number(size) + 'B' + + if now is None: now = time.time() + self.start_time = now + self.re.start(size, now) + self.last_amount_read = 0 + self.last_update_time = now + self._do_start(now) + + def _do_start(self, now=None): + pass + + def update(self, amount_read, now=None): + # for a real gui, you probably want to override and put a call + # to your mainloop iteration function here + if now is None: now = time.time() + if (now >= self.last_update_time + self.update_period) or \ + not self.last_update_time: + self.re.update(amount_read, now) + self.last_amount_read = amount_read + self.last_update_time = now + self._do_update(amount_read, now) + + def _do_update(self, amount_read, now=None): + pass + + def end(self, amount_read, now=None): + if now is None: now = time.time() + self.re.update(amount_read, now) + self.last_amount_read = amount_read + self.last_update_time = now + self._do_end(amount_read, now) + + def _do_end(self, amount_read, now=None): + pass + +# This is kind of a hack, but progress is gotten from grabber which doesn't +# know about the total size to download. So we do this so we can get the data +# out of band here. This will be "fixed" one way or anther soon. +_text_meter_total_size = 0 +_text_meter_sofar_size = 0 +def text_meter_total_size(size, downloaded=0): + global _text_meter_total_size + global _text_meter_sofar_size + _text_meter_total_size = size + _text_meter_sofar_size = downloaded + +# +# update: No size (minimal: 17 chars) +# ----------------------------------- +# <text> <rate> | <current size> <elapsed time> +# 8-48 1 8 3 6 1 9 5 +# +# Order: 1. <text>+<current size> (17) +# 2. +<elapsed time> (10, total: 27) +# 3. + ( 5, total: 32) +# 4. +<rate> ( 9, total: 41) +# +# update: Size, Single file +# ------------------------- +# <text> <pc> <bar> <rate> | <current size> <eta time> ETA +# 8-25 1 3-4 1 6-16 1 8 3 6 1 9 1 3 1 +# +# Order: 1. <text>+<current size> (17) +# 2. +<eta time> (10, total: 27) +# 3. +ETA ( 5, total: 32) +# 4. +<pc> ( 4, total: 36) +# 5. +<rate> ( 9, total: 45) +# 6. +<bar> ( 7, total: 52) +# +# update: Size, All files +# ----------------------- +# <text> <total pc> <pc> <bar> <rate> | <current size> <eta time> ETA +# 8-22 1 5-7 1 3-4 1 6-12 1 8 3 6 1 9 1 3 1 +# +# Order: 1. <text>+<current size> (17) +# 2. +<eta time> (10, total: 27) +# 3. +ETA ( 5, total: 32) +# 4. +<total pc> ( 5, total: 37) +# 4. +<pc> ( 4, total: 41) +# 5. +<rate> ( 9, total: 50) +# 6. +<bar> ( 7, total: 57) +# +# end +# --- +# <text> | <current size> <elapsed time> +# 8-56 3 6 1 9 5 +# +# Order: 1. <text> ( 8) +# 2. +<current size> ( 9, total: 17) +# 3. +<elapsed time> (10, total: 27) +# 4. 
+
+#
+# update: No size (minimal: 17 chars)
+# -----------------------------------
+# <text>                          <rate> | <current size> <elapsed time>
+#  8-48                          1     8 3              6 1            9 5
+#
+# Order: 1. <text>+<current size>   (17)
+#        2. +<elapsed time>         (10, total: 27)
+#        3. +                       ( 5, total: 32)
+#        4. +<rate>                 ( 9, total: 41)
+#
+# update: Size, Single file
+# -------------------------
+# <text>                <pc>  <bar>  <rate> | <current size> <eta time> ETA
+#  8-25                1 3-4 1 6-16 1     8 3              6 1        9 1 3 1
+#
+# Order: 1. <text>+<current size>   (17)
+#        2. +<eta time>             (10, total: 27)
+#        3. +ETA                    ( 5, total: 32)
+#        4. +<pc>                   ( 4, total: 36)
+#        5. +<rate>                 ( 9, total: 45)
+#        6. +<bar>                  ( 7, total: 52)
+#
+# update: Size, All files
+# -----------------------
+# <text>     <total pc> <pc>  <bar>  <rate> | <current size> <eta time> ETA
+#  8-22     1   5-7    1 3-4 1 6-12 1     8 3              6 1        9 1 3 1
+#
+# Order: 1. <text>+<current size>   (17)
+#        2. +<eta time>             (10, total: 27)
+#        3. +ETA                    ( 5, total: 32)
+#        4. +<total pc>             ( 5, total: 37)
+#        5. +<pc>                   ( 4, total: 41)
+#        6. +<rate>                 ( 9, total: 50)
+#        7. +<bar>                  ( 7, total: 57)
+#
+# end
+# ---
+# <text>                         | <current size> <elapsed time>
+#  8-56                         3               6 1            9 5
+#
+# Order: 1. <text>                  ( 8)
+#        2. +<current size>         ( 9, total: 17)
+#        3. +<elapsed time>         (10, total: 27)
+#        4. +                       ( 5, total: 32)
+#
+
+class TextMeter(BaseMeter):
+    def __init__(self, fo=sys.stderr):
+        BaseMeter.__init__(self)
+        self.fo = fo
+
+    def _do_update(self, amount_read, now=None):
+        etime = self.re.elapsed_time()
+        fetime = format_time(etime)
+        fread = format_number(amount_read)
+        if self.text is not None:
+            text = self.text
+        else:
+            text = self.basename
+
+        ave_dl = format_number(self.re.average_rate())
+        sofar_size = None
+        if _text_meter_total_size:
+            sofar_size = _text_meter_sofar_size + amount_read
+            sofar_pc = (sofar_size * 100) / _text_meter_total_size
+
+        # Include text + ui_rate in minimal
+        tl = TerminalLine(8, 8+1+8)
+        ui_size = tl.add(' | %5sB' % fread)
+        if self.size is None:
+            ui_time = tl.add(' %9s' % fetime)
+            ui_end = tl.add(' ' * 5)
+            ui_rate = tl.add(' %5sB/s' % ave_dl)
+            out = '%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+                                        ui_rate, ui_size, ui_time, ui_end)
+        else:
+            rtime = self.re.remaining_time()
+            frtime = format_time(rtime)
+            frac = self.re.fraction_read()
+
+            ui_time = tl.add(' %9s' % frtime)
+            ui_end = tl.add(' ETA ')
+
+            if sofar_size is None:
+                ui_sofar_pc = ''
+            else:
+                ui_sofar_pc = tl.add(' (%i%%)' % sofar_pc,
+                                     full_len=len(" (100%)"))
+
+            ui_pc = tl.add(' %2i%%' % (frac*100))
+            ui_rate = tl.add(' %5sB/s' % ave_dl)
+            # Make text grow a bit before we start growing the bar too
+            blen = 4 + tl.rest_split(8 + 8 + 4)
+            bar = '='*int(blen * frac)
+            if (blen * frac) - int(blen * frac) >= 0.5:
+                bar += '-'
+            ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar))
+            out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+                                              ui_sofar_pc, ui_pc, ui_bar,
+                                              ui_rate, ui_size, ui_time,
+                                              ui_end)
+
+        self.fo.write(out)
+        self.fo.flush()
+
+    def _do_end(self, amount_read, now=None):
+        global _text_meter_total_size
+        global _text_meter_sofar_size
+
+        total_time = format_time(self.re.elapsed_time())
+        total_size = format_number(amount_read)
+        if self.text is not None:
+            text = self.text
+        else:
+            text = self.basename
+
+        tl = TerminalLine(8)
+        ui_size = tl.add(' | %5sB' % total_size)
+        ui_time = tl.add(' %9s' % total_time)
+        not_done = self.size is not None and amount_read != self.size
+        if not_done:
+            ui_end = tl.add(' ... ')
+        else:
+            ui_end = tl.add(' ' * 5)
+
+        out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
+                                    ui_size, ui_time, ui_end)
+        self.fo.write(out)
+        self.fo.flush()
+
+        # Don't add size to the sofar size until we have all of it.
+        # If we don't have a size, then just pretend/hope we got all of it.
+        if not_done:
+            return
+
+        if _text_meter_total_size:
+            _text_meter_sofar_size += amount_read
+            if _text_meter_total_size <= _text_meter_sofar_size:
+                _text_meter_total_size = 0
+                _text_meter_sofar_size = 0
+
+text_progress_meter = TextMeter
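Given the out-of-band hack above, a caller that knows the whole batch is, say, 3 MB with 1 MB already downloaded would prime the module like this (sizes invented) before starting its TextMeters:

    text_meter_total_size(3 * 1024 * 1024, downloaded=1024 * 1024)
    # every TextMeter started after this also shows the batch-wide
    # percentage; _do_end() resets both globals once everything arrives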
+
+class MultiFileHelper(BaseMeter):
+    def __init__(self, master):
+        BaseMeter.__init__(self)
+        self.master = master
+
+    def _do_start(self, now):
+        self.master.start_meter(self, now)
+
+    def _do_update(self, amount_read, now):
+        # elapsed time since last update
+        self.master.update_meter(self, now)
+
+    def _do_end(self, amount_read, now):
+        self.ftotal_time = format_time(now - self.start_time)
+        self.ftotal_size = format_number(self.last_amount_read)
+        self.master.end_meter(self, now)
+
+    def failure(self, message, now=None):
+        self.master.failure_meter(self, message, now)
+
+    def message(self, message):
+        self.master.message_meter(self, message)
+
+class MultiFileMeter:
+    helperclass = MultiFileHelper
+    def __init__(self):
+        self.meters = []
+        self.in_progress_meters = []
+        self._lock = thread.allocate_lock()
+        self.update_period = 0.3 # seconds
+
+        self.numfiles = None
+        self.finished_files = 0
+        self.failed_files = 0
+        self.open_files = 0
+        self.total_size = None
+        self.failed_size = 0
+        self.start_time = None
+        self.finished_file_size = 0
+        self.last_update_time = None
+        self.re = RateEstimator()
+
+    def start(self, numfiles=None, total_size=None, now=None):
+        if now is None: now = time.time()
+        self.numfiles = numfiles
+        self.finished_files = 0
+        self.failed_files = 0
+        self.open_files = 0
+        self.total_size = total_size
+        self.failed_size = 0
+        self.start_time = now
+        self.finished_file_size = 0
+        self.last_update_time = now
+        self.re.start(total_size, now)
+        self._do_start(now)
+
+    def _do_start(self, now):
+        pass
+
+    def end(self, now=None):
+        if now is None: now = time.time()
+        self._do_end(now)
+
+    def _do_end(self, now):
+        pass
+
+    def lock(self): self._lock.acquire()
+    def unlock(self): self._lock.release()
+
+    ###########################################################
+    # child meter creation and destruction
+    def newMeter(self):
+        newmeter = self.helperclass(self)
+        self.meters.append(newmeter)
+        return newmeter
+
+    def removeMeter(self, meter):
+        self.meters.remove(meter)
+
+    ###########################################################
+    # child functions - these should only be called by helpers
+    def start_meter(self, meter, now):
+        if meter not in self.meters:
+            raise ValueError('attempt to use orphaned meter')
+        self._lock.acquire()
+        try:
+            if meter not in self.in_progress_meters:
+                self.in_progress_meters.append(meter)
+            self.open_files += 1
+        finally:
+            self._lock.release()
+        self._do_start_meter(meter, now)
+
+    def _do_start_meter(self, meter, now):
+        pass
+
+    def update_meter(self, meter, now):
+        if meter not in self.meters:
+            raise ValueError('attempt to use orphaned meter')
+        if (now >= self.last_update_time + self.update_period) or \
+               not self.last_update_time:
+            self.re.update(self._amount_read(), now)
+            self.last_update_time = now
+            self._do_update_meter(meter, now)
+
+    def _do_update_meter(self, meter, now):
+        pass
+
+    def end_meter(self, meter, now):
+        if meter not in self.meters:
+            raise ValueError('attempt to use orphaned meter')
+        self._lock.acquire()
+        try:
+            try: self.in_progress_meters.remove(meter)
+            except ValueError: pass
+            self.open_files -= 1
+            self.finished_files += 1
+            self.finished_file_size += meter.last_amount_read
+        finally:
+            self._lock.release()
+        self._do_end_meter(meter, now)
+
+    def _do_end_meter(self, meter, now):
+        pass
+
+    def failure_meter(self, meter, message, now):
+        if meter not in self.meters:
+            raise ValueError('attempt to use orphaned meter')
+        self._lock.acquire()
+        try:
+            try: self.in_progress_meters.remove(meter)
+            except ValueError: pass
+            self.open_files -= 1
+            self.failed_files += 1
+            if meter.size and self.failed_size is not None:
+                self.failed_size += meter.size
+            else:
+                self.failed_size = None
+        finally:
+            self._lock.release()
+        self._do_failure_meter(meter, message, now)
+
+    def _do_failure_meter(self, meter, message, now):
+        pass
+
+    def message_meter(self, meter, message):
+        pass
+
+    ########################################################
+    # internal functions
+    def _amount_read(self):
+        tot = self.finished_file_size
+        for m in self.in_progress_meters:
+            tot += m.last_amount_read
+        return tot
+
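The master/helper split works like this in practice (file names and sizes invented; TextMultiFileMeter, defined next, is the usual master):

    master = TextMultiFileMeter()
    master.start(numfiles=2, total_size=3000)
    m = master.newMeter()                  # a MultiFileHelper
    m.start('a.rpm', 'http://example.com/a.rpm', 'a.rpm', size=1000)
    m.update(500)                          # forwards to master.update_meter()
    m.end(1000)                            # folds into finished_file_size
    master.removeMeter(m)
    master.end()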
+
+class TextMultiFileMeter(MultiFileMeter):
+    def __init__(self, fo=sys.stderr):
+        self.fo = fo
+        MultiFileMeter.__init__(self)
+
+    # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
+    def _do_update_meter(self, meter, now):
+        self._lock.acquire()
+        try:
+            format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \
+                     "time: %8.8s/%8.8s"
+            df = self.finished_files
+            tf = self.numfiles or 1
+            pf = 100 * float(df)/tf + 0.49
+            dd = self.re.last_amount_read
+            td = self.total_size
+            pd = 100 * (self.re.fraction_read() or 0) + 0.49
+            dt = self.re.elapsed_time()
+            rt = self.re.remaining_time()
+            if rt is None: tt = None
+            else: tt = dt + rt
+
+            fdd = format_number(dd) + 'B'
+            ftd = format_number(td) + 'B'
+            fdt = format_time(dt, 1)
+            ftt = format_time(tt, 1)
+
+            out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
+            self.fo.write('\r' + out)
+            self.fo.flush()
+        finally:
+            self._lock.release()
+
+    def _do_end_meter(self, meter, now):
+        self._lock.acquire()
+        try:
+            format = "%-30.30s %6.6s %8.8s %9.9s"
+            fn = meter.basename
+            size = meter.last_amount_read
+            fsize = format_number(size) + 'B'
+            et = meter.re.elapsed_time()
+            fet = format_time(et, 1)
+            # et can be 0 for an instantaneous finish; 'et and' avoids
+            # a ZeroDivisionError in that case
+            frate = format_number(et and size / et) + 'B/s'
+
+            out = '%-79.79s' % (format % (fn, fsize, fet, frate))
+            self.fo.write('\r' + out + '\n')
+        finally:
+            self._lock.release()
+        self._do_update_meter(meter, now)
+
+    def _do_failure_meter(self, meter, message, now):
+        self._lock.acquire()
+        try:
+            format = "%-30.30s %6.6s %s"
+            fn = meter.basename
+            if type(message) in (type(''), type(u'')):
+                message = message.splitlines()
+            if not message: message = ['']
+            out = '%-79s' % (format % (fn, 'FAILED', message[0] or ''))
+            self.fo.write('\r' + out + '\n')
+            for m in message[1:]: self.fo.write('  ' + m + '\n')
+        finally:
+            # release before redrawing; _do_update_meter() re-acquires the
+            # lock and would deadlock if we still held it here
+            self._lock.release()
+        self._do_update_meter(meter, now)
+
+    def message_meter(self, meter, message):
+        pass
+
+    def _do_end(self, now):
+        self._do_update_meter(None, now)
+        self._lock.acquire()
+        try:
+            self.fo.write('\n')
+            self.fo.flush()
+        finally:
+            self._lock.release()
+
+######################################################################
+# support classes and functions
+
+class RateEstimator:
+    def __init__(self, timescale=5.0):
+        self.timescale = timescale
+
+    def start(self, total=None, now=None):
+        if now is None: now = time.time()
+        self.total = total
+        self.start_time = now
+        self.last_update_time = now
+        self.last_amount_read = 0
+        self.ave_rate = None
+
+    def update(self, amount_read, now=None):
+        if now is None: now = time.time()
+        if amount_read == 0:
+            # if we just started this file, all bets are off
+            self.last_update_time = now
+            self.last_amount_read = 0
+            self.ave_rate = None
+            return
+
+        #print 'times', now, self.last_update_time
+        time_diff = now - self.last_update_time
+        read_diff = amount_read - self.last_amount_read
+        # on the first update after a reget, read_diff is just the
+        # preexisting file size, so don't fold it into the rate
+        if self.last_amount_read:
+            self.last_update_time = now
+            self.ave_rate = self._temporal_rolling_ave(\
+                time_diff, read_diff, self.ave_rate, self.timescale)
+        self.last_amount_read = amount_read
+        #print 'results', time_diff, read_diff, self.ave_rate
+
+    #####################################################################
+    # result methods
+    def average_rate(self):
+        "get the average transfer rate (in bytes/second)"
+        return self.ave_rate
+
+    def elapsed_time(self):
+        "the time between the start of the transfer and the most recent update"
+        return self.last_update_time - self.start_time
+
+    def remaining_time(self):
+        "estimated time remaining"
+        if not self.ave_rate or not self.total: return None
+        return (self.total - self.last_amount_read) / self.ave_rate
+
+    def fraction_read(self):
+        """the fraction of the data that has been read
+        (can be None for unknown transfer size)"""
+        if self.total is None: return None
+        elif self.total == 0: return 1.0
+        else: return float(self.last_amount_read)/self.total
+
+    #########################################################################
+    # support methods
+    def _temporal_rolling_ave(self, time_diff, read_diff, last_ave, timescale):
+        """a temporal rolling average performs smooth averaging even when
+        updates come at irregular intervals.  This is performed by scaling
+        the "epsilon" according to the time since the last update.
+        Specifically, epsilon = time_diff / timescale
+
+        As a general rule, the average will take on a completely new value
+        after 'timescale' seconds."""
+        epsilon = time_diff / timescale
+        if epsilon > 1: epsilon = 1.0
+        return self._rolling_ave(time_diff, read_diff, last_ave, epsilon)
+
+    def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon):
+        """perform a "rolling average" iteration
+        a rolling average "folds" new data into an existing average with
+        some weight, epsilon.  epsilon must be between 0.0 and 1.0 (inclusive)
+        a value of 0.0 means only the old value (initial value) counts,
+        and a value of 1.0 means only the newest value is considered."""
+
+        try:
+            recent_rate = read_diff / time_diff
+        except ZeroDivisionError:
+            recent_rate = None
+        if last_ave is None: return recent_rate
+        elif recent_rate is None: return last_ave
+
+        # at this point, both last_ave and recent_rate are numbers
+        return epsilon * recent_rate + (1 - epsilon) * last_ave
+
+    def _round_remaining_time(self, rt, start_time=15.0):
+        """round the remaining time, depending on its size
+        If rt is between n*start_time and (n+1)*start_time round downward
+        to the nearest multiple of 2**n (for any counting number n).
+        If rt < start_time, round down to the nearest 1.
+        For example (for start_time = 15.0):
+         2.7  -> 2.0
+         25.2 -> 25.0
+         26.4 -> 26.0
+         35.3 -> 34.0
+         63.6 -> 60.0
+        """
+
+        # rt == 0 would also blow up in math.log() below
+        if rt <= 0: return 0.0
+        shift = int(math.log(rt/start_time)/math.log(2))
+        rt = int(rt)
+        if shift <= 0: return rt
+        return float(int(rt) >> shift << shift)
+
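A worked pass through the estimator (times and byte counts invented) shows the epsilon scaling: an update arriving 1 s after the previous one, with timescale=5.0, gets epsilon = 1/5 = 0.2:

    est = RateEstimator(timescale=5.0)
    est.start(total=1000, now=0.0)
    est.update(1, now=0.0)      # first chunk only seeds last_amount_read
    est.update(101, now=1.0)    # 100 B in 1 s -> average 100.0
    est.update(301, now=2.0)    # 0.2 * 200.0 + 0.8 * 100.0
    print est.average_rate()    # -> 120.0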
+def format_time(seconds, use_hours=0):
+    if seconds is None or seconds < 0:
+        if use_hours: return '--:--:--'
+        else: return '--:--'
+    else:
+        seconds = int(seconds)
+        minutes = seconds / 60
+        seconds = seconds % 60
+        if use_hours:
+            hours = minutes / 60
+            minutes = minutes % 60
+            return '%02i:%02i:%02i' % (hours, minutes, seconds)
+        else:
+            return '%02i:%02i' % (minutes, seconds)
+
+def format_number(number, SI=0, space=' '):
+    """Turn numbers into human-readable metric-like numbers"""
+    symbols = ['',  # (none)
+               'k', # kilo
+               'M', # mega
+               'G', # giga
+               'T', # tera
+               'P', # peta
+               'E', # exa
+               'Z', # zetta
+               'Y'] # yotta
+
+    if SI: step = 1000.0
+    else: step = 1024.0
+
+    thresh = 999
+    depth = 0
+    max_depth = len(symbols) - 1
+
+    # we want numbers between 0 and thresh, but don't exceed the length
+    # of our list.  In that event, the formatting will be screwed up,
+    # but it'll still show the right number.
+    while number > thresh and depth < max_depth:
+        depth = depth + 1
+        number = number / step
+
+    if type(number) == type(1) or type(number) == type(1L):
+        # it's an int or a long, which means it didn't get divided,
+        # which means it's already short enough
+        format = '%i%s%s'
+    elif number < 9.95:
+        # must use 9.95 for proper sizing.  For example, 9.99 will be
+        # rounded to 10.0 with the .1f format string (which is too long)
+        format = '%.1f%s%s'
+    else:
+        format = '%.0f%s%s'
+
+    return(format % (float(number or 0), space, symbols[depth]))
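A few sanity checks for the two helpers above (outputs verified by hand against the code):

    print format_time(90)            # '01:30'
    print format_time(3661, 1)       # '01:01:01'
    print format_number(512)         # '512 '  (no suffix, trailing space)
    print format_number(10000)       # '9.8 k' (binary steps by default)
    print format_number(10000, SI=1) # '10 k'  (decimal steps)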
+
+def _tst(fn, cur, tot, beg, size, *args):
+    tm = TextMeter()
+    text = "(%d/%d): %s" % (cur, tot, fn)
+    tm.start(fn, "http://www.example.com/path/to/fn/" + fn, fn, size,
+             text=text)
+    num = beg
+    off = 0
+    for (inc, delay) in args:
+        off += 1
+        while num < ((size * off) / len(args)):
+            num += inc
+            tm.update(num)
+            time.sleep(delay)
+    tm.end(size)
+
+if __name__ == "__main__":
+    # (1/2): subversion-1.4.4-7.x86_64.rpm      2.4 MB /  85 kB/s   00:28
+    # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm   924 kB / 106 kB/s   00:08
+    if len(sys.argv) >= 2 and sys.argv[1] == 'total':
+        text_meter_total_size(1000 + 10000 + 10000 + 1000000 + 1000000 +
+                              1000000 + 10000 + 10000 + 10000 + 1000000)
+    _tst("sm-1.0.0-1.fc8.i386.rpm", 1, 10, 0, 1000,
+         (10, 0.2), (10, 0.1), (100, 0.25))
+    _tst("s-1.0.1-1.fc8.i386.rpm", 2, 10, 0, 10000,
+         (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
+    _tst("m-1.0.1-2.fc8.i386.rpm", 3, 10, 5000, 10000,
+         (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
+    _tst("large-file-name-Foo-11.8.7-4.5.6.1.fc8.x86_64.rpm", 4, 10, 0, 1000000,
+         (1000, 0.2), (1000, 0.1), (10000, 0.1))
+    _tst("large-file-name-Foo2-11.8.7-4.5.6.2.fc8.x86_64.rpm", 5, 10,
+         500001, 1000000, (1000, 0.2), (1000, 0.1), (10000, 0.1))
+    _tst("large-file-name-Foo3-11.8.7-4.5.6.3.fc8.x86_64.rpm", 6, 10,
+         750002, 1000000, (1000, 0.2), (1000, 0.1), (10000, 0.1))
+    _tst("large-file-name-Foo4-10.8.7-4.5.6.1.fc8.x86_64.rpm", 7, 10, 0, 10000,
+         (100, 0.1))
+    _tst("large-file-name-Foo5-10.8.7-4.5.6.2.fc8.x86_64.rpm", 8, 10,
+         5001, 10000, (100, 0.1))
+    _tst("large-file-name-Foo6-10.8.7-4.5.6.3.fc8.x86_64.rpm", 9, 10,
+         7502, 10000, (1, 0.1))
+    _tst("large-file-name-Foox-9.8.7-4.5.6.1.fc8.x86_64.rpm", 10, 10,
+         0, 1000000, (10, 0.5),
+         (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
+         (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
+         (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
+         (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
+         (100000, 0.1), (1, 0.1))