From a58ee624924dac6c5a210c03896d8d2f760faccc Mon Sep 17 00:00:00 2001 From: Kim Kibum Date: Mon, 21 May 2012 17:48:53 +0900 Subject: Upload Tizen:Base source --- ChangeLog | 246 ++++ LICENSE | 458 +++++++ MANIFEST | 21 + PKG-INFO | 48 + README | 31 + TODO | 50 + makefile | 51 + packaging/python-urlgrabber.changes | 101 ++ packaging/python-urlgrabber.spec | 60 + packaging/urlgrabber-HEAD.patch | 142 +++ packaging/urlgrabber-libproxy-httponly.patch | 83 ++ scripts/urlgrabber | 329 +++++ setup.py | 45 + test/base_test_code.py | 33 + test/grabberperf.py | 137 ++ test/munittest.py | 934 ++++++++++++++ test/runtests.py | 60 + test/test_byterange.py | 162 +++ test/test_grabber.py | 607 +++++++++ test/test_mirror.py | 275 ++++ test/threading/batchgrabber.py | 110 ++ urlgrabber/__init__.py | 54 + urlgrabber/byterange.py | 463 +++++++ urlgrabber/grabber.py | 1730 ++++++++++++++++++++++++++ urlgrabber/mirror.py | 455 +++++++ urlgrabber/progress.py | 755 +++++++++++ 26 files changed, 7440 insertions(+) create mode 100644 ChangeLog create mode 100644 LICENSE create mode 100644 MANIFEST create mode 100644 PKG-INFO create mode 100644 README create mode 100644 TODO create mode 100644 makefile create mode 100644 packaging/python-urlgrabber.changes create mode 100644 packaging/python-urlgrabber.spec create mode 100644 packaging/urlgrabber-HEAD.patch create mode 100644 packaging/urlgrabber-libproxy-httponly.patch create mode 100644 scripts/urlgrabber create mode 100644 setup.py create mode 100644 test/base_test_code.py create mode 100644 test/grabberperf.py create mode 100644 test/munittest.py create mode 100644 test/runtests.py create mode 100644 test/test_byterange.py create mode 100644 test/test_grabber.py create mode 100644 test/test_mirror.py create mode 100644 test/threading/batchgrabber.py create mode 100644 urlgrabber/__init__.py create mode 100644 urlgrabber/byterange.py create mode 100644 urlgrabber/grabber.py create mode 100644 urlgrabber/mirror.py create mode 100644 
urlgrabber/progress.py diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..644fbdb --- /dev/null +++ b/ChangeLog @@ -0,0 +1,246 @@ +2009-09-25 Seth Vidal + + * urlgrabber/__init__.py: bump version to 3.9.1 + +2009-09-25 Seth Vidal + + * makefile: clean up everything in make clean + +2009-09-25 Seth Vidal + + * test/runtests.py, test/test_grabber.py, test/test_keepalive.py, + urlgrabber/__init__.py, urlgrabber/byterange.py, + urlgrabber/grabber.py, urlgrabber/keepalive.py, + urlgrabber/mirror.py, urlgrabber/progress.py, + urlgrabber/sslfactory.py: cleanup all the old urlgrabber urllib code + that's not being used delete sslfactory and keepalive fix up the + unittests to match the existing code + +2009-09-24 Seth Vidal + + * urlgrabber/grabber.py: update documentation for ssl options and + size/max_header_size options + +2009-09-23 Seth Vidal + + * urlgrabber/grabber.py: - fix the reget testcases (and regets in general) with the max size + check - make the errorcode more obvious when we go over the range - + obviously don't do the check if all of our max values are None (or + even 0 since that is a silly number for a Max) + +2009-09-22 Seth Vidal + + * urlgrabber/grabber.py: handle endless-data problems safely: "A + malicious server could cause libcurl to download an infinite amount + of data, potentially causing all of memory or disk to be filled. + Setting the CURLOPT_MAXFILESIZE_LARGE option is not sufficient to + guard against this. Instead, the app should monitor the amount of + data received within the write or progress callback and abort once + the limit is reached." had to restructure a good bit of the error + handling to do this but it works for both endless headers and + endless content. 
+ +2009-09-21 Seth Vidal + + * urlgrabber/grabber.py: make sure the value we get back from the + parse150 and other calls is converted to an int before we make it + 'size' rhbug: #524705 + +2009-09-02 Seth Vidal + + * urlgrabber/grabber.py: make file:// url not found msgs clearer and + hopefully fix a couple of ctrl-c issues. + +2009-08-27 Seth Vidal + + * urlgrabber/grabber.py: make proxy=_none_ properly disable all + proxies as per the docs + +2009-08-14 Seth Vidal + + * urlgrabber/grabber.py: - add full contingent of ssl options: - client keys - client + certs - capath/cainfo - client key passwords - client key and + cert types - verifypeer/verifyhost - add a number of common errors + to do_perform() - when an error is unknown, and doesn't make sense + report complete pycurl error code - when the filename is '' and not + None and we're doing a urlgrab() try to open the file anyway + rather than silently swallowing the data into a StringIO and + discarding it. + +2009-08-13 Seth Vidal + + * urlgrabber/grabber.py: add _to_utf8() method to pycurlfileobject + make sure postfield data is to_utf8'd before setting the option + otherwise pycurl is unhappy if the postfield data is a unicode + object instead of a string object. closes rh bug + https://bugzilla.redhat.com/show_bug.cgi?id=515797 + +2009-08-12 Seth Vidal + + * urlgrabber/grabber.py: initial pass at setting more advanced ssl + options. verify peer and verify host work as expected. + +2009-08-07 Seth Vidal + + * urlgrabber/grabber.py: keep from making tmpfiles all over /tmp on + any local file:// urlopen() by doing it in StringIO instead of + mkstemp(). Sort of fixes + https://bugzilla.redhat.com/show_bug.cgi?id=516178 + +2009-08-06 Seth Vidal + + * urlgrabber/grabber.py: - fix interrupt handler and document why keyboardinterrupt is going + to be so weird in pycurl - disable signals and make sure we don't + handle/intercept any in the pycurl code. - set 'check_timestamp' + regets as NotImplemented. 
The work around is multiple connections. + it is possible but not immediately useful since, afaict, NOTHING + uses the check_timestamp regets. + +2009-08-05 Seth Vidal + + * urlgrabber/grabber.py: - make sure regets work when our filename is unicode - make sure we + are not resetting self.append = False when we don't need to + +2009-08-05 Seth Vidal + + * urlgrabber/grabber.py: - make sure we tell pycurl to get the filetime when downloading - + set a couple of options as 'True/False' instead of 1,0 - for + readability - make sure the option passed to timeout is an int - not + a string + +2009-08-04 Seth Vidal + + * urlgrabber/grabber.py: missed setting the value from opts.timeout + - doesn't really HURT what will happen b/c if your connect takes + longer than 5 minutes then you're SCREWED + +2009-08-04 Seth Vidal + + * urlgrabber/grabber.py: handle timeouts more correctly (with the + exception) and set timeouts to be connect timeouts since libcurl + seems to actually honor timeouts - as opposed to urllib. 
closes rh + bug # 515497 + +2009-07-31 Seth Vidal + + * ChangeLog, makefile, urlgrabber/__init__.py: changelog + release + date touchup + +2009-07-31 Seth Vidal + + * makefile: add a few more things to be cleaned out + +2009-07-31 Seth Vidal + + * ChangeLog: update changelog + +2009-07-31 Seth Vidal + + * urlgrabber/grabber.py: - make readlines() work for mirrorlists in yum (which probably + shouldn't be using it anyway) - do a do_grab() in _do_open() which + may or may not be a good idea - I could also make the _do_grab() + happen when someone attempts to hit a method beyond the file object + open + +2009-07-30 Seth Vidal + + * urlgrabber/grabber.py: - make basic posts work + +2009-07-30 Seth Vidal + + * maint/git2cl: add git2cl + +2009-07-30 Seth Vidal + + * urlgrabber/grabber.py: when I first started this I hacked + something into URLGrabberFileObject - this reverts that hack + +2009-07-30 Seth Vidal + + * ChangeLog, maint/cvs2cl.pl, maint/usermap, test/runtests.py, + urlgrabber/__init__.py: - clean up some unused files - update the changelog - bump the + version - update the copyright in a couple of places + +2009-07-30 Seth Vidal + + * MANIFEST.in, makefile: - make makefile work again without using cvs - add makefile to + MANIFEST.in + +2009-07-30 Seth Vidal + + * urlgrabber/grabber.py: - make simple/most proxies work - remove unnecessary 'have_range' + check for pycurl obj + +2009-07-29 Seth Vidal + + * urlgrabber/grabber.py: - add range support - get rid of the .part file thing - it makes + range-regets harder than they need to be - make sure regets behave + +2009-07-29 Seth Vidal + + * urlgrabber/grabber.py: implement throttle/bandwidth controls in + pycurl tested with the progress call back - seems to work very well + + +2009-07-29 Seth Vidal + + * urlgrabber/grabber.py: get the content-length/size for ftp pkgs + too - steals parse150 from ftplib. 
Should work for A LOT of ftp + servers, but not all of them - add self.scheme for which protocol + we're using here. + +2009-07-29 James Antill + + * urlgrabber/byterange.py: Import fix for ftp ports in old urllib + code (probably worthless now, but meh) + +2009-07-29 James Antill + + * urlgrabber/progress.py: Import progress patches from Fedora. + These were done over a couple of years: . cleanup UI. . dynamic + terminal widths. . deal with serial console. . total download + stuff. + +2009-07-28 Seth Vidal + + * test/runtests.py, urlgrabber/grabber.py: implement + PyCurlFileObject. This makes the default and forklifts all the code + to pycurl. This is not finished but is functional for a significant + number of the tests. things known to be broken: - proxies - http + POST - non-header-based byte-ranges - certain types of read + operations when downloading a file to memory instead of to a + filename + +2009-05-15 Seth Vidal + + * urlgrabber/grabber.py: make it use *args instead of silly if + statements + +2009-05-15 Seth Vidal + + * urlgrabber/grabber.py: modify urlgraberror so it has a url + attribute and includes the url in all error messages. + +2006-12-12 mstenner + + * urlgrabber/grabber.py: more debugging code to expose options + +2006-12-08 mstenner + + * scripts/urlgrabber, test/test_grabber.py, urlgrabber/grabber.py, + urlgrabber/keepalive.py: lots of changes... improved clarity of + cached objects, improved debugging and logging, more options to the + urlgrabber script. + +2006-12-07 mstenner + + * scripts/urlgrabber, urlgrabber/grabber.py: Minor doc updates and + error handling in grabber.py. Complete rewrite of the urlgrabber + script. + +2006-12-05 mstenner + + * Minor fix to make byteranges work with some servers. _do_grab now + only reads as much as it needs to, rather than reading until the + server sends EOF. 
+ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3b20440 --- /dev/null +++ b/LICENSE @@ -0,0 +1,458 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. 
These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. 
+ + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. 
+ + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. 
+ + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. 
+ + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. 
+Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. 
Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. 
However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. 
However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. 
If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. 
+ + END OF TERMS AND CONDITIONS diff --git a/MANIFEST b/MANIFEST new file mode 100644 index 0000000..dd51f15 --- /dev/null +++ b/MANIFEST @@ -0,0 +1,21 @@ +ChangeLog +LICENSE +MANIFEST +README +TODO +makefile +setup.py +scripts/urlgrabber +test/base_test_code.py +test/grabberperf.py +test/munittest.py +test/runtests.py +test/test_byterange.py +test/test_grabber.py +test/test_mirror.py +test/threading/batchgrabber.py +urlgrabber/__init__.py +urlgrabber/byterange.py +urlgrabber/grabber.py +urlgrabber/mirror.py +urlgrabber/progress.py diff --git a/PKG-INFO b/PKG-INFO new file mode 100644 index 0000000..1368b10 --- /dev/null +++ b/PKG-INFO @@ -0,0 +1,48 @@ +Metadata-Version: 1.0 +Name: urlgrabber +Version: 3.9.1 +Summary: A high-level cross-protocol url-grabber +Home-page: http://linux.duke.edu/projects/urlgrabber/ +Author: Michael D. Stenner, Ryan Tomayko +Author-email: mstenner@linux.duke.edu, skvidal@fedoraproject.org +License: LGPL +Description: A high-level cross-protocol url-grabber. + + Using urlgrabber, data can be fetched in three basic ways: + + urlgrab(url) copy the file to the local filesystem + urlopen(url) open the remote file and return a file object + (like urllib2.urlopen) + urlread(url) return the contents of the file as a string + + When using these functions (or methods), urlgrabber supports the + following features: + + * identical behavior for http://, ftp://, and file:// urls + * http keepalive - faster downloads of many files by using + only a single connection + * byte ranges - fetch only a portion of the file + * reget - for a urlgrab, resume a partial download + * progress meters - the ability to report download progress + automatically, even when using urlopen! + * throttling - restrict bandwidth usage + * retries - automatically retry a download if it fails. The + number of retries and failure types are configurable. 
+ * authenticated server access for http and ftp + * proxy support - support for authenticated http and ftp proxies + * mirror groups - treat a list of mirrors as a single source, + automatically switching mirrors if there is a failure. + +Platform: UNKNOWN +Classifier: Development Status :: 4 - Beta +Classifier: Environment :: Console +Classifier: Environment :: Web Environment +Classifier: Intended Audience :: Developers +Classifier: Intended Audience :: System Administrators +Classifier: License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL) +Classifier: Operating System :: POSIX +Classifier: Operating System :: POSIX :: Linux +Classifier: Programming Language :: Python +Classifier: Topic :: Internet :: File Transfer Protocol (FTP) +Classifier: Topic :: Internet :: WWW/HTTP +Classifier: Topic :: Software Development :: Libraries :: Python Modules diff --git a/README b/README new file mode 100644 index 0000000..5fd378b --- /dev/null +++ b/README @@ -0,0 +1,31 @@ +urlgrabber -- A high-level cross-protocol url-grabber + +INSTALLATION INSTRUCTIONS + +If you want to install urlgrabber on your system, simply open the package +and run: + + python setup.py install + +Take a look at the install options by doing + + python setup.py install --help + +With no arguments, distutils will add all modules to a 'urlgrabber' +package under your python site-packages directory. + +You can build rpms by running + + python setup.py bdist_rpm + +The rpms (both source and "binary") will be specific to the current +distribution/version and may not be portable to others. This is +because they will be built for the currently installed python. + +keepalive.py and byterange.py are generic urllib2 extension modules and +can be used to add keepalive and range support to any urllib2 +application. + +As of version 2.9.x, urlgrabber is no longer tested with python +versions less than 2.2.
It will probably work with 2.0, but will +almost certainly NOT work under prior python versions. diff --git a/TODO b/TODO new file mode 100644 index 0000000..ad1dc8a --- /dev/null +++ b/TODO @@ -0,0 +1,50 @@ +ALPHA 2: + + * web page + - better examples page + + * threading/batch + - (rt) propose an interface for threaded batch downloads + - (mds) design a new progress-meter interface for threaded + multi-file downloads + - (rt) look at CacheFTPHandler and its implications for batch mode + and byte-ranges/reget + + * progress meter stuff + - support for retrying a file (in a MirrorGroup, for example) + - failure support (done?) + - support for when we have less information (no sizes, etc) + - check compatibility with gui interfaces + - starting a download with some parts already read (with reget, + for example) + + * look at making the 'check_timestamp' reget mode work with ftp. + Currently, we NEVER get a timestamp back, so we can't compare. + We'll probably need to subclass/replace either the urllib2 FTP handler + or the ftplib FTP object (or both, but I doubt it). It may or may not + be worth it just for this one mode of reget. It fails safely - by + getting the entire file. + + * cache dns lookups -- for a possible approach, see + https://lists.dulug.duke.edu/pipermail/yum-devel/2004-March/000136.html + +Misc/Maybe: + + * BatchURLGrabber/BatchMirrorGroup for concurrent downloads and possibly to + handle forking into secure/setuid sandbox. + + * Consider adding a progress_meter implementation that can be used in + concurrent download situations (I have some ideas about this -mds) + + * Consider using CacheFTPHandler instead of FTPHandler in byterange.py. + CacheFTPHandler reuses connections but this may lead to problems with + ranges. I've tested CacheFTPHandler with ranges using vsftpd as a + server and everything works fine but this needs more exhaustive tests + or a fallback mechanism. Also, CacheFTPHandler breaks with multiple + threads. 
+ + * Consider some statistics tracking so that urlgrabber can record the + speed/reliability of different servers. This could then be used by + the mirror code for choosing optimal servers (slick, eh?) + + * check SSL certs. This may require PyOpenSSL. diff --git a/makefile b/makefile new file mode 100644 index 0000000..caa0f9e --- /dev/null +++ b/makefile @@ -0,0 +1,51 @@ +PACKAGE = urlgrabber +RM = /bin/rm -rf +GIT = /usr/bin/git +WEBHOST = login.dulug.duke.edu +WEBPATH = /home/groups/urlgrabber/web/download +PYTHON = python +PY_MODULE = $(PACKAGE) +SCM_MODULE = $(PACKAGE) +CLEANFILES = MANIFEST *~ build dist export release daily reference nonexistent_file ChangeLog.bak \ + *.pyc urlgrabber/*.pyc scripts/*.pyc test/*.pyc test/nonexistent_file \ + test/reference test/reference.part urlgrabber/*~ +############################################################################## +VERSION = $(shell $(PYTHON) -c 'import $(PY_MODULE); print $(PY_MODULE).__version__') +DATE = $(shell $(PYTHON) -c 'import $(PY_MODULE); print $(PY_MODULE).__date__') +SCM_TAG = release-$(shell echo $(VERSION) | sed -e 's/\./_/g') +PYTHON22 = $(shell /usr/bin/which python2.2 2>/dev/null) +PYTHON23 = $(shell /usr/bin/which python2.3 2>/dev/null) +PYTHON24 = $(shell /usr/bin/which python2.4 2>/dev/null) +PYTHON25 = $(shell /usr/bin/which python2.5 2>/dev/null) +TESTPYTHONS = $(PYTHON22) $(PYTHON23) $(PYTHON24) $(PYTHON25) +############################################################################## + +default: + @echo TARGETS: changelog release clean test + +changelog: + $(GIT) log --since=2006-12-01 --pretty --numstat --summary | maint/git2cl > ChangeLog + +# NOTE: do --manifest-only first even though we're about to force it. The +# former ensures that MANIFEST exists (touch would also do the trick). 
If +# the file 'MANIFEST' doesn't exist, then it won't be included the next time +# it's built from MANIFEST.in +release: FORCE pre-release-test + @dir=$$PWD; $(PYTHON) setup.py sdist --manifest-only + @dir=$$PWD; $(PYTHON) setup.py sdist --force-manifest + @echo "The archive is in dist/${PACKAGE}-$(VERSION).tar.gz" + +pre-release-test: + @echo "You should make sure you've updated the changelog" + @echo "version = $(VERSION), date = $(DATE), tag = $(SCM_TAG)" + test $(DATE) = `date +'%Y/%m/%d'` # verify release date is set to today + +clean: + $(RM) $(CLEANFILES) + +test: FORCE + @export PYTHONPATH=.; \ + $(PYTHON) test/runtests.py -v 1; \ + +FORCE: + diff --git a/packaging/python-urlgrabber.changes b/packaging/python-urlgrabber.changes new file mode 100644 index 0000000..e81b8ac --- /dev/null +++ b/packaging/python-urlgrabber.changes @@ -0,0 +1,101 @@ +* Tue Sep 06 2011 William Douglas - 3.9.1 +- Fix missing prefix + +* Fri Mar 11 2011 Yan Li - 3.9.1 +- When libproxy fails, fall back to environment variables (BMC#13757) + +* Sat Jan 30 2010 Jian-feng Ding 3.9.1 +- Upgrade to 3.9.1 and enable spectacle + Dropped all old patches except David's libproxy one + Integrated the updated version of libproxy patch, found in: + https://bugzilla.redhat.com/show_bug.cgi?id=542224 + +* Sat Dec 26 2009 David Woodhouse 3.1.0 +- Use libproxy to find proxy information + +* Sun Apr 12 2009 Peter Zhu 3.1.0 +- Add dependency to m2crypto so that it can work with https connections; fixes bug #1289 + + +* Fri Feb 20 2009 Zhu Yanhai 3.1.0 +- Correct SOURCE +- Version update to 3.1.0 + +* Mon Apr 7 2008 James Antill 3.0.0-6 +- Fix the ftp byterange port problem: +- Resolves: bug#419241 +- Fixup the progress UI: +- add function for total progress +- add total progress percentage to current download line +- add rate to current download line +- use dead space when finished downloading +- don't confuse download rate on regets.
+ +* Sat Mar 15 2008 Robert Scheck 3.0.0-5 +- Make sure that *.egg-info is picked up during build + +* Mon Dec 3 2007 Jeremy Katz - 3.0.0-4 +- Ensure fds are closed on exceptions (markmc, #404211) + +* Wed Oct 10 2007 Jeremy Katz - 3.0.0-3 +- fix type checking of strings to also include unicode strings; fixes + regets from yum (#235618) + +* Mon Aug 27 2007 Jeremy Katz - 3.0.0-2 +- fixes for package review (#226347) + +* Thu May 31 2007 Jeremy Katz - 3.0.0-1 +- update to 3.0.0 + +* Wed Dec 6 2006 Jeremy Katz - 2.9.9-5 +- rebuild for python 2.5 + +* Wed Dec 6 2006 Jeremy Katz - 2.9.9-4 +- fix keepalive (#218268) + +* Sat Nov 11 2006 Florian La Roche +- add version/release to "Provides: urlgrabber" + +* Mon Jul 17 2006 James Bowes - 2.9.9-2 +- Add support for byte ranges and keepalive over HTTPS + +* Wed Jul 12 2006 Jesse Keating - 2.9.9-1.1 +- rebuild + +* Tue May 16 2006 Jeremy Katz - 2.9.9-1 +- update to 2.9.9 + +* Tue Mar 14 2006 Jeremy Katz - 2.9.8-2 +- catch read errors so they trigger the failure callback.
helps catch bad cds + +* Wed Feb 22 2006 Jeremy Katz - 2.9.8-1 +- update to new version fixing progress bars in yum on regets + +* Fri Dec 09 2005 Jesse Keating +- rebuilt + +* Wed Sep 21 2005 Jeremy Katz - 2.9.6-4 +- don't use --record and list files by hand so that we don't miss + directories (#158480) + +* Wed Sep 14 2005 Jeremy Katz - 2.9.6-3 +- add directory to file list (#168261) + +* Fri Jun 03 2005 Phil Knirsch 2.9.6-2 +- Fixed the reget method to actually work correctly (skip completely transferred + files, etc) + +* Tue Mar 8 2005 Jeremy Katz - 2.9.6-1 +- update to 2.9.6 + +* Mon Mar 7 2005 Jeremy Katz - 2.9.5-1 +- import into dist +- make the description less of a book + +* Mon Mar 7 2005 Seth Vidal 2.9.5-0 +- 2.9.5 + +* Thu Feb 24 2005 Seth Vidal 2.9.3-0 +- first package for fc3 +- named python-urlgrabber for naming guideline compliance + diff --git a/packaging/python-urlgrabber.spec b/packaging/python-urlgrabber.spec new file mode 100644 index 0000000..d7631fa --- /dev/null +++ b/packaging/python-urlgrabber.spec @@ -0,0 +1,60 @@ + +%{!?python_sitearch: %define python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print get_python_lib(1)")} +Name: python-urlgrabber +Summary: A high-level cross-protocol url-grabber +Version: 3.9.1 +Release: 1 +Group: Development/Libraries +License: LGPLv2+ +BuildArch: noarch +URL: http://urlgrabber.baseurl.org/ +Source0: http://urlgrabber.baseurl.org/download/urlgrabber-%{version}.tar.gz +Patch0: urlgrabber-HEAD.patch +Patch1: urlgrabber-libproxy-httponly.patch +Requires: python-pycurl +Requires: m2crypto +Requires: libproxy-python +BuildRequires: python-devel +BuildRequires: python-pycurl +Provides: urlgrabber = %{version}-%{release} + +BuildRoot: %{_tmppath}/%{name}-%{version}-build + +%description +A high-level cross-protocol url-grabber for python supporting HTTP, FTP +and file locations. Features include keepalive, byte ranges, throttling, +authentication, proxies and more.
+ + + + +%prep +%setup -q -n urlgrabber-%{version} +%patch0 -p1 +%patch1 -p1 + +%build + +CFLAGS="$RPM_OPT_FLAGS" %{__python} setup.py build + +%install +rm -rf %{buildroot} +%{__python} setup.py install --root=%{buildroot} -O1 --prefix=%{_prefix} + +rm -rf $RPM_BUILD_ROOT/%{_docdir}/urlgrabber-%{version} + +%clean +rm -rf %{buildroot} + + + + + + +%files +%defattr(-,root,root,-) +%doc ChangeLog LICENSE README TODO +%{python_sitelib}/urlgrabber* +%{_bindir}/urlgrabber + + diff --git a/packaging/urlgrabber-HEAD.patch b/packaging/urlgrabber-HEAD.patch new file mode 100644 index 0000000..90180d2 --- /dev/null +++ b/packaging/urlgrabber-HEAD.patch @@ -0,0 +1,142 @@ +diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py +index e090e90..a26880c 100644 +--- a/urlgrabber/grabber.py ++++ b/urlgrabber/grabber.py +@@ -439,6 +439,12 @@ try: + except: + __version__ = '???' + ++try: ++ # this part isn't going to do much - need to talk to gettext ++ from i18n import _ ++except ImportError, msg: ++ def _(st): return st ++ + ######################################################################## + # functions for debugging output. These functions are here because they + # are also part of the module initialization. 
+@@ -1052,7 +1058,8 @@ class PyCurlFileObject(): + self._reget_length = 0 + self._prog_running = False + self._error = (None, None) +- self.size = None ++ self.size = 0 ++ self._hdr_ended = False + self._do_open() + + +@@ -1085,9 +1092,14 @@ class PyCurlFileObject(): + return -1 + + def _hdr_retrieve(self, buf): ++ if self._hdr_ended: ++ self._hdr_dump = '' ++ self.size = 0 ++ self._hdr_ended = False ++ + if self._over_max_size(cur=len(self._hdr_dump), + max_size=self.opts.max_header_size): +- return -1 ++ return -1 + try: + self._hdr_dump += buf + # we have to get the size before we do the progress obj start +@@ -1104,7 +1116,17 @@ class PyCurlFileObject(): + s = parse150(buf) + if s: + self.size = int(s) +- ++ ++ if buf.lower().find('location') != -1: ++ location = ':'.join(buf.split(':')[1:]) ++ location = location.strip() ++ self.scheme = urlparse.urlsplit(location)[0] ++ self.url = location ++ ++ if len(self._hdr_dump) != 0 and buf == '\r\n': ++ self._hdr_ended = True ++ if DEBUG: DEBUG.info('header ended:') ++ + return len(buf) + except KeyboardInterrupt: + return pycurl.READFUNC_ABORT +@@ -1136,6 +1158,7 @@ class PyCurlFileObject(): + self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) + self.curl_obj.setopt(pycurl.FAILONERROR, True) + self.curl_obj.setopt(pycurl.OPT_FILETIME, True) ++ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) + + if DEBUG: + self.curl_obj.setopt(pycurl.VERBOSE, True) +@@ -1291,7 +1314,12 @@ class PyCurlFileObject(): + raise err + + elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it +- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url) ++ if self.scheme in ['http', 'https']: ++ msg = 'HTTP Error %s : %s ' % (self.http_code, self.url) ++ elif self.scheme in ['ftp']: ++ msg = 'FTP Error %s : %s ' % (self.http_code, self.url) ++ else: ++ msg = "Unknown Error: URL=%s , scheme=%s" % (self.url, self.scheme) + else: + msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1])) + code = 
errcode +@@ -1299,6 +1327,12 @@ class PyCurlFileObject(): + err.code = code + err.exception = e + raise err ++ else: ++ if self._error[1]: ++ msg = self._error[1] ++ err = URLGRabError(14, msg) ++ err.url = self.url ++ raise err + + def _do_open(self): + self.curl_obj = _curl_cache +@@ -1532,11 +1566,14 @@ class PyCurlFileObject(): + def _over_max_size(self, cur, max_size=None): + + if not max_size: +- max_size = self.size +- if self.opts.size: # if we set an opts size use that, no matter what +- max_size = self.opts.size ++ if not self.opts.size: ++ max_size = self.size ++ else: ++ max_size = self.opts.size ++ + if not max_size: return False # if we have None for all of the Max then this is dumb +- if cur > max_size + max_size*.10: ++ ++ if cur > int(float(max_size) * 1.10): + + msg = _("Downloaded more than max size for %s: %s > %s") \ + % (self.url, cur, max_size) +@@ -1582,7 +1619,11 @@ class PyCurlFileObject(): + self.opts.progress_obj.end(self._amount_read) + self.fo.close() + +- ++ def geturl(self): ++ """ Provide the geturl() method, used to be got from ++ urllib.addinfourl, via. 
urllib.URLopener.* """ ++ return self.url ++ + _curl_cache = pycurl.Curl() # make one and reuse it over and over and over + + +diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py +index dd07c6a..45eb248 100644 +--- a/urlgrabber/progress.py ++++ b/urlgrabber/progress.py +@@ -658,6 +658,8 @@ def format_time(seconds, use_hours=0): + if seconds is None or seconds < 0: + if use_hours: return '--:--:--' + else: return '--:--' ++ elif seconds == float('inf'): ++ return 'Infinite' + else: + seconds = int(seconds) + minutes = seconds / 60 diff --git a/packaging/urlgrabber-libproxy-httponly.patch b/packaging/urlgrabber-libproxy-httponly.patch new file mode 100644 index 0000000..ffed3ba --- /dev/null +++ b/packaging/urlgrabber-libproxy-httponly.patch @@ -0,0 +1,83 @@ +From 2dc04b08536ad0a17147fc91eacb3dc8e87a755b Mon Sep 17 00:00:00 2001 +Message-Id: <2dc04b08536ad0a17147fc91eacb3dc8e87a755b.1299826695.git.yan.i.li@intel.com> +From: Yan Li +Date: Fri, 11 Mar 2011 14:49:28 +0800 +Subject: [PATCH] libproxy support + +Based on David Woodhouse 's work. + +Signed-off-by: Yan Li +--- + urlgrabber/grabber.py | 35 ++++++++++++++++++++++++++++++++--- + 1 files changed, 32 insertions(+), 3 deletions(-) + +diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py +index e090e90..a4e3e04 100644 +--- a/urlgrabber/grabber.py ++++ b/urlgrabber/grabber.py +@@ -143,8 +143,17 @@ GENERAL ARGUMENTS (kwargs) + note that proxy authentication information may be provided using + normal URL constructs: + proxies={ 'http' : 'http://user:host@foo:3128' } +- Lastly, if proxies is None, the default environment settings will +- be used. ++ If proxies is None, the proxy_factory (described below) will be used. ++ ++ proxy_factory = libproxy.ProxyFactory() ++ ++ a libproxy ProxyFactory object. 
This is initialised to a default ++ global ProxyFactory if libproxy is installed, but can be ++ overridden to None to disable libproxy, or indeed to anything with ++ a getProxies() method that takes a URL and returns a list of ++ potential proxies. The proxy_factory is only used if the ++ proxies dictionary is not set. If both proxies and proxy_factory ++ are None, the default environment variable will be used. + + prefix = None + +@@ -439,6 +448,12 @@ try: + except: + __version__ = '???' + ++try: ++ import libproxy ++ _grabber_proxy_factory = libproxy.ProxyFactory() ++except: ++ _grabber_proxy_factory = None ++ + ######################################################################## + # functions for debugging output. These functions are here because they + # are also part of the module initialization. +@@ -802,6 +817,7 @@ class URLGrabberOptions: + self.user_agent = 'urlgrabber/%s' % __version__ + self.keepalive = 1 + self.proxies = None ++ self.proxy_factory = _grabber_proxy_factory + self.reget = None + self.failure_callback = None + self.interrupt_callback = None +@@ -1202,7 +1218,20 @@ class PyCurlFileObject(): + else: + if proxy == '_none_': proxy = "" + self.curl_obj.setopt(pycurl.PROXY, proxy) +- ++ elif opts.proxy_factory: ++ try: ++ proxies = opts.proxy_factory.getProxies(self.url); ++ for proxy in proxies: ++ if proxy.startswith('http://'): ++ if DEBUG: DEBUG.info('using proxy "%s" for url %s' % \ ++ (proxy, self.url)) ++ self.curl_obj.setopt(pycurl.PROXY, proxy) ++ break ++ except: ++ # libproxy may fail, and in that case we just fall ++ # back to next proxy supplier (environment variables) ++ opts.proxy_factory = None ++ + # FIXME username/password/auth settings + + #posts - simple - expects the fields as they are +-- +1.7.4.1 + diff --git a/scripts/urlgrabber b/scripts/urlgrabber new file mode 100644 index 0000000..518e512 --- /dev/null +++ b/scripts/urlgrabber @@ -0,0 +1,329 @@ +#!/usr/bin/python -t + +# This library is free software; you can
redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko + +"""NAME + urlgrabber - a simple client for the urlgrabber python package + +DESCRIPTION + This is a thin client for the urlgrabber python package. It is + provided mainly for helping debug the python package. It provides + low-level access to most urlgrabber features from the shell. + + There are two types of options available for this program. They are + 'client options' and 'module options'. Client options apply + specifically to the behavior of this client, whereas module options + are built-in options to the urlgrabber module. Both of these are + avaible from the client command line, but they're documented a + little differently. Client options are documented here, and module + options are documented through the '--help