diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/ChangeLog | 363 | ||||
-rw-r--r-- | src/Makefile.am | 4 | ||||
-rw-r--r-- | src/Makefile.in | 230 | ||||
-rw-r--r-- | src/build_info.c | 2 | ||||
-rw-r--r-- | src/build_info.c.in | 2 | ||||
-rw-r--r-- | src/config.h.in | 329 | ||||
-rw-r--r-- | src/connect.c | 41 | ||||
-rw-r--r-- | src/connect.h | 1 | ||||
-rw-r--r-- | src/convert.c | 9 | ||||
-rw-r--r-- | src/convert.h | 4 | ||||
-rw-r--r-- | src/cookies.c | 11 | ||||
-rw-r--r-- | src/css-url.c | 3 | ||||
-rw-r--r-- | src/css-url.h | 1 | ||||
-rw-r--r-- | src/exits.c | 8 | ||||
-rw-r--r-- | src/exits.h | 4 | ||||
-rw-r--r-- | src/ftp-basic.c | 10 | ||||
-rw-r--r-- | src/ftp.c | 126 | ||||
-rw-r--r-- | src/gnutls.c | 178 | ||||
-rw-r--r-- | src/hash.c | 6 | ||||
-rw-r--r-- | src/hash.h | 2 | ||||
-rw-r--r-- | src/host.c | 17 | ||||
-rw-r--r-- | src/host.h | 4 | ||||
-rw-r--r-- | src/html-parse.c | 6 | ||||
-rw-r--r-- | src/html-url.c | 9 | ||||
-rw-r--r-- | src/http.c | 592 | ||||
-rw-r--r-- | src/init.c | 129 | ||||
-rw-r--r-- | src/log.c | 66 | ||||
-rw-r--r-- | src/log.h | 4 | ||||
-rw-r--r-- | src/main.c | 221 | ||||
-rw-r--r-- | src/openssl.c | 23 | ||||
-rw-r--r-- | src/options.h | 25 | ||||
-rw-r--r-- | src/progress.c | 15 | ||||
-rw-r--r-- | src/ptimer.c | 4 | ||||
-rw-r--r-- | src/recur.c | 5 | ||||
-rw-r--r-- | src/retr.c | 103 | ||||
-rw-r--r-- | src/retr.h | 2 | ||||
-rw-r--r-- | src/spider.c | 2 | ||||
-rw-r--r-- | src/ssl.h | 4 | ||||
-rw-r--r-- | src/test.c | 2 | ||||
-rw-r--r-- | src/url.c | 11 | ||||
-rw-r--r-- | src/utils.c | 121 | ||||
-rw-r--r-- | src/utils.h | 10 | ||||
-rw-r--r-- | src/warc.c | 1440 | ||||
-rw-r--r-- | src/warc.h | 23 | ||||
-rw-r--r-- | src/wget.h | 4 |
45 files changed, 3756 insertions, 420 deletions
diff --git a/src/ChangeLog b/src/ChangeLog index 8eae78f..8fcd0bf 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,366 @@ +2012-07-03 Steven Schubiger <stsc@member.fsf.org> + + * init.c: Include warc.h for warc_close in cleanup function. + +2012-07-08 Steven Schubiger <stsc@member.fsf.org> + + * exits.h: Fix comment. + * exits.c: Likewise. + +2012-07-07 Tim Ruehsen <tim.ruehsen@gmx.de> + + (digest_authentication_encode): Add support for RFC 2617 Digest + Access Authentication. + +2012-07-07 Giuseppe Scrivano <gscrivano@gnu.org> + + * http.c (http_loop): Fix log message. + * main.c (main): Likewise. + Reported by: Petr Pisar <petr.pisar@atlas.cz> + +2012-06-17 Giuseppe Scrivano <gscrivano@gnu.org> + + * wget.h: Define `CLOSEFAILED'. + * init.c: Include "exits.h". + (cleanup): Check `fclose' failure. + * exits.c (get_status_for_err): Handle `CLOSEFAILED'. + +2012-06-16 Giuseppe Scrivano <gscrivano@gnu.org> + + * main.c (main): Move some cleanup related function to... + * init.c (cleanup): ...here. + + * main.c: Do not include "stdout.h". + (main): Do not register `close_stdout' at exit. + Reported by: Micah Cowan <micah@cowan.name>. + +2012-06-09 Giuseppe Scrivano <gscrivano@gnu.org> + + * main.c (print_help): Move --report-speed under the section + "Logging and input file". + +2012-06-06 Giuseppe Scrivano <gscrivano@gnu.org> + + * main.c (print_help): Rename --bits to --report-bps. + (cmdline_options): Likewise. + * init.c (commands): Rename --report-bps to --report-speed. + (cmd_spec_report_speed): New function. + + * options.h (struct options): Rename `bits_fmt' to `report_bps'. + * main.c (print_help): Rename --bits to --report-bps. + (cmdline_options): Likewise. + * init.c (commands): Likewise + + * progress.c (create_image): Adjust caller. + * retr.c (retr_rate): Likewise. + * utils.c (convert_to_bits): Likewise. + +2012-06-04 Tim Ruehsen <tim.ruehsen@gmx.de> + + * main.c (main): Check for filename != NULL. 
+ * warc.c (warc_process_cdx_line): Fix memory leak. + * utils.c (match_posix_regex, compile_posix_regex): Remove dead + assignment. + * openssl.c (ssl_init): Fix old-style function definition. + +2012-06-02 Giuseppe Scrivano <gscrivano@gnu.org> + + * connect.c: Include <sys/socket.h> and <sys/select.h>. + +2012-05-30 Gijs van Tulder <gvtulder@gmail.com> + + * warc.c: Fix segfault if CDX record is not found. + +2011-05-26 Steven Schweda <sms@antinode.info> + * connect.c [HAVE_SYS_SOCKET_H]: Include <sys/socket.h>. + [HAVE_SYS_SELECT_H]: Include <sys/select.h>. + +2012-05-26 Mike Frysinger <vapier@gentoo.org> + + * warc.c: Change type of `warc_current_gzfile' to gzFile. + +2012-05-26 Giuseppe Scrivano <gscrivano@gnu.org> + + * warc.c (warc_load_cdx_dedup_file): Change type of `line_length' to + ssize_t. + Suggested by: Ángel González <keisial@gmail.com> + +2012-05-18 Tim Ruehsen <tim.ruehsen@gmx.de> + + * gnutls.c (wgnutls_poll): Honor the specified `timeout' value. + (wgnutls_peek): Likewise. + +2012-05-19 illusionoflife <illusion.of.life92@gmail.com> (tiny change) + + * convert.c (register_html,register_css): Fixed functions signature to + not accept unused argument + * retr.c (retrieve_url): Changed register_{css,html} usage according + new signature. + +2012-05-16 Giuseppe Scrivano <gscrivano@gnu.org> + + * warc.h: Cut length lines to 80 columns. + * warc.c: Likewise. + +2012-05-14 Tim Ruehsen <tim.ruehsen@gmx.de> + + * gnutls.c (wgnutls_read_timeout): removed warnings, moved fcntl stuff + outside loop. + + * hash.h (hash_table_put): Make argument "value" const. + * hash.c (hash_table_put): Make argument value const. Cast `value' to + void. + * http.c (request_set_header): Make argument `name' const. Cast `value' + and `name' to void*. + (request_remove_header): Make argument `name' const. + * url.c (url_file_name): Make `index_filename' static. 
+ * warc.h (warc_write_cdx_record): Make `url', `timestamp', `mime_type', + `payload_digest', `redirect_location', `warc_filename', `response_uuid' + arguments const. Make `checksum' const. + * warc.c (warc_write_date_header): Make the `timestamp' argument const. + Make `extension' const. + (warc_write_cdx_record): Make `url', `timestamp', `mime_type', + `payload_digest', `redirect_location', `warc_filename', `response_uuid' + arguments const. Make `checksum' const. + +2012-05-13 Tim Ruehsen <tim.ruehsen@gmx.de> + + * gnutls.c (credentials): Change type to + gnutls_certificate_credentials_t. + (ssl_init): Do not use deprecated types. + (ssl_connect_wget): Likewise. + +2012-04-11 Gijs van Tulder <gvtulder@gmail.com> + + * init.c: Add --accept-regex, --reject-regex and --regex-type. + * main.c: Likewise. + * options.h: Likewise. + * recur.c: Likewise. + * utils.c: Add regex-related functions. + * utils.h: Add regex-related functions. + +2012-03-30 Tim Ruehsen <tim.ruehsen@gmx.de> + + * convert.c (convert_links_in_hashtable): Make it static. + * cookies.c (parse_set_cookie): Remove empty else branches. + * css-url.c: Include "css-url.h". + (get_uri_string): Make it static. + * css-url.h (get_urls_css): Add prototype. + * gnutls.c (ssl_init): Add prototype. + * html-parse.c (tagstack_push): Make it static. + * html-parse.c (tagstack_pop): Make it static. + * html-parse.c (tagstack_find): Make it static. + * html-url.c (cleanup_html_url): Make it static. + * progress.c (count_cols): Make it static. + * progress.c (get_eta): Make it static. + * retr.h (convert_to_bits): Remove prototype. + * utils.h (convert_to_bits): Add prototype. + * spider.c (spider_cleanup): Make it static. + * warc.c (warc_write_start_record): Add prototype. + * warc.c (warc_write_end_record): Add prototype. + * warc.c (warc_start_cdx_file): Add prototype. + * warc.c (warc_init): Add prototype. + * warc.c (warc_load_cdx_dedup_file): Add prototype. + * warc.c (warc_write_metadata): Add prototype. 
+ * warc.c (warc_close): Add prototype. + * warc.c (warc_tempfile): Add prototype. + * warc.c (warc_write_warcinfo_record): Make it static. + * warc.c (warc_load_cdx_dedup_file): Make it static. + * warc.c (warc_write_metadata): Make it static. + * warc.h (warc_init): Fix prototype. + * warc.h (warc_close): Fix prototype. + * warc.h (warc_tempfile): Fix prototype. + +2012-03-30 Tim Ruehsen <tim.ruehsen@gmx.de> + + * url.c: Use empty query in local filenames. + +2012-04-22 Tim Ruehsen <tim.ruehsen@gmx.de> + + * main.c (main): Dynamically allocate `opt.progress_type'. + +2012-04-21 Tim Ruehsen <tim.ruehsen@gmx.de> + + * ftp-basic.c (ftp_pasv): Fix memory leak. + + * http.c (gethttp): Fix memory leak. + + * ftp.c (getftp): Silent compiler warning. + +2009-06-14 Phil Pennock <mutt-dev@spodhuis.org> (tiny change) + * host.h: Declare `is_valid_ip_address'. + * host.c (is_valid_ip_address): New function. + * http.c (gethttp): Specify the hostname to ssl_connect_wget. + * gnutls.c (ssl_connect_wget): Specify the server name. + * openssl.c (ssl_connect_wget): Likewise. + * ssl.h: Change method signature for ssl_connect_wget. + +2012-04-13 Tim Ruehsen <tim.ruehsen@gmx.de> (tiny change) + + * warc.c (warc_load_cdx_dedup_file): Fix a memory leak by freeing + `lineptr'. + +2012-04-07 Daniel Kahn Gillmor <dkg@fifthhorseman.net> (tiny change) + + * gnutls.c (key_type_to_gnutls_type): New function. + (ssl_init): Use correctly the specified gnutls certificate. + +2012-04-01 Gijs van Tulder <gvtulder@gmail.com> + + * html-url.c: Prevent crash on incomplete STYLE tag. + +2012-04-01 Giuseppe Scrivano <gscrivano@gnu.org> + + * gnutls.c (wgnutls_read_timeout): Ensure timer is freed. + + * gnutls.c (wgnutls_read_timeout): Do not use timer if it is not + allocated. + Reported by: Xu Zhongxing <xu_zhong_xing@163.com> + +2012-03-30 Tim Ruehsen <tim.ruehsen@gmx.de> (tiny change) + + * warc.c: make warc_uuid_str() implementation depend on HAVE_LIBUUID. 
+ +2012-03-29 Tim Ruehsen <tim.ruehsen@gmx.de> (tiny change) + + * utils.c (library): Include <sys/time.h>. + +2012-03-25 Giuseppe Scrivano <gscrivano@gnu.org> + + * utils.c: Include <sys/ioctl.h>. + + * ptimer.c: Include <sys/time.h>. + + * connect.c: Include <sys/socket.h>, <sys/select.h>, <sys/time.h>. + Reported by: Ray Satiro <raysatiro@yahoo.com>. + +2012-03-25 Ray Satiro <raysatiro@yahoo.com> + + * build_info.c.in: Check that HAVE_LIBSSL32 is defined when OpenSSL + is used. + +2012-03-07 Steven Schubiger <stsc@member.fsf.org> + + * init.c (wgetrc_user_file_name): Correct typo. + +2012-03-06 Sasikantha Babu <sasikanth.v19@gmail.com> + + * utils.c (convert_to_bits): Added new function convert_to_bits to + convert bytes to bits. + * retr.c (calc_rate): Modified the function to handle --bits + option and download rate calculated as bits per sec (SI-prefix) + for --bits otherwise bytes (IEC-prefix). + (retr_rate): Rates will display in bits per sec for --bits. + * options.h (struct opt): Added --bits option bool variable bits_fmt. + * main.c (print_help): Added help for --bits. + * init.c: Defined command for --bits option. + * retr.h: Added function prototype. + +2012-02-26 Giuseppe Scrivano <gscrivano@gnu.org> + + * main.c: Include "closeout.h" + (main): Register close_stdout at exit. + +2012-02-01 Gijs van Tulder <gvtulder@gmail.com> + + * warc.c: Fix large file support with ftello, fseeko. + * warc.h: Fix large file support. + * http.c: Fix large file support. + +2012-02-23 Giuseppe Scrivano <giuseppe@southpole.se> + + * main.c (main): Write diagnostic messages to `stderr' not to `stdout'. + + * main.c (main): Fail gracefully if `malloc' fails. + + * gnutls.c (wgnutls_read): Remove unused variables `timer' and `flags'. + +2012-02-17 Steven Schubiger <stsc@member.fsf.org> + + * warc.c: Add license header. 
+ +2012-01-27 Gijs van Tulder <gvtulder@gmail.com> + + * retr.c (fd_read_body): If the response is chunked, the chunk + headers are now written to the WARC file, making the WARC file + an exact copy of the HTTP response. + +2012-01-27 Gijs van Tulder <gvtulder@gmail.com> + + * retr.c (fd_read_body): Fix a memory leak with chunked responses. + * http.c (skip_short_body): Fix the same memory leak. + +2012-01-09 Gijs van Tulder <gvtulder@gmail.com> + + * init.c: Disable WARC compression if zlib is disabled. + * main.c: Do not show the 'no-warc-compression' option if zlib is + disabled. + * warc.c: Do not compress WARC files if zlib is disabled. + +2012-01-09 Sasikantha Babu <sasikanth.v19@gmail.com> (tiny change) + * connect.c (connect_to_ip): properly formatted ipv6 address display. + (socket_family): New function - returns socket family type. + * http.c (gethttp): properly formatted ipv6 address display. + +2011-11-09 Gijs van Tulder <address@hidden> + + * warc.c: Call gzdopen() with wb9 instead of wb+9, which fails on + zlib version >= 1.2.4. + +2011-11-04 Steven Schweda <address@hidden> + + * warc.c [! WINDOWS]: Include <libgen.h>. + (warc_write_warcinfo_record): Assign a new allocated buffer and + free it on errors. + +2011-11-01 Steven Schweda <address@hidden> + + * gnutls.c (ssl_init): Ensure GNU TLS is loaded only once. + +2011-10-07 Steven Schweda <address@hidden> + + * connect.c: Add HAVE_SYS_SELECT_H and HAVE_SYS_SOCKET_H conditions + on includes of <sys/select.h> and <sys/socket.h>, respectively. + * ftp.c (getftp): Move BIN_TYPE_TRANSFER macro into VMS-specific + section. On VMS, use Stream_LF attributes for listing files. Pass + BIN_TYPE_FILE to fopen_excl() instead of constant-everywhere "true". + * ftp.c (ftp_retrieve_list): Restore lost test of opt.preserve_perm + (--preserve-permissions) on the chmod() operation. + * init.c, main.c: Remove "deprecated" from opt.preserve_perm + (--preserve-permissions). 
+ * init.c (initialize): Use distinct messages for errors in C macro + SYSTEM_WGETRC and environment-variable SYSTEM_WGETRC. Avoid use of + C macro SYSTEM_WGETRC when it's not defined. + * log.c (log_close): Avoid closing logfp when it's stderr. + * main.c (print_help): Restore --preserve-permissions. + * main.c (main): Avoid using a negative value of longindex as a + subscript (for long_options[]) when searching for "--config". + * main.c (main): Exit the program using exit() instead of "return". + (VMS handles these differently, and exit() is better.) + * openssl.c (ssl_init): Add type cast (SSL_METHOD *) to newly "const" + "meth" argument to accommodate OpenSSL version 0.9.8, where that + argument is not "const" in the OpenSSL function (SSL_CTX_new). + * test.c: Declare "program_argstring". + * utils.c (fopen_excl): Comment typography. + * warc.h: New file. + * warc.c: New file. + +2011-10-02 Henrik Holst <henrik.holst@millistream.com> (tiny change) + * http.c (gethttp): If 'contentonerror' is used then do not + skip the http body on 4xx and 5xx errors. + + * init.c (commands): Add 'contentonerror'. + + * main.c (print_help, option_data): Add new option 'contentonerror' + to make wget not skip the http content on 4xx and 5xx errors. + + * options.h: New variable 'content_on_error'. + +2011-09-19 Giuseppe Scrivano <gscrivano@gnu.org> + + * main.c (print_version): Update copyright year. + (print_version): Fix typo. 
+ 2011-09-13 Giuseppe Scrivano <gscrivano@gnu.org> * ftp.c (ftp_retrieve_glob): Propagate correctly the `res' error diff --git a/src/Makefile.am b/src/Makefile.am index 6b95198..8ef931a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -46,13 +46,13 @@ wget_SOURCES = cmpt.c connect.c convert.c cookies.c ftp.c \ css_.c css-url.c \ ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \ http.c init.c log.c main.c netrc.c progress.c ptimer.c \ - recur.c res.c retr.c spider.c url.c \ + recur.c res.c retr.c spider.c url.c warc.c \ utils.c exits.c build_info.c $(IRI_OBJ) \ css-url.h css-tokens.h connect.h convert.h cookies.h \ ftp.h hash.h host.h html-parse.h html-url.h \ http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ options.h progress.h ptimer.h recur.h res.h retr.h \ - spider.h ssl.h sysdep.h url.h utils.h wget.h iri.h \ + spider.h ssl.h sysdep.h url.h warc.h utils.h wget.h iri.h \ exits.h gettext.h nodist_wget_SOURCES = version.c EXTRA_wget_SOURCES = iri.c diff --git a/src/Makefile.in b/src/Makefile.in index 2998df2..dc0b856 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -1,9 +1,9 @@ -# Makefile.in generated by automake 1.11.1 from Makefile.am. +# Makefile.in generated by automake 1.11.6 from Makefile.am. # @configure_input@ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, -# Inc. +# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software +# Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. 
@@ -49,6 +49,23 @@ # VPATH = @srcdir@ +am__make_dryrun = \ + { \ + am__dry=no; \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ + | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ + *) \ + for am__flg in $$MAKEFLAGS; do \ + case $$am__flg in \ + *=*|--*) ;; \ + *n*) am__dry=yes; break;; \ + esac; \ + done;; \ + esac; \ + test $$am__dry = yes; \ + } pkgdatadir = $(datadir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ pkglibdir = $(libdir)/@PACKAGE@ @@ -74,76 +91,89 @@ DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \ ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/m4/00gnulib.m4 \ $(top_srcdir)/m4/alloca.m4 $(top_srcdir)/m4/arpa_inet_h.m4 \ - $(top_srcdir)/m4/asm-underscore.m4 \ - $(top_srcdir)/m4/clock_time.m4 $(top_srcdir)/m4/close.m4 \ - $(top_srcdir)/m4/codeset.m4 $(top_srcdir)/m4/configmake.m4 \ - $(top_srcdir)/m4/dirname.m4 \ + $(top_srcdir)/m4/asm-underscore.m4 $(top_srcdir)/m4/base32.m4 \ + $(top_srcdir)/m4/btowc.m4 $(top_srcdir)/m4/clock_time.m4 \ + $(top_srcdir)/m4/close.m4 $(top_srcdir)/m4/codeset.m4 \ + $(top_srcdir)/m4/configmake.m4 $(top_srcdir)/m4/dirname.m4 \ $(top_srcdir)/m4/double-slash-root.m4 $(top_srcdir)/m4/dup2.m4 \ $(top_srcdir)/m4/environ.m4 $(top_srcdir)/m4/errno_h.m4 \ - $(top_srcdir)/m4/error.m4 $(top_srcdir)/m4/extensions.m4 \ + $(top_srcdir)/m4/error.m4 $(top_srcdir)/m4/exponentd.m4 \ + $(top_srcdir)/m4/extensions.m4 \ + $(top_srcdir)/m4/extern-inline.m4 \ $(top_srcdir)/m4/fatal-signal.m4 $(top_srcdir)/m4/fcntl-o.m4 \ $(top_srcdir)/m4/fcntl.m4 $(top_srcdir)/m4/fcntl_h.m4 \ $(top_srcdir)/m4/float_h.m4 $(top_srcdir)/m4/fseek.m4 \ - $(top_srcdir)/m4/fseeko.m4 $(top_srcdir)/m4/futimens.m4 \ - $(top_srcdir)/m4/getaddrinfo.m4 $(top_srcdir)/m4/getdelim.m4 \ - $(top_srcdir)/m4/getdtablesize.m4 $(top_srcdir)/m4/getline.m4 \ - $(top_srcdir)/m4/getopt.m4 $(top_srcdir)/m4/getpass.m4 \ - $(top_srcdir)/m4/gettext.m4 $(top_srcdir)/m4/gettime.m4 \ - 
$(top_srcdir)/m4/gettimeofday.m4 $(top_srcdir)/m4/glibc21.m4 \ - $(top_srcdir)/m4/gnulib-common.m4 \ + $(top_srcdir)/m4/fseeko.m4 $(top_srcdir)/m4/fstat.m4 \ + $(top_srcdir)/m4/ftell.m4 $(top_srcdir)/m4/ftello.m4 \ + $(top_srcdir)/m4/futimens.m4 $(top_srcdir)/m4/getaddrinfo.m4 \ + $(top_srcdir)/m4/getdelim.m4 $(top_srcdir)/m4/getdtablesize.m4 \ + $(top_srcdir)/m4/getline.m4 $(top_srcdir)/m4/getopt.m4 \ + $(top_srcdir)/m4/getpass.m4 $(top_srcdir)/m4/gettext.m4 \ + $(top_srcdir)/m4/gettime.m4 $(top_srcdir)/m4/gettimeofday.m4 \ + $(top_srcdir)/m4/glibc21.m4 $(top_srcdir)/m4/gnulib-common.m4 \ $(top_srcdir)/m4/gnulib-comp.m4 $(top_srcdir)/m4/hostent.m4 \ $(top_srcdir)/m4/iconv.m4 $(top_srcdir)/m4/iconv_h.m4 \ $(top_srcdir)/m4/include_next.m4 $(top_srcdir)/m4/inet_ntop.m4 \ $(top_srcdir)/m4/inline.m4 $(top_srcdir)/m4/intlmacosx.m4 \ $(top_srcdir)/m4/intmax_t.m4 $(top_srcdir)/m4/inttypes_h.m4 \ - $(top_srcdir)/m4/ioctl.m4 $(top_srcdir)/m4/largefile.m4 \ - $(top_srcdir)/m4/lib-ld.m4 $(top_srcdir)/m4/lib-link.m4 \ - $(top_srcdir)/m4/lib-prefix.m4 \ + $(top_srcdir)/m4/ioctl.m4 $(top_srcdir)/m4/langinfo_h.m4 \ + $(top_srcdir)/m4/largefile.m4 $(top_srcdir)/m4/lib-ld.m4 \ + $(top_srcdir)/m4/lib-link.m4 $(top_srcdir)/m4/lib-prefix.m4 \ $(top_srcdir)/m4/localcharset.m4 $(top_srcdir)/m4/locale-fr.m4 \ $(top_srcdir)/m4/locale-ja.m4 $(top_srcdir)/m4/locale-zh.m4 \ + $(top_srcdir)/m4/locale_h.m4 $(top_srcdir)/m4/localeconv.m4 \ $(top_srcdir)/m4/lock.m4 $(top_srcdir)/m4/longlong.m4 \ $(top_srcdir)/m4/lseek.m4 $(top_srcdir)/m4/lstat.m4 \ $(top_srcdir)/m4/malloc.m4 $(top_srcdir)/m4/mbrtowc.m4 \ $(top_srcdir)/m4/mbsinit.m4 $(top_srcdir)/m4/mbstate_t.m4 \ $(top_srcdir)/m4/mbtowc.m4 $(top_srcdir)/m4/md5.m4 \ $(top_srcdir)/m4/memchr.m4 $(top_srcdir)/m4/mkdir.m4 \ - $(top_srcdir)/m4/mmap-anon.m4 $(top_srcdir)/m4/mode_t.m4 \ - $(top_srcdir)/m4/multiarch.m4 $(top_srcdir)/m4/netdb_h.m4 \ - $(top_srcdir)/m4/netinet_in_h.m4 $(top_srcdir)/m4/nls.m4 \ - $(top_srcdir)/m4/nocrash.m4 
$(top_srcdir)/m4/open.m4 \ + $(top_srcdir)/m4/mkstemp.m4 $(top_srcdir)/m4/mmap-anon.m4 \ + $(top_srcdir)/m4/mode_t.m4 $(top_srcdir)/m4/msvc-inval.m4 \ + $(top_srcdir)/m4/msvc-nothrow.m4 $(top_srcdir)/m4/multiarch.m4 \ + $(top_srcdir)/m4/netdb_h.m4 $(top_srcdir)/m4/netinet_in_h.m4 \ + $(top_srcdir)/m4/nl_langinfo.m4 $(top_srcdir)/m4/nls.m4 \ + $(top_srcdir)/m4/nocrash.m4 $(top_srcdir)/m4/off_t.m4 \ + $(top_srcdir)/m4/open.m4 $(top_srcdir)/m4/pathmax.m4 \ $(top_srcdir)/m4/pipe2.m4 $(top_srcdir)/m4/po.m4 \ $(top_srcdir)/m4/posix_spawn.m4 $(top_srcdir)/m4/printf.m4 \ $(top_srcdir)/m4/quote.m4 $(top_srcdir)/m4/quotearg.m4 \ - $(top_srcdir)/m4/rawmemchr.m4 $(top_srcdir)/m4/realloc.m4 \ + $(top_srcdir)/m4/raise.m4 $(top_srcdir)/m4/rawmemchr.m4 \ + $(top_srcdir)/m4/realloc.m4 $(top_srcdir)/m4/regex.m4 \ $(top_srcdir)/m4/sched_h.m4 $(top_srcdir)/m4/select.m4 \ - $(top_srcdir)/m4/servent.m4 $(top_srcdir)/m4/sig_atomic_t.m4 \ - $(top_srcdir)/m4/sigaction.m4 $(top_srcdir)/m4/signal_h.m4 \ + $(top_srcdir)/m4/servent.m4 $(top_srcdir)/m4/sha1.m4 \ + $(top_srcdir)/m4/sig_atomic_t.m4 $(top_srcdir)/m4/sigaction.m4 \ + $(top_srcdir)/m4/signal_h.m4 \ $(top_srcdir)/m4/signalblocking.m4 $(top_srcdir)/m4/sigpipe.m4 \ $(top_srcdir)/m4/size_max.m4 $(top_srcdir)/m4/snprintf.m4 \ $(top_srcdir)/m4/socketlib.m4 $(top_srcdir)/m4/sockets.m4 \ $(top_srcdir)/m4/socklen.m4 $(top_srcdir)/m4/sockpfaf.m4 \ $(top_srcdir)/m4/spawn-pipe.m4 $(top_srcdir)/m4/spawn_h.m4 \ - $(top_srcdir)/m4/stat-time.m4 $(top_srcdir)/m4/stat.m4 \ + $(top_srcdir)/m4/ssize_t.m4 $(top_srcdir)/m4/stat-time.m4 \ + $(top_srcdir)/m4/stat.m4 $(top_srcdir)/m4/stdalign.m4 \ $(top_srcdir)/m4/stdbool.m4 $(top_srcdir)/m4/stddef_h.m4 \ $(top_srcdir)/m4/stdint.m4 $(top_srcdir)/m4/stdint_h.m4 \ $(top_srcdir)/m4/stdio_h.m4 $(top_srcdir)/m4/stdlib_h.m4 \ $(top_srcdir)/m4/strcase.m4 $(top_srcdir)/m4/strcasestr.m4 \ $(top_srcdir)/m4/strchrnul.m4 $(top_srcdir)/m4/strerror.m4 \ $(top_srcdir)/m4/strerror_r.m4 $(top_srcdir)/m4/string_h.m4 \ - 
$(top_srcdir)/m4/strings_h.m4 $(top_srcdir)/m4/sys_ioctl_h.m4 \ + $(top_srcdir)/m4/strings_h.m4 $(top_srcdir)/m4/strtok_r.m4 \ + $(top_srcdir)/m4/sys_ioctl_h.m4 \ $(top_srcdir)/m4/sys_select_h.m4 \ $(top_srcdir)/m4/sys_socket_h.m4 \ $(top_srcdir)/m4/sys_stat_h.m4 $(top_srcdir)/m4/sys_time_h.m4 \ $(top_srcdir)/m4/sys_types_h.m4 $(top_srcdir)/m4/sys_uio_h.m4 \ - $(top_srcdir)/m4/sys_wait_h.m4 $(top_srcdir)/m4/threadlib.m4 \ - $(top_srcdir)/m4/time_h.m4 $(top_srcdir)/m4/timespec.m4 \ + $(top_srcdir)/m4/sys_wait_h.m4 $(top_srcdir)/m4/tempname.m4 \ + $(top_srcdir)/m4/threadlib.m4 $(top_srcdir)/m4/time_h.m4 \ + $(top_srcdir)/m4/timespec.m4 $(top_srcdir)/m4/tmpdir.m4 \ $(top_srcdir)/m4/unistd-safer.m4 $(top_srcdir)/m4/unistd_h.m4 \ $(top_srcdir)/m4/unlocked-io.m4 $(top_srcdir)/m4/utimbuf.m4 \ $(top_srcdir)/m4/utimens.m4 $(top_srcdir)/m4/utimes.m4 \ $(top_srcdir)/m4/vasnprintf.m4 $(top_srcdir)/m4/vasprintf.m4 \ - $(top_srcdir)/m4/wait-process.m4 $(top_srcdir)/m4/waitpid.m4 \ - $(top_srcdir)/m4/warn-on-use.m4 $(top_srcdir)/m4/wchar_h.m4 \ - $(top_srcdir)/m4/wchar_t.m4 $(top_srcdir)/m4/wctype_h.m4 \ + $(top_srcdir)/m4/vsnprintf.m4 $(top_srcdir)/m4/wait-process.m4 \ + $(top_srcdir)/m4/waitpid.m4 $(top_srcdir)/m4/warn-on-use.m4 \ + $(top_srcdir)/m4/wchar_h.m4 $(top_srcdir)/m4/wchar_t.m4 \ + $(top_srcdir)/m4/wcrtomb.m4 $(top_srcdir)/m4/wctype_h.m4 \ $(top_srcdir)/m4/wget.m4 $(top_srcdir)/m4/wint_t.m4 \ $(top_srcdir)/m4/write.m4 $(top_srcdir)/m4/xalloc.m4 \ $(top_srcdir)/m4/xsize.m4 $(top_srcdir)/configure.ac @@ -158,13 +188,13 @@ libunittest_a_DEPENDENCIES = $(LIBOBJS) am__libunittest_a_SOURCES_DIST = cmpt.c connect.c convert.c cookies.c \ ftp.c css_.c css-url.c ftp-basic.c ftp-ls.c hash.c host.c \ html-parse.c html-url.c http.c init.c log.c main.c netrc.c \ - progress.c ptimer.c recur.c res.c retr.c spider.c url.c \ + progress.c ptimer.c recur.c res.c retr.c spider.c url.c warc.c \ utils.c exits.c build_info.c iri.c css-url.h css-tokens.h \ connect.h convert.h cookies.h ftp.h 
hash.h host.h html-parse.h \ html-url.h http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ options.h progress.h ptimer.h recur.h res.h retr.h spider.h \ - ssl.h sysdep.h url.h utils.h wget.h iri.h exits.h gettext.h \ - test.c test.h + ssl.h sysdep.h url.h warc.h utils.h wget.h iri.h exits.h \ + gettext.h test.c test.h @IRI_IS_ENABLED_TRUE@am__objects_1 = libunittest_a-iri.$(OBJEXT) am__objects_2 = libunittest_a-cmpt.$(OBJEXT) \ libunittest_a-connect.$(OBJEXT) \ @@ -182,7 +212,8 @@ am__objects_2 = libunittest_a-cmpt.$(OBJEXT) \ libunittest_a-ptimer.$(OBJEXT) libunittest_a-recur.$(OBJEXT) \ libunittest_a-res.$(OBJEXT) libunittest_a-retr.$(OBJEXT) \ libunittest_a-spider.$(OBJEXT) libunittest_a-url.$(OBJEXT) \ - libunittest_a-utils.$(OBJEXT) libunittest_a-exits.$(OBJEXT) \ + libunittest_a-warc.$(OBJEXT) libunittest_a-utils.$(OBJEXT) \ + libunittest_a-exits.$(OBJEXT) \ libunittest_a-build_info.$(OBJEXT) $(am__objects_1) am_libunittest_a_OBJECTS = $(am__objects_2) \ libunittest_a-test.$(OBJEXT) \ @@ -195,12 +226,13 @@ PROGRAMS = $(bin_PROGRAMS) am__wget_SOURCES_DIST = cmpt.c connect.c convert.c cookies.c ftp.c \ css_.c css-url.c ftp-basic.c ftp-ls.c hash.c host.c \ html-parse.c html-url.c http.c init.c log.c main.c netrc.c \ - progress.c ptimer.c recur.c res.c retr.c spider.c url.c \ + progress.c ptimer.c recur.c res.c retr.c spider.c url.c warc.c \ utils.c exits.c build_info.c iri.c css-url.h css-tokens.h \ connect.h convert.h cookies.h ftp.h hash.h host.h html-parse.h \ html-url.h http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ options.h progress.h ptimer.h recur.h res.h retr.h spider.h \ - ssl.h sysdep.h url.h utils.h wget.h iri.h exits.h gettext.h + ssl.h sysdep.h url.h warc.h utils.h wget.h iri.h exits.h \ + gettext.h @IRI_IS_ENABLED_TRUE@am__objects_3 = iri.$(OBJEXT) am_wget_OBJECTS = cmpt.$(OBJEXT) connect.$(OBJEXT) convert.$(OBJEXT) \ cookies.$(OBJEXT) ftp.$(OBJEXT) css_.$(OBJEXT) \ @@ -209,8 +241,8 @@ am_wget_OBJECTS = cmpt.$(OBJEXT) connect.$(OBJEXT) 
convert.$(OBJEXT) \ html-url.$(OBJEXT) http.$(OBJEXT) init.$(OBJEXT) log.$(OBJEXT) \ main.$(OBJEXT) netrc.$(OBJEXT) progress.$(OBJEXT) \ ptimer.$(OBJEXT) recur.$(OBJEXT) res.$(OBJEXT) retr.$(OBJEXT) \ - spider.$(OBJEXT) url.$(OBJEXT) utils.$(OBJEXT) exits.$(OBJEXT) \ - build_info.$(OBJEXT) $(am__objects_3) + spider.$(OBJEXT) url.$(OBJEXT) warc.$(OBJEXT) utils.$(OBJEXT) \ + exits.$(OBJEXT) build_info.$(OBJEXT) $(am__objects_3) nodist_wget_OBJECTS = version.$(OBJEXT) wget_OBJECTS = $(am_wget_OBJECTS) $(nodist_wget_OBJECTS) wget_LDADD = $(LDADD) @@ -227,6 +259,11 @@ SOURCES = $(libunittest_a_SOURCES) $(nodist_libunittest_a_SOURCES) \ $(wget_SOURCES) $(EXTRA_wget_SOURCES) $(nodist_wget_SOURCES) DIST_SOURCES = $(am__libunittest_a_SOURCES_DIST) \ $(am__wget_SOURCES_DIST) $(EXTRA_wget_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac ETAGS = etags CTAGS = ctags DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) @@ -286,12 +323,15 @@ GNULIB_BIND = @GNULIB_BIND@ GNULIB_BTOWC = @GNULIB_BTOWC@ GNULIB_CALLOC_POSIX = @GNULIB_CALLOC_POSIX@ GNULIB_CANONICALIZE_FILE_NAME = @GNULIB_CANONICALIZE_FILE_NAME@ +GNULIB_CHDIR = @GNULIB_CHDIR@ GNULIB_CHOWN = @GNULIB_CHOWN@ GNULIB_CLOSE = @GNULIB_CLOSE@ GNULIB_CONNECT = @GNULIB_CONNECT@ GNULIB_DPRINTF = @GNULIB_DPRINTF@ +GNULIB_DUP = @GNULIB_DUP@ GNULIB_DUP2 = @GNULIB_DUP2@ GNULIB_DUP3 = @GNULIB_DUP3@ +GNULIB_DUPLOCALE = @GNULIB_DUPLOCALE@ GNULIB_ENVIRON = @GNULIB_ENVIRON@ GNULIB_EUIDACCESS = @GNULIB_EUIDACCESS@ GNULIB_FACCESSAT = @GNULIB_FACCESSAT@ @@ -300,6 +340,8 @@ GNULIB_FCHMODAT = @GNULIB_FCHMODAT@ GNULIB_FCHOWNAT = @GNULIB_FCHOWNAT@ GNULIB_FCLOSE = @GNULIB_FCLOSE@ GNULIB_FCNTL = @GNULIB_FCNTL@ +GNULIB_FDATASYNC = @GNULIB_FDATASYNC@ +GNULIB_FDOPEN = @GNULIB_FDOPEN@ GNULIB_FFLUSH = @GNULIB_FFLUSH@ GNULIB_FFS = @GNULIB_FFS@ GNULIB_FFSL = @GNULIB_FFSL@ @@ -317,6 +359,7 @@ GNULIB_FREOPEN = @GNULIB_FREOPEN@ GNULIB_FSCANF = 
@GNULIB_FSCANF@ GNULIB_FSEEK = @GNULIB_FSEEK@ GNULIB_FSEEKO = @GNULIB_FSEEKO@ +GNULIB_FSTAT = @GNULIB_FSTAT@ GNULIB_FSTATAT = @GNULIB_FSTATAT@ GNULIB_FSYNC = @GNULIB_FSYNC@ GNULIB_FTELL = @GNULIB_FTELL@ @@ -339,18 +382,19 @@ GNULIB_GETLOGIN = @GNULIB_GETLOGIN@ GNULIB_GETLOGIN_R = @GNULIB_GETLOGIN_R@ GNULIB_GETPAGESIZE = @GNULIB_GETPAGESIZE@ GNULIB_GETPEERNAME = @GNULIB_GETPEERNAME@ -GNULIB_GETS = @GNULIB_GETS@ GNULIB_GETSOCKNAME = @GNULIB_GETSOCKNAME@ GNULIB_GETSOCKOPT = @GNULIB_GETSOCKOPT@ GNULIB_GETSUBOPT = @GNULIB_GETSUBOPT@ GNULIB_GETTIMEOFDAY = @GNULIB_GETTIMEOFDAY@ GNULIB_GETUSERSHELL = @GNULIB_GETUSERSHELL@ +GNULIB_GL_UNISTD_H_GETOPT = @GNULIB_GL_UNISTD_H_GETOPT@ GNULIB_GRANTPT = @GNULIB_GRANTPT@ GNULIB_GROUP_MEMBER = @GNULIB_GROUP_MEMBER@ GNULIB_ICONV = @GNULIB_ICONV@ GNULIB_INET_NTOP = @GNULIB_INET_NTOP@ GNULIB_INET_PTON = @GNULIB_INET_PTON@ GNULIB_IOCTL = @GNULIB_IOCTL@ +GNULIB_ISATTY = @GNULIB_ISATTY@ GNULIB_ISWBLANK = @GNULIB_ISWBLANK@ GNULIB_ISWCTYPE = @GNULIB_ISWCTYPE@ GNULIB_LCHMOD = @GNULIB_LCHMOD@ @@ -358,6 +402,7 @@ GNULIB_LCHOWN = @GNULIB_LCHOWN@ GNULIB_LINK = @GNULIB_LINK@ GNULIB_LINKAT = @GNULIB_LINKAT@ GNULIB_LISTEN = @GNULIB_LISTEN@ +GNULIB_LOCALECONV = @GNULIB_LOCALECONV@ GNULIB_LSEEK = @GNULIB_LSEEK@ GNULIB_LSTAT = @GNULIB_LSTAT@ GNULIB_MALLOC_POSIX = @GNULIB_MALLOC_POSIX@ @@ -397,15 +442,18 @@ GNULIB_MKSTEMP = @GNULIB_MKSTEMP@ GNULIB_MKSTEMPS = @GNULIB_MKSTEMPS@ GNULIB_MKTIME = @GNULIB_MKTIME@ GNULIB_NANOSLEEP = @GNULIB_NANOSLEEP@ +GNULIB_NL_LANGINFO = @GNULIB_NL_LANGINFO@ GNULIB_NONBLOCKING = @GNULIB_NONBLOCKING@ GNULIB_OBSTACK_PRINTF = @GNULIB_OBSTACK_PRINTF@ GNULIB_OBSTACK_PRINTF_POSIX = @GNULIB_OBSTACK_PRINTF_POSIX@ GNULIB_OPEN = @GNULIB_OPEN@ GNULIB_OPENAT = @GNULIB_OPENAT@ +GNULIB_PCLOSE = @GNULIB_PCLOSE@ GNULIB_PERROR = @GNULIB_PERROR@ GNULIB_PIPE = @GNULIB_PIPE@ GNULIB_PIPE2 = @GNULIB_PIPE2@ GNULIB_POPEN = @GNULIB_POPEN@ +GNULIB_POSIX_OPENPT = @GNULIB_POSIX_OPENPT@ GNULIB_POSIX_SPAWN = @GNULIB_POSIX_SPAWN@ 
GNULIB_POSIX_SPAWNATTR_DESTROY = @GNULIB_POSIX_SPAWNATTR_DESTROY@ GNULIB_POSIX_SPAWNATTR_GETFLAGS = @GNULIB_POSIX_SPAWNATTR_GETFLAGS@ @@ -433,11 +481,14 @@ GNULIB_PRINTF_POSIX = @GNULIB_PRINTF_POSIX@ GNULIB_PSELECT = @GNULIB_PSELECT@ GNULIB_PTHREAD_SIGMASK = @GNULIB_PTHREAD_SIGMASK@ GNULIB_PTSNAME = @GNULIB_PTSNAME@ +GNULIB_PTSNAME_R = @GNULIB_PTSNAME_R@ GNULIB_PUTC = @GNULIB_PUTC@ GNULIB_PUTCHAR = @GNULIB_PUTCHAR@ GNULIB_PUTENV = @GNULIB_PUTENV@ GNULIB_PUTS = @GNULIB_PUTS@ GNULIB_PWRITE = @GNULIB_PWRITE@ +GNULIB_RAISE = @GNULIB_RAISE@ +GNULIB_RANDOM = @GNULIB_RANDOM@ GNULIB_RANDOM_R = @GNULIB_RANDOM_R@ GNULIB_RAWMEMCHR = @GNULIB_RAWMEMCHR@ GNULIB_READ = @GNULIB_READ@ @@ -457,6 +508,8 @@ GNULIB_SELECT = @GNULIB_SELECT@ GNULIB_SEND = @GNULIB_SEND@ GNULIB_SENDTO = @GNULIB_SENDTO@ GNULIB_SETENV = @GNULIB_SETENV@ +GNULIB_SETHOSTNAME = @GNULIB_SETHOSTNAME@ +GNULIB_SETLOCALE = @GNULIB_SETLOCALE@ GNULIB_SETSOCKOPT = @GNULIB_SETSOCKOPT@ GNULIB_SHUTDOWN = @GNULIB_SHUTDOWN@ GNULIB_SIGACTION = @GNULIB_SIGACTION@ @@ -497,7 +550,6 @@ GNULIB_TIME_R = @GNULIB_TIME_R@ GNULIB_TMPFILE = @GNULIB_TMPFILE@ GNULIB_TOWCTRANS = @GNULIB_TOWCTRANS@ GNULIB_TTYNAME_R = @GNULIB_TTYNAME_R@ -GNULIB_UNISTD_H_GETOPT = @GNULIB_UNISTD_H_GETOPT@ GNULIB_UNISTD_H_NONBLOCKING = @GNULIB_UNISTD_H_NONBLOCKING@ GNULIB_UNISTD_H_SIGPIPE = @GNULIB_UNISTD_H_SIGPIPE@ GNULIB_UNLINK = @GNULIB_UNLINK@ @@ -564,6 +616,7 @@ HAVE_CANONICALIZE_FILE_NAME = @HAVE_CANONICALIZE_FILE_NAME@ HAVE_CHOWN = @HAVE_CHOWN@ HAVE_DECL_ENVIRON = @HAVE_DECL_ENVIRON@ HAVE_DECL_FCHDIR = @HAVE_DECL_FCHDIR@ +HAVE_DECL_FDATASYNC = @HAVE_DECL_FDATASYNC@ HAVE_DECL_FPURGE = @HAVE_DECL_FPURGE@ HAVE_DECL_FREEADDRINFO = @HAVE_DECL_FREEADDRINFO@ HAVE_DECL_FSEEKO = @HAVE_DECL_FSEEKO@ @@ -585,6 +638,7 @@ HAVE_DECL_MEMMEM = @HAVE_DECL_MEMMEM@ HAVE_DECL_MEMRCHR = @HAVE_DECL_MEMRCHR@ HAVE_DECL_OBSTACK_PRINTF = @HAVE_DECL_OBSTACK_PRINTF@ HAVE_DECL_SETENV = @HAVE_DECL_SETENV@ +HAVE_DECL_SETHOSTNAME = @HAVE_DECL_SETHOSTNAME@ HAVE_DECL_SNPRINTF = 
@HAVE_DECL_SNPRINTF@ HAVE_DECL_STRDUP = @HAVE_DECL_STRDUP@ HAVE_DECL_STRERROR_R = @HAVE_DECL_STRERROR_R@ @@ -601,12 +655,14 @@ HAVE_DECL_WCWIDTH = @HAVE_DECL_WCWIDTH@ HAVE_DPRINTF = @HAVE_DPRINTF@ HAVE_DUP2 = @HAVE_DUP2@ HAVE_DUP3 = @HAVE_DUP3@ +HAVE_DUPLOCALE = @HAVE_DUPLOCALE@ HAVE_EUIDACCESS = @HAVE_EUIDACCESS@ HAVE_FACCESSAT = @HAVE_FACCESSAT@ HAVE_FCHDIR = @HAVE_FCHDIR@ HAVE_FCHMODAT = @HAVE_FCHMODAT@ HAVE_FCHOWNAT = @HAVE_FCHOWNAT@ HAVE_FCNTL = @HAVE_FCNTL@ +HAVE_FDATASYNC = @HAVE_FDATASYNC@ HAVE_FEATURES_H = @HAVE_FEATURES_H@ HAVE_FFS = @HAVE_FFS@ HAVE_FFSL = @HAVE_FFSL@ @@ -630,6 +686,11 @@ HAVE_GROUP_MEMBER = @HAVE_GROUP_MEMBER@ HAVE_INTTYPES_H = @HAVE_INTTYPES_H@ HAVE_ISWBLANK = @HAVE_ISWBLANK@ HAVE_ISWCNTRL = @HAVE_ISWCNTRL@ +HAVE_LANGINFO_CODESET = @HAVE_LANGINFO_CODESET@ +HAVE_LANGINFO_ERA = @HAVE_LANGINFO_ERA@ +HAVE_LANGINFO_H = @HAVE_LANGINFO_H@ +HAVE_LANGINFO_T_FMT_AMPM = @HAVE_LANGINFO_T_FMT_AMPM@ +HAVE_LANGINFO_YESEXPR = @HAVE_LANGINFO_YESEXPR@ HAVE_LCHMOD = @HAVE_LCHMOD@ HAVE_LCHOWN = @HAVE_LCHOWN@ HAVE_LIBGNUTLS = @HAVE_LIBGNUTLS@ @@ -656,13 +717,18 @@ HAVE_MKOSTEMP = @HAVE_MKOSTEMP@ HAVE_MKOSTEMPS = @HAVE_MKOSTEMPS@ HAVE_MKSTEMP = @HAVE_MKSTEMP@ HAVE_MKSTEMPS = @HAVE_MKSTEMPS@ +HAVE_MSVC_INVALID_PARAMETER_HANDLER = @HAVE_MSVC_INVALID_PARAMETER_HANDLER@ HAVE_NANOSLEEP = @HAVE_NANOSLEEP@ HAVE_NETDB_H = @HAVE_NETDB_H@ HAVE_NETINET_IN_H = @HAVE_NETINET_IN_H@ +HAVE_NL_LANGINFO = @HAVE_NL_LANGINFO@ HAVE_OPENAT = @HAVE_OPENAT@ HAVE_OS_H = @HAVE_OS_H@ +HAVE_PCLOSE = @HAVE_PCLOSE@ HAVE_PIPE = @HAVE_PIPE@ HAVE_PIPE2 = @HAVE_PIPE2@ +HAVE_POPEN = @HAVE_POPEN@ +HAVE_POSIX_OPENPT = @HAVE_POSIX_OPENPT@ HAVE_POSIX_SIGNALBLOCKING = @HAVE_POSIX_SIGNALBLOCKING@ HAVE_POSIX_SPAWN = @HAVE_POSIX_SPAWN@ HAVE_POSIX_SPAWNATTR_T = @HAVE_POSIX_SPAWNATTR_T@ @@ -671,7 +737,10 @@ HAVE_PREAD = @HAVE_PREAD@ HAVE_PSELECT = @HAVE_PSELECT@ HAVE_PTHREAD_SIGMASK = @HAVE_PTHREAD_SIGMASK@ HAVE_PTSNAME = @HAVE_PTSNAME@ +HAVE_PTSNAME_R = @HAVE_PTSNAME_R@ HAVE_PWRITE = @HAVE_PWRITE@ 
+HAVE_RAISE = @HAVE_RAISE@ +HAVE_RANDOM = @HAVE_RANDOM@ HAVE_RANDOM_H = @HAVE_RANDOM_H@ HAVE_RANDOM_R = @HAVE_RANDOM_R@ HAVE_RAWMEMCHR = @HAVE_RAWMEMCHR@ @@ -683,6 +752,7 @@ HAVE_RPMATCH = @HAVE_RPMATCH@ HAVE_SA_FAMILY_T = @HAVE_SA_FAMILY_T@ HAVE_SCHED_H = @HAVE_SCHED_H@ HAVE_SETENV = @HAVE_SETENV@ +HAVE_SETHOSTNAME = @HAVE_SETHOSTNAME@ HAVE_SIGACTION = @HAVE_SIGACTION@ HAVE_SIGHANDLER_T = @HAVE_SIGHANDLER_T@ HAVE_SIGINFO_T = @HAVE_SIGINFO_T@ @@ -774,6 +844,7 @@ HAVE_WMEMCPY = @HAVE_WMEMCPY@ HAVE_WMEMMOVE = @HAVE_WMEMMOVE@ HAVE_WMEMSET = @HAVE_WMEMSET@ HAVE_WS2TCPIP_H = @HAVE_WS2TCPIP_H@ +HAVE_XLOCALE_H = @HAVE_XLOCALE_H@ HAVE__BOOL = @HAVE__BOOL@ HAVE__EXIT = @HAVE__EXIT@ HOSTENT_LIB = @HOSTENT_LIB@ @@ -809,7 +880,9 @@ LIBSSL = @LIBSSL@ LIBSSL_PREFIX = @LIBSSL_PREFIX@ LIBTHREAD = @LIBTHREAD@ LIB_CLOCK_GETTIME = @LIB_CLOCK_GETTIME@ +LIB_SELECT = @LIB_SELECT@ LOCALCHARSET_TESTS_ENVIRONMENT = @LOCALCHARSET_TESTS_ENVIRONMENT@ +LOCALE_FR = @LOCALE_FR@ LOCALE_FR_UTF8 = @LOCALE_FR_UTF8@ LOCALE_JA = @LOCALE_JA@ LOCALE_ZH_CN = @LOCALE_ZH_CN@ @@ -834,6 +907,8 @@ NEXT_AS_FIRST_DIRECTIVE_FCNTL_H = @NEXT_AS_FIRST_DIRECTIVE_FCNTL_H@ NEXT_AS_FIRST_DIRECTIVE_FLOAT_H = @NEXT_AS_FIRST_DIRECTIVE_FLOAT_H@ NEXT_AS_FIRST_DIRECTIVE_GETOPT_H = @NEXT_AS_FIRST_DIRECTIVE_GETOPT_H@ NEXT_AS_FIRST_DIRECTIVE_ICONV_H = @NEXT_AS_FIRST_DIRECTIVE_ICONV_H@ +NEXT_AS_FIRST_DIRECTIVE_LANGINFO_H = @NEXT_AS_FIRST_DIRECTIVE_LANGINFO_H@ +NEXT_AS_FIRST_DIRECTIVE_LOCALE_H = @NEXT_AS_FIRST_DIRECTIVE_LOCALE_H@ NEXT_AS_FIRST_DIRECTIVE_NETDB_H = @NEXT_AS_FIRST_DIRECTIVE_NETDB_H@ NEXT_AS_FIRST_DIRECTIVE_NETINET_IN_H = @NEXT_AS_FIRST_DIRECTIVE_NETINET_IN_H@ NEXT_AS_FIRST_DIRECTIVE_SCHED_H = @NEXT_AS_FIRST_DIRECTIVE_SCHED_H@ @@ -862,6 +937,8 @@ NEXT_FCNTL_H = @NEXT_FCNTL_H@ NEXT_FLOAT_H = @NEXT_FLOAT_H@ NEXT_GETOPT_H = @NEXT_GETOPT_H@ NEXT_ICONV_H = @NEXT_ICONV_H@ +NEXT_LANGINFO_H = @NEXT_LANGINFO_H@ +NEXT_LOCALE_H = @NEXT_LOCALE_H@ NEXT_NETDB_H = @NEXT_NETDB_H@ NEXT_NETINET_IN_H = @NEXT_NETINET_IN_H@ NEXT_SCHED_H 
= @NEXT_SCHED_H@ @@ -910,9 +987,11 @@ REPLACE_CLOSE = @REPLACE_CLOSE@ REPLACE_DPRINTF = @REPLACE_DPRINTF@ REPLACE_DUP = @REPLACE_DUP@ REPLACE_DUP2 = @REPLACE_DUP2@ +REPLACE_DUPLOCALE = @REPLACE_DUPLOCALE@ REPLACE_FCHOWNAT = @REPLACE_FCHOWNAT@ REPLACE_FCLOSE = @REPLACE_FCLOSE@ REPLACE_FCNTL = @REPLACE_FCNTL@ +REPLACE_FDOPEN = @REPLACE_FDOPEN@ REPLACE_FFLUSH = @REPLACE_FFLUSH@ REPLACE_FOPEN = @REPLACE_FOPEN@ REPLACE_FPRINTF = @REPLACE_FPRINTF@ @@ -924,6 +1003,7 @@ REPLACE_FSTAT = @REPLACE_FSTAT@ REPLACE_FSTATAT = @REPLACE_FSTATAT@ REPLACE_FTELL = @REPLACE_FTELL@ REPLACE_FTELLO = @REPLACE_FTELLO@ +REPLACE_FTRUNCATE = @REPLACE_FTRUNCATE@ REPLACE_FUTIMENS = @REPLACE_FUTIMENS@ REPLACE_GAI_STRERROR = @REPLACE_GAI_STRERROR@ REPLACE_GETCWD = @REPLACE_GETCWD@ @@ -937,12 +1017,17 @@ REPLACE_GETTIMEOFDAY = @REPLACE_GETTIMEOFDAY@ REPLACE_ICONV = @REPLACE_ICONV@ REPLACE_ICONV_OPEN = @REPLACE_ICONV_OPEN@ REPLACE_ICONV_UTF = @REPLACE_ICONV_UTF@ +REPLACE_INET_NTOP = @REPLACE_INET_NTOP@ +REPLACE_INET_PTON = @REPLACE_INET_PTON@ REPLACE_IOCTL = @REPLACE_IOCTL@ +REPLACE_ISATTY = @REPLACE_ISATTY@ REPLACE_ISWBLANK = @REPLACE_ISWBLANK@ REPLACE_ISWCNTRL = @REPLACE_ISWCNTRL@ +REPLACE_ITOLD = @REPLACE_ITOLD@ REPLACE_LCHOWN = @REPLACE_LCHOWN@ REPLACE_LINK = @REPLACE_LINK@ REPLACE_LINKAT = @REPLACE_LINKAT@ +REPLACE_LOCALECONV = @REPLACE_LOCALECONV@ REPLACE_LOCALTIME_R = @REPLACE_LOCALTIME_R@ REPLACE_LSEEK = @REPLACE_LSEEK@ REPLACE_LSTAT = @REPLACE_LSTAT@ @@ -962,6 +1047,7 @@ REPLACE_MKNOD = @REPLACE_MKNOD@ REPLACE_MKSTEMP = @REPLACE_MKSTEMP@ REPLACE_MKTIME = @REPLACE_MKTIME@ REPLACE_NANOSLEEP = @REPLACE_NANOSLEEP@ +REPLACE_NL_LANGINFO = @REPLACE_NL_LANGINFO@ REPLACE_NULL = @REPLACE_NULL@ REPLACE_OBSTACK_PRINTF = @REPLACE_OBSTACK_PRINTF@ REPLACE_OPEN = @REPLACE_OPEN@ @@ -969,12 +1055,18 @@ REPLACE_OPENAT = @REPLACE_OPENAT@ REPLACE_PERROR = @REPLACE_PERROR@ REPLACE_POPEN = @REPLACE_POPEN@ REPLACE_POSIX_SPAWN = @REPLACE_POSIX_SPAWN@ +REPLACE_POSIX_SPAWN_FILE_ACTIONS_ADDCLOSE = 
@REPLACE_POSIX_SPAWN_FILE_ACTIONS_ADDCLOSE@ +REPLACE_POSIX_SPAWN_FILE_ACTIONS_ADDDUP2 = @REPLACE_POSIX_SPAWN_FILE_ACTIONS_ADDDUP2@ +REPLACE_POSIX_SPAWN_FILE_ACTIONS_ADDOPEN = @REPLACE_POSIX_SPAWN_FILE_ACTIONS_ADDOPEN@ REPLACE_PREAD = @REPLACE_PREAD@ REPLACE_PRINTF = @REPLACE_PRINTF@ REPLACE_PSELECT = @REPLACE_PSELECT@ REPLACE_PTHREAD_SIGMASK = @REPLACE_PTHREAD_SIGMASK@ +REPLACE_PTSNAME_R = @REPLACE_PTSNAME_R@ REPLACE_PUTENV = @REPLACE_PUTENV@ REPLACE_PWRITE = @REPLACE_PWRITE@ +REPLACE_RAISE = @REPLACE_RAISE@ +REPLACE_RANDOM_R = @REPLACE_RANDOM_R@ REPLACE_READ = @REPLACE_READ@ REPLACE_READLINK = @REPLACE_READLINK@ REPLACE_REALLOC = @REPLACE_REALLOC@ @@ -985,6 +1077,7 @@ REPLACE_RENAMEAT = @REPLACE_RENAMEAT@ REPLACE_RMDIR = @REPLACE_RMDIR@ REPLACE_SELECT = @REPLACE_SELECT@ REPLACE_SETENV = @REPLACE_SETENV@ +REPLACE_SETLOCALE = @REPLACE_SETLOCALE@ REPLACE_SLEEP = @REPLACE_SLEEP@ REPLACE_SNPRINTF = @REPLACE_SNPRINTF@ REPLACE_SPRINTF = @REPLACE_SPRINTF@ @@ -1004,6 +1097,8 @@ REPLACE_STRSIGNAL = @REPLACE_STRSIGNAL@ REPLACE_STRSTR = @REPLACE_STRSTR@ REPLACE_STRTOD = @REPLACE_STRTOD@ REPLACE_STRTOK_R = @REPLACE_STRTOK_R@ +REPLACE_STRUCT_LCONV = @REPLACE_STRUCT_LCONV@ +REPLACE_STRUCT_TIMEVAL = @REPLACE_STRUCT_TIMEVAL@ REPLACE_SYMLINK = @REPLACE_SYMLINK@ REPLACE_TIMEGM = @REPLACE_TIMEGM@ REPLACE_TMPFILE = @REPLACE_TMPFILE@ @@ -1034,6 +1129,7 @@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ SIG_ATOMIC_T_SUFFIX = @SIG_ATOMIC_T_SUFFIX@ SIZE_T_SUFFIX = @SIZE_T_SUFFIX@ +STDALIGN_H = @STDALIGN_H@ STDBOOL_H = @STDBOOL_H@ STDDEF_H = @STDDEF_H@ STDINT_H = @STDINT_H@ @@ -1048,6 +1144,8 @@ UNISTD_H_HAVE_WINSOCK2_H_AND_USE_SOCKETS = @UNISTD_H_HAVE_WINSOCK2_H_AND_USE_SOC USE_NLS = @USE_NLS@ VERSION = @VERSION@ WCHAR_T_SUFFIX = @WCHAR_T_SUFFIX@ +WINDOWS_64_BIT_OFF_T = @WINDOWS_64_BIT_OFF_T@ +WINDOWS_64_BIT_ST_SIZE = @WINDOWS_64_BIT_ST_SIZE@ WINT_T_SUFFIX = @WINT_T_SUFFIX@ XGETTEXT = @XGETTEXT@ XGETTEXT_015 = @XGETTEXT_015@ @@ -1114,13 +1212,13 @@ wget_SOURCES = cmpt.c connect.c convert.c 
cookies.c ftp.c \ css_.c css-url.c \ ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \ http.c init.c log.c main.c netrc.c progress.c ptimer.c \ - recur.c res.c retr.c spider.c url.c \ + recur.c res.c retr.c spider.c url.c warc.c \ utils.c exits.c build_info.c $(IRI_OBJ) \ css-url.h css-tokens.h connect.h convert.h cookies.h \ ftp.h hash.h host.h html-parse.h html-url.h \ http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ options.h progress.h ptimer.h recur.h res.h retr.h \ - spider.h ssl.h sysdep.h url.h utils.h wget.h iri.h \ + spider.h ssl.h sysdep.h url.h warc.h utils.h wget.h iri.h \ exits.h gettext.h nodist_wget_SOURCES = version.c @@ -1171,10 +1269,8 @@ $(ACLOCAL_M4): $(am__aclocal_m4_deps) $(am__aclocal_m4_deps): config.h: stamp-h1 - @if test ! -f $@; then \ - rm -f stamp-h1; \ - $(MAKE) $(AM_MAKEFLAGS) stamp-h1; \ - else :; fi + @if test ! -f $@; then rm -f stamp-h1; else :; fi + @if test ! -f $@; then $(MAKE) $(AM_MAKEFLAGS) stamp-h1; else :; fi stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status @rm -f stamp-h1 @@ -1189,14 +1285,17 @@ distclean-hdr: clean-checkLIBRARIES: -test -z "$(check_LIBRARIES)" || rm -f $(check_LIBRARIES) -libunittest.a: $(libunittest_a_OBJECTS) $(libunittest_a_DEPENDENCIES) +libunittest.a: $(libunittest_a_OBJECTS) $(libunittest_a_DEPENDENCIES) $(EXTRA_libunittest_a_DEPENDENCIES) -rm -f libunittest.a $(libunittest_a_AR) libunittest.a $(libunittest_a_OBJECTS) $(libunittest_a_LIBADD) $(RANLIB) libunittest.a install-binPROGRAMS: $(bin_PROGRAMS) @$(NORMAL_INSTALL) - test -z "$(bindir)" || $(MKDIR_P) "$(DESTDIR)$(bindir)" @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ + if test -n "$$list"; then \ + echo " $(MKDIR_P) '$(DESTDIR)$(bindir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(bindir)" || exit 1; \ + fi; \ for p in $$list; do echo "$$p $$p"; done | \ sed 's/$(EXEEXT)$$//' | \ while read p p1; do if test -f $$p; \ @@ -1230,7 +1329,7 @@ uninstall-binPROGRAMS: clean-binPROGRAMS: -test -z "$(bin_PROGRAMS)" || rm -f 
$(bin_PROGRAMS) -wget$(EXEEXT): $(wget_OBJECTS) $(wget_DEPENDENCIES) +wget$(EXEEXT): $(wget_OBJECTS) $(wget_DEPENDENCIES) $(EXTRA_wget_DEPENDENCIES) @rm -f wget$(EXEEXT) $(LINK) $(wget_OBJECTS) $(wget_LDADD) $(LIBS) @@ -1294,6 +1393,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libunittest_a-url.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libunittest_a-utils.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libunittest_a-version.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libunittest_a-warc.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/log.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/netrc.Po@am__quote@ @@ -1306,6 +1406,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/url.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utils.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/version.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/warc.Po@am__quote@ .c.o: @am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @@ -1671,6 +1772,20 @@ libunittest_a-url.obj: url.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libunittest_a_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libunittest_a-url.obj `if test -f 'url.c'; then $(CYGPATH_W) 'url.c'; else $(CYGPATH_W) '$(srcdir)/url.c'; fi` +libunittest_a-warc.o: warc.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libunittest_a_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libunittest_a-warc.o -MD -MP -MF $(DEPDIR)/libunittest_a-warc.Tpo -c -o libunittest_a-warc.o `test -f 'warc.c' || echo '$(srcdir)/'`warc.c +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libunittest_a-warc.Tpo $(DEPDIR)/libunittest_a-warc.Po 
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='warc.c' object='libunittest_a-warc.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libunittest_a_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libunittest_a-warc.o `test -f 'warc.c' || echo '$(srcdir)/'`warc.c + +libunittest_a-warc.obj: warc.c +@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libunittest_a_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libunittest_a-warc.obj -MD -MP -MF $(DEPDIR)/libunittest_a-warc.Tpo -c -o libunittest_a-warc.obj `if test -f 'warc.c'; then $(CYGPATH_W) 'warc.c'; else $(CYGPATH_W) '$(srcdir)/warc.c'; fi` +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libunittest_a-warc.Tpo $(DEPDIR)/libunittest_a-warc.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='warc.c' object='libunittest_a-warc.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libunittest_a_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libunittest_a-warc.obj `if test -f 'warc.c'; then $(CYGPATH_W) 'warc.c'; else $(CYGPATH_W) '$(srcdir)/warc.c'; fi` + libunittest_a-utils.o: utils.c @am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libunittest_a_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libunittest_a-utils.o -MD -MP -MF $(DEPDIR)/libunittest_a-utils.Tpo -c -o libunittest_a-utils.o `test -f 'utils.c' || echo '$(srcdir)/'`utils.c @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libunittest_a-utils.Tpo $(DEPDIR)/libunittest_a-utils.Po @@ -1855,10 +1970,15 @@ install-am: all-am installcheck: installcheck-am install-strip: - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - `test -z '$(STRIP)' || \ - echo 
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi mostlyclean-generic: clean-generic: diff --git a/src/build_info.c b/src/build_info.c index f046444..8ac66c4 100644 --- a/src/build_info.c +++ b/src/build_info.c @@ -60,7 +60,7 @@ const char *compiled_features[] = "-opie", #endif -#if defined HAVE_LIBSSL +#if defined HAVE_LIBSSL || defined HAVE_LIBSSL32 "+ssl/openssl", #elif defined HAVE_LIBGNUTLS "+ssl/gnutls", diff --git a/src/build_info.c.in b/src/build_info.c.in index 892962a..c0b1677 100644 --- a/src/build_info.c.in +++ b/src/build_info.c.in @@ -9,5 +9,5 @@ ntlm defined ENABLE_NTLM opie defined ENABLE_OPIE ssl choice: - openssl defined HAVE_LIBSSL + openssl defined HAVE_LIBSSL || defined HAVE_LIBSSL32 gnutls defined HAVE_LIBGNUTLS diff --git a/src/config.h.in b/src/config.h.in index 2763488..123560b 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -26,6 +26,13 @@ /* Define to 1 if using `alloca.c'. */ #undef C_ALLOCA +/* Define as the bit index in the word where to find bit 0 of the exponent of + 'double'. */ +#undef DBL_EXPBIT0_BIT + +/* Define as the word index where to find the exponent of 'double'. */ +#undef DBL_EXPBIT0_WORD + /* Define to 1 if // is a file system root distinct from /. */ #undef DOUBLE_SLASH_IS_DISTINCT_ROOT @@ -54,10 +61,16 @@ /* Define this to 1 if F_DUPFD behavior does not match POSIX */ #undef FCNTL_DUPFD_BUGGY +/* Define to 1 if the system's ftello function has the Solaris bug. */ +#undef FTELLO_BROKEN_AFTER_SWITCHING_FROM_READ_TO_WRITE + /* Define to 1 if mkdir mistakenly creates a directory given with a trailing dot component. 
*/ #undef FUNC_MKDIR_DOT_BUG +/* Define to 1 if nl_langinfo (YESEXPR) returns a non-empty string. */ +#undef FUNC_NL_LANGINFO_YESEXPR_WORKS + /* Define to 1 if futimesat mishandles a NULL file name. */ #undef FUTIMESAT_NULL_BUG @@ -73,10 +86,22 @@ #undef GNULIB_FD_SAFER_FLAG /* Define to a C preprocessor expression that evaluates to 1 or 0, depending + whether the gnulib module fscanf shall be considered present. */ +#undef GNULIB_FSCANF + +/* Define to a C preprocessor expression that evaluates to 1 or 0, depending + whether the gnulib module malloc-gnu shall be considered present. */ +#undef GNULIB_MALLOC_GNU + +/* Define to a C preprocessor expression that evaluates to 1 or 0, depending whether the gnulib module pipe2-safer shall be considered present. */ #undef GNULIB_PIPE2_SAFER /* Define to a C preprocessor expression that evaluates to 1 or 0, depending + whether the gnulib module scanf shall be considered present. */ +#undef GNULIB_SCANF + +/* Define to a C preprocessor expression that evaluates to 1 or 0, depending whether the gnulib module sigpipe shall be considered present. */ #undef GNULIB_SIGPIPE @@ -94,6 +119,9 @@ /* Define to 1 when the gnulib module bind should be tested. */ #undef GNULIB_TEST_BIND +/* Define to 1 when the gnulib module btowc should be tested. */ +#undef GNULIB_TEST_BTOWC + /* Define to 1 when the gnulib module cloexec should be tested. */ #undef GNULIB_TEST_CLOEXEC @@ -118,6 +146,15 @@ /* Define to 1 when the gnulib module fseeko should be tested. */ #undef GNULIB_TEST_FSEEKO +/* Define to 1 when the gnulib module fstat should be tested. */ +#undef GNULIB_TEST_FSTAT + +/* Define to 1 when the gnulib module ftell should be tested. */ +#undef GNULIB_TEST_FTELL + +/* Define to 1 when the gnulib module ftello should be tested. */ +#undef GNULIB_TEST_FTELLO + /* Define to 1 when the gnulib module futimens should be tested. */ #undef GNULIB_TEST_FUTIMENS @@ -151,6 +188,9 @@ /* Define to 1 when the gnulib module listen should be tested. 
*/ #undef GNULIB_TEST_LISTEN +/* Define to 1 when the gnulib module localeconv should be tested. */ +#undef GNULIB_TEST_LOCALECONV + /* Define to 1 when the gnulib module lseek should be tested. */ #undef GNULIB_TEST_LSEEK @@ -172,6 +212,12 @@ /* Define to 1 when the gnulib module memchr should be tested. */ #undef GNULIB_TEST_MEMCHR +/* Define to 1 when the gnulib module mkstemp should be tested. */ +#undef GNULIB_TEST_MKSTEMP + +/* Define to 1 when the gnulib module nl_langinfo should be tested. */ +#undef GNULIB_TEST_NL_LANGINFO + /* Define to 1 when the gnulib module open should be tested. */ #undef GNULIB_TEST_OPEN @@ -217,6 +263,9 @@ tested. */ #undef GNULIB_TEST_POSIX_SPAWN_FILE_ACTIONS_INIT +/* Define to 1 when the gnulib module raise should be tested. */ +#undef GNULIB_TEST_RAISE + /* Define to 1 when the gnulib module rawmemchr should be tested. */ #undef GNULIB_TEST_RAWMEMCHR @@ -262,12 +311,21 @@ /* Define to 1 when the gnulib module strerror_r should be tested. */ #undef GNULIB_TEST_STRERROR_R +/* Define to 1 when the gnulib module strtok_r should be tested. */ +#undef GNULIB_TEST_STRTOK_R + /* Define to 1 when the gnulib module vasprintf should be tested. */ #undef GNULIB_TEST_VASPRINTF +/* Define to 1 when the gnulib module vsnprintf should be tested. */ +#undef GNULIB_TEST_VSNPRINTF + /* Define to 1 when the gnulib module waitpid should be tested. */ #undef GNULIB_TEST_WAITPID +/* Define to 1 when the gnulib module wcrtomb should be tested. */ +#undef GNULIB_TEST_WCRTOMB + /* Define to 1 when the gnulib module write should be tested. */ #undef GNULIB_TEST_WRITE @@ -285,6 +343,9 @@ /* Define to 1 if you have the <bp-sym.h> header file. */ #undef HAVE_BP_SYM_H +/* Define to 1 if you have the `btowc' function. */ +#undef HAVE_BTOWC + /* Define to 1 if you have the `catgets' function. */ #undef HAVE_CATGETS @@ -353,6 +414,10 @@ */ #undef HAVE_DECL_FSEEKO +/* Define to 1 if you have the declaration of `ftello', and to 0 if you don't. 
+ */ +#undef HAVE_DECL_FTELLO + /* Define to 1 if you have the declaration of `funlockfile', and to 0 if you don't. */ #undef HAVE_DECL_FUNLOCKFILE @@ -409,6 +474,10 @@ don't. */ #undef HAVE_DECL_INET_NTOP +/* Define to 1 if you have the declaration of `isblank', and to 0 if you + don't. */ +#undef HAVE_DECL_ISBLANK + /* Define to 1 if you have the declaration of `mbrtowc', and to 0 if you don't. */ #undef HAVE_DECL_MBRTOWC @@ -437,10 +506,22 @@ don't. */ #undef HAVE_DECL_STRNCASECMP +/* Define to 1 if you have the declaration of `strtok_r', and to 0 if you + don't. */ +#undef HAVE_DECL_STRTOK_R + /* Define to 1 if you have the declaration of `towlower', and to 0 if you don't. */ #undef HAVE_DECL_TOWLOWER +/* Define to 1 if you have the declaration of `vsnprintf', and to 0 if you + don't. */ +#undef HAVE_DECL_VSNPRINTF + +/* Define to 1 if you have the declaration of `wcrtomb', and to 0 if you + don't. */ +#undef HAVE_DECL_WCRTOMB + /* Define to 1 if you have the declaration of `_snprintf', and to 0 if you don't. */ #undef HAVE_DECL__SNPRINTF @@ -560,12 +641,21 @@ /* Define to 1 if you have the `isatty' function. */ #undef HAVE_ISATTY +/* Define to 1 if you have the `isblank' function. */ +#undef HAVE_ISBLANK + /* Define to 1 if you have the `iswcntrl' function. */ #undef HAVE_ISWCNTRL +/* Define to 1 if you have the `iswctype' function. */ +#undef HAVE_ISWCTYPE + /* Define if you have <langinfo.h> and nl_langinfo(CODESET). */ #undef HAVE_LANGINFO_CODESET +/* Define to 1 if you have the <langinfo.h> header file. */ +#undef HAVE_LANGINFO_H + /* Define to 1 if you have the `dl' library (-ldl). */ #undef HAVE_LIBDL @@ -581,9 +671,15 @@ /* Define to 1 if you have the `gpg-error' library (-lgpg-error). */ #undef HAVE_LIBGPG_ERROR +/* Define to 1 if you have the <libintl.h> header file. */ +#undef HAVE_LIBINTL_H + /* Define to 1 if you have the `nsl' library (-lnsl). */ #undef HAVE_LIBNSL +/* Define if libpcre is available. 
*/ +#undef HAVE_LIBPCRE + /* Define to 1 if you have the `rt' library (-lrt). */ #undef HAVE_LIBRT @@ -593,10 +689,16 @@ /* Define if you have the libssl library. */ #undef HAVE_LIBSSL +/* Define to 1 if you have the `ssl32' library (-lssl32). */ +#undef HAVE_LIBSSL32 + +/* Define if libuuid is available. */ +#undef HAVE_LIBUUID + /* Define to 1 if you have the `z' library (-lz). */ #undef HAVE_LIBZ -/* Define to 1 if the system has the type `long long int'. */ +/* Define to 1 if the system has the type 'long long int'. */ #undef HAVE_LONG_LONG_INT /* Define to 1 if you have the `lstat' function. */ @@ -605,6 +707,10 @@ /* Define to 1 if you have the `lutimes' function. */ #undef HAVE_LUTIMES +/* Define to 1 if your system has a GNU libc compatible 'malloc' function, and + to 0 otherwise. */ +#undef HAVE_MALLOC_GNU + /* Define if the 'malloc' function is POSIX compliant. */ #undef HAVE_MALLOC_POSIX @@ -630,12 +736,19 @@ /* Define to 1 if you have the `memrchr' function. */ #undef HAVE_MEMRCHR +/* Define to 1 if you have the `mkstemp' function. */ +#undef HAVE_MKSTEMP + /* Define to 1 if you have a working `mmap' system call. */ #undef HAVE_MMAP /* Define to 1 if you have the `mprotect' function. */ #undef HAVE_MPROTECT +/* Define to 1 on MSVC platforms that have the "invalid parameter handler" + concept. */ +#undef HAVE_MSVC_INVALID_PARAMETER_HANDLER + /* Define if you have the nanosleep function. */ #undef HAVE_NANOSLEEP @@ -648,6 +761,9 @@ /* Define to 1 if you have the <netinet/in.h> header file. */ #undef HAVE_NETINET_IN_H +/* Define to 1 if you have the `nl_langinfo' function. */ +#undef HAVE_NL_LANGINFO + /* Define to 1 if you have the <paths.h> header file. */ #undef HAVE_PATHS_H @@ -675,6 +791,9 @@ /* Define to 1 if you have the <pwd.h> header file. */ #undef HAVE_PWD_H +/* Define to 1 if you have the `raise' function. */ +#undef HAVE_RAISE + /* Define to 1 if you have the `rawmemchr' function. */ #undef HAVE_RAWMEMCHR @@ -697,6 +816,9 @@ macros. 
*/ #undef HAVE_RAW_DECL_CANONICALIZE_FILE_NAME +/* Define to 1 if chdir is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_CHDIR + /* Define to 1 if chown is declared even after undefining macros. */ #undef HAVE_RAW_DECL_CHOWN @@ -706,12 +828,18 @@ /* Define to 1 if dprintf is declared even after undefining macros. */ #undef HAVE_RAW_DECL_DPRINTF +/* Define to 1 if dup is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_DUP + /* Define to 1 if dup2 is declared even after undefining macros. */ #undef HAVE_RAW_DECL_DUP2 /* Define to 1 if dup3 is declared even after undefining macros. */ #undef HAVE_RAW_DECL_DUP3 +/* Define to 1 if duplocale is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_DUPLOCALE + /* Define to 1 if endusershell is declared even after undefining macros. */ #undef HAVE_RAW_DECL_ENDUSERSHELL @@ -736,6 +864,9 @@ /* Define to 1 if fcntl is declared even after undefining macros. */ #undef HAVE_RAW_DECL_FCNTL +/* Define to 1 if fdatasync is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_FDATASYNC + /* Define to 1 if ffs is declared even after undefining macros. */ #undef HAVE_RAW_DECL_FFS @@ -754,6 +885,9 @@ /* Define to 1 if fseeko is declared even after undefining macros. */ #undef HAVE_RAW_DECL_FSEEKO +/* Define to 1 if fstat is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_FSTAT + /* Define to 1 if fstatat is declared even after undefining macros. */ #undef HAVE_RAW_DECL_FSTATAT @@ -814,6 +948,9 @@ /* Define to 1 if getpeername is declared even after undefining macros. */ #undef HAVE_RAW_DECL_GETPEERNAME +/* Define to 1 if gets is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_GETS + /* Define to 1 if getsockname is declared even after undefining macros. */ #undef HAVE_RAW_DECL_GETSOCKNAME @@ -841,12 +978,18 @@ /* Define to 1 if inet_pton is declared even after undefining macros. 
*/ #undef HAVE_RAW_DECL_INET_PTON -/* Define to 1 if initstat_r is declared even after undefining macros. */ -#undef HAVE_RAW_DECL_INITSTAT_R +/* Define to 1 if initstate is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_INITSTATE + +/* Define to 1 if initstate_r is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_INITSTATE_R /* Define to 1 if ioctl is declared even after undefining macros. */ #undef HAVE_RAW_DECL_IOCTL +/* Define to 1 if isatty is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_ISATTY + /* Define to 1 if iswctype is declared even after undefining macros. */ #undef HAVE_RAW_DECL_ISWCTYPE @@ -925,9 +1068,15 @@ /* Define to 1 if mkstemps is declared even after undefining macros. */ #undef HAVE_RAW_DECL_MKSTEMPS +/* Define to 1 if nl_langinfo is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_NL_LANGINFO + /* Define to 1 if openat is declared even after undefining macros. */ #undef HAVE_RAW_DECL_OPENAT +/* Define to 1 if pclose is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_PCLOSE + /* Define to 1 if pipe is declared even after undefining macros. */ #undef HAVE_RAW_DECL_PIPE @@ -937,6 +1086,9 @@ /* Define to 1 if popen is declared even after undefining macros. */ #undef HAVE_RAW_DECL_POPEN +/* Define to 1 if posix_openpt is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_POSIX_OPENPT + /* Define to 1 if posix_spawn is declared even after undefining macros. */ #undef HAVE_RAW_DECL_POSIX_SPAWN @@ -1031,9 +1183,15 @@ /* Define to 1 if ptsname is declared even after undefining macros. */ #undef HAVE_RAW_DECL_PTSNAME +/* Define to 1 if ptsname_r is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_PTSNAME_R + /* Define to 1 if pwrite is declared even after undefining macros. */ #undef HAVE_RAW_DECL_PWRITE +/* Define to 1 if random is declared even after undefining macros. 
*/ +#undef HAVE_RAW_DECL_RANDOM + /* Define to 1 if random_r is declared even after undefining macros. */ #undef HAVE_RAW_DECL_RANDOM_R @@ -1076,9 +1234,18 @@ /* Define to 1 if setenv is declared even after undefining macros. */ #undef HAVE_RAW_DECL_SETENV +/* Define to 1 if sethostname is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_SETHOSTNAME + +/* Define to 1 if setlocale is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_SETLOCALE + /* Define to 1 if setsockopt is declared even after undefining macros. */ #undef HAVE_RAW_DECL_SETSOCKOPT +/* Define to 1 if setstate is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_SETSTATE + /* Define to 1 if setstate_r is declared even after undefining macros. */ #undef HAVE_RAW_DECL_SETSTATE_R @@ -1121,6 +1288,9 @@ /* Define to 1 if socket is declared even after undefining macros. */ #undef HAVE_RAW_DECL_SOCKET +/* Define to 1 if srandom is declared even after undefining macros. */ +#undef HAVE_RAW_DECL_SRANDOM + /* Define to 1 if srandom_r is declared even after undefining macros. */ #undef HAVE_RAW_DECL_SRANDOM_R @@ -1454,12 +1624,18 @@ /* Define to 1 if you have the `strptime' function. */ #undef HAVE_STRPTIME +/* Define to 1 if you have the `strtok_r' function. */ +#undef HAVE_STRTOK_R + /* Define to 1 if you have the `strtoll' function. */ #undef HAVE_STRTOLL /* Define to 1 if the system has the type `struct addrinfo'. */ #undef HAVE_STRUCT_ADDRINFO +/* Define to 1 if `decimal_point' is a member of `struct lconv'. */ +#undef HAVE_STRUCT_LCONV_DECIMAL_POINT + /* Define to 1 if `sa_sigaction' is a member of `struct sigaction'. */ #undef HAVE_STRUCT_SIGACTION_SA_SIGACTION @@ -1568,7 +1744,7 @@ /* Define to 1 if you have the <unistd.h> header file. */ #undef HAVE_UNISTD_H -/* Define to 1 if the system has the type `unsigned long long int'. */ +/* Define to 1 if the system has the type 'unsigned long long int'. 
*/ #undef HAVE_UNSIGNED_LONG_LONG_INT /* Define to 1 if you have the `usleep' function. */ @@ -1607,6 +1783,9 @@ /* Define to 1 if you have the `wcrtomb' function. */ #undef HAVE_WCRTOMB +/* Define to 1 if you have the `wcscoll' function. */ +#undef HAVE_WCSCOLL + /* Define to 1 if you have the `wcslen' function. */ #undef HAVE_WCSLEN @@ -1634,21 +1813,40 @@ /* Define to 1 if O_NOFOLLOW works. */ #undef HAVE_WORKING_O_NOFOLLOW +/* Define if you have the posix_spawn and posix_spawnp functions and they + work. */ +#undef HAVE_WORKING_POSIX_SPAWN + /* Define if utimes works properly. */ #undef HAVE_WORKING_UTIMES /* Define to 1 if you have the <ws2tcpip.h> header file. */ #undef HAVE_WS2TCPIP_H +/* Define to 1 if you have the <xlocale.h> header file. */ +#undef HAVE_XLOCALE_H + /* Define to 1 if the system has the type `_Bool'. */ #undef HAVE__BOOL +/* Define to 1 if you have the `_fseeki64' function. */ +#undef HAVE__FSEEKI64 + +/* Define to 1 if you have the `_ftelli64' function. */ +#undef HAVE__FTELLI64 + /* Define to 1 if you have the `_ftime' function. */ #undef HAVE__FTIME +/* Define to 1 if you have the `_set_invalid_parameter_handler' function. */ +#undef HAVE__SET_INVALID_PARAMETER_HANDLER + /* Define to 1 if you have the `__fsetlocking' function. */ #undef HAVE___FSETLOCKING +/* Define to 1 if you have the `__secure_getenv' function. */ +#undef HAVE___SECURE_GETENV + /* Define to 1 if you have the `__xpg_strerror_r' function. */ #undef HAVE___XPG_STRERROR_R @@ -1658,7 +1856,7 @@ /* Define to 1 if lseek does not detect pipes. */ #undef LSEEK_PIPE_BROKEN -/* Define to 1 if `lstat' dereferences a symlink specified with a trailing +/* Define to 1 if 'lstat' dereferences a symlink specified with a trailing slash. */ #undef LSTAT_FOLLOWS_SLASHED_SYMLINK @@ -1726,6 +1924,9 @@ slash */ #undef REPLACE_FUNC_STAT_FILE +/* Define if nl_langinfo exists but is overridden by gnulib. 
*/ +#undef REPLACE_NL_LANGINFO + /* Define to 1 if strerror(0) does not return a message implying success. */ #undef REPLACE_STRERROR_0 @@ -1767,9 +1968,9 @@ /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be automatically deduced at runtime. - STACK_DIRECTION > 0 => grows toward higher addresses - STACK_DIRECTION < 0 => grows toward lower addresses - STACK_DIRECTION = 0 => direction of growth unknown */ + STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown */ #undef STACK_DIRECTION /* Define to 1 if the `S_IS*' macros in <sys/stat.h> do not work properly. */ @@ -1815,8 +2016,8 @@ safe for multithreaded apps. */ #undef USE_UNLOCKED_IO -/* Define if the Win32 multithreading API can be used. */ -#undef USE_WIN32_THREADS +/* Define if the native Windows multithreading API can be used. */ +#undef USE_WINDOWS_THREADS /* Version number of package */ #undef VERSION @@ -1848,7 +2049,7 @@ `char[]'. */ #undef YYTEXT_POINTER -/* Enable large inode numbers on Mac OS X. */ +/* Enable large inode numbers on Mac OS X 10.5. */ #ifndef _DARWIN_USE_64_BIT_INODE # define _DARWIN_USE_64_BIT_INODE 1 #endif @@ -1856,21 +2057,29 @@ /* Number of bits in a file offset, on hosts where this is settable. */ #undef _FILE_OFFSET_BITS +/* Define to 1 if Gnulib overrides 'struct stat' on Windows so that struct + stat.st_size becomes 64-bit. */ +#undef _GL_WINDOWS_64_BIT_ST_SIZE + /* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */ #undef _LARGEFILE_SOURCE /* Define for large files, on AIX-style hosts. */ #undef _LARGE_FILES +/* Define to 1 on Solaris. */ +#undef _LCONV_C99 + /* Define to 1 if on MINIX. */ #undef _MINIX -/* The _Noreturn keyword of draft C1X. */ -#ifndef _Noreturn +/* The _Noreturn keyword of C11. */ +#if ! 
(defined _Noreturn \ + || (defined __STDC_VERSION__ && 201112 <= __STDC_VERSION__)) # if (3 <= __GNUC__ || (__GNUC__ == 2 && 8 <= __GNUC_MINOR__) \ || 0x5110 <= __SUNPRO_C) # define _Noreturn __attribute__ ((__noreturn__)) -# elif 1200 <= _MSC_VER +# elif defined _MSC_VER && 1200 <= _MSC_VER # define _Noreturn __declspec (noreturn) # else # define _Noreturn @@ -1886,9 +2095,16 @@ functions. */ #undef _POSIX_PII_SOCKET -/* Define to 1 if you need to in order for `stat' and other things to work. */ +/* Define to 1 if you need to in order for 'stat' and other things to work. */ #undef _POSIX_SOURCE +/* Define if you want <regex.h> to include <limits.h>, so that it consistently + overrides <limits.h>'s RE_DUP_MAX. */ +#undef _REGEX_INCLUDE_LIMITS_H + +/* Define if you want regoff_t to be at least as wide POSIX requires. */ +#undef _REGEX_LARGE_OFFSETS + /* Define to 500 only on HP-UX. */ #undef _XOPEN_SOURCE @@ -1896,7 +2112,7 @@ #ifndef _ALL_SOURCE # undef _ALL_SOURCE #endif -/* Enable general extensions on MacOS X. */ +/* Enable general extensions on Mac OS X. */ #ifndef _DARWIN_C_SOURCE # undef _DARWIN_C_SOURCE #endif @@ -1925,6 +2141,35 @@ /* Define to empty if `const' does not conform to ANSI C. */ #undef const +/* _GL_INLINE is a portable alternative to ISO C99 plain 'inline'. + _GL_EXTERN_INLINE is a portable alternative to 'extern inline'. + _GL_INLINE_HEADER_BEGIN contains useful stuff to put + in an include file, before uses of _GL_INLINE. + It suppresses GCC's bogus "no previous prototype for 'FOO'" diagnostic, + when FOO is an inline function in the header; see + <http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54113>. + _GL_INLINE_HEADER_END contains useful stuff to put + in the same include file, after uses of _GL_INLINE. */ +#if __GNUC__ ? 
__GNUC_STDC_INLINE__ : 199901L <= __STDC_VERSION__ +# define _GL_INLINE inline +# define _GL_EXTERN_INLINE extern inline +# if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__ +# define _GL_INLINE_HEADER_BEGIN \ + _Pragma ("GCC diagnostic push") \ + _Pragma ("GCC diagnostic ignored \"-Wmissing-prototypes\"") +# define _GL_INLINE_HEADER_END \ + _Pragma ("GCC diagnostic pop") +# endif +#else +# define _GL_INLINE static inline +# define _GL_EXTERN_INLINE static inline +#endif + +#ifndef _GL_INLINE_HEADER_BEGIN +# define _GL_INLINE_HEADER_BEGIN +# define _GL_INLINE_HEADER_END +#endif + /* Define to a replacement function name for getpass(). */ #undef getpass @@ -1946,7 +2191,7 @@ /* Work around a bug in Apple GCC 4.0.1 build 5465: In C99 mode, it supports the ISO C 99 semantics of 'extern inline' (unlike the GNU C semantics of earlier versions), but does not display it by setting __GNUC_STDC_INLINE__. - __APPLE__ && __MACH__ test for MacOS X. + __APPLE__ && __MACH__ test for Mac OS X. __APPLE_CC__ tests for the Apple compiler and its version. __STDC_VERSION__ tests for the C99 mode. */ #if defined __APPLE__ && defined __MACH__ && __APPLE_CC__ >= 5465 && !defined __cplusplus && __STDC_VERSION__ >= 199901L && !defined __GNUC_STDC_INLINE__ @@ -1972,6 +2217,51 @@ doesn't define it. */ #undef ptrdiff_t +/* Define to rpl_re_comp if the replacement should be used. */ +#undef re_comp + +/* Define to rpl_re_compile_fastmap if the replacement should be used. */ +#undef re_compile_fastmap + +/* Define to rpl_re_compile_pattern if the replacement should be used. */ +#undef re_compile_pattern + +/* Define to rpl_re_exec if the replacement should be used. */ +#undef re_exec + +/* Define to rpl_re_match if the replacement should be used. */ +#undef re_match + +/* Define to rpl_re_match_2 if the replacement should be used. */ +#undef re_match_2 + +/* Define to rpl_re_search if the replacement should be used. 
*/ +#undef re_search + +/* Define to rpl_re_search_2 if the replacement should be used. */ +#undef re_search_2 + +/* Define to rpl_re_set_registers if the replacement should be used. */ +#undef re_set_registers + +/* Define to rpl_re_set_syntax if the replacement should be used. */ +#undef re_set_syntax + +/* Define to rpl_re_syntax_options if the replacement should be used. */ +#undef re_syntax_options + +/* Define to rpl_regcomp if the replacement should be used. */ +#undef regcomp + +/* Define to rpl_regerror if the replacement should be used. */ +#undef regerror + +/* Define to rpl_regexec if the replacement should be used. */ +#undef regexec + +/* Define to rpl_regfree if the replacement should be used. */ +#undef regfree + /* Define to the equivalent of the C99 'restrict' keyword, or to nothing if this is not supported. Do not define if restrict is supported directly. */ @@ -1987,7 +2277,7 @@ #endif /* Define as an integer type suitable for memory locations that can be - accessed atomically even in the presence of asynchnonous signals. */ + accessed atomically even in the presence of asynchronous signals. */ #undef sig_atomic_t /* Define to `unsigned int' if <sys/types.h> does not define. */ @@ -1996,6 +2286,9 @@ /* type to use in place of socklen_t if not defined */ #undef socklen_t +/* Define as a signed type of the same size as size_t. */ +#undef ssize_t + /* Define to `int' if <sys/types.h> doesn't define. */ #undef uid_t diff --git a/src/connect.c b/src/connect.c index e12c049..119ccb7 100644 --- a/src/connect.c +++ b/src/connect.c @@ -53,9 +53,7 @@ as that of the covered work. */ #include <errno.h> #include <string.h> -#ifdef HAVE_SYS_TIME_H -# include <sys/time.h> -#endif +#include <sys/time.h> #include "utils.h" #include "host.h" #include "connect.h" @@ -293,7 +291,12 @@ connect_to_ip (const ip_address *ip, int port, const char *print) xfree (str); } else - logprintf (LOG_VERBOSE, _("Connecting to %s:%d... 
"), txt_addr, port); + { + if (ip->family == AF_INET) + logprintf (LOG_VERBOSE, _("Connecting to %s:%d... "), txt_addr, port); + else if (ip->family == AF_INET6) + logprintf (LOG_VERBOSE, _("Connecting to [%s]:%d... "), txt_addr, port); + } } /* Store the sockaddr info to SA. */ @@ -581,6 +584,36 @@ socket_ip_address (int sock, ip_address *ip, int endpoint) } } +/* Get the socket family of connection on FD and store + Return family type on success, -1 otherwise. + + If ENDPOINT is ENDPOINT_LOCAL, it returns the sock family of the local + (client) side of the socket. Else if ENDPOINT is ENDPOINT_PEER, it + returns the sock family of the remote (peer's) side of the socket. */ + +int +socket_family (int sock, int endpoint) +{ + struct sockaddr_storage storage; + struct sockaddr *sockaddr = (struct sockaddr *) &storage; + socklen_t addrlen = sizeof (storage); + int ret; + + memset (sockaddr, 0, addrlen); + + if (endpoint == ENDPOINT_LOCAL) + ret = getsockname (sock, sockaddr, &addrlen); + else if (endpoint == ENDPOINT_PEER) + ret = getpeername (sock, sockaddr, &addrlen); + else + abort (); + + if (ret < 0) + return -1; + + return sockaddr->sa_family; +} + /* Return true if the error from the connect code can be considered retryable. 
Wget normally retries after errors, but the exception are the "unsupported protocol" type errors (possible on IPv4/IPv6 diff --git a/src/connect.h b/src/connect.h index 20bb243..bb3f26a 100644 --- a/src/connect.h +++ b/src/connect.h @@ -51,6 +51,7 @@ enum { ENDPOINT_PEER }; bool socket_ip_address (int, ip_address *, int); +int socket_family (int sock, int endpoint); bool retryable_socket_connect_error (int); diff --git a/src/convert.c b/src/convert.c index c6ccf53..f5a9cba 100644 --- a/src/convert.c +++ b/src/convert.c @@ -58,7 +58,7 @@ struct hash_table *downloaded_css_set; static void convert_links (const char *, struct urlpos *); -void +static void convert_links_in_hashtable (struct hash_table *downloaded_set, int is_css, int *file_count) @@ -124,6 +124,9 @@ convert_links_in_hashtable (struct hash_table *downloaded_set, set_uri_encoding (pi, opt.locale, true); u = url_parse (cur_url->url->url, NULL, pi, true); + if (!u) + continue; + local_name = hash_table_get (dl_url_file_map, u->url); /* Decide on the conversion type. */ @@ -870,7 +873,7 @@ register_delete_file (const char *file) /* Register that FILE is an HTML file that has been downloaded. */ void -register_html (const char *url, const char *file) +register_html (const char *file) { if (!downloaded_html_set) downloaded_html_set = make_string_hash_table (0); @@ -880,7 +883,7 @@ register_html (const char *url, const char *file) /* Register that FILE is a CSS file that has been downloaded. 
*/ void -register_css (const char *url, const char *file) +register_css (const char *file) { if (!downloaded_css_set) downloaded_css_set = make_string_hash_table (0); diff --git a/src/convert.h b/src/convert.h index 1f034e5..cdd0a48 100644 --- a/src/convert.h +++ b/src/convert.h @@ -101,8 +101,8 @@ downloaded_file_t downloaded_file (downloaded_file_t, const char *); void register_download (const char *, const char *); void register_redirection (const char *, const char *); -void register_html (const char *, const char *); -void register_css (const char *, const char *); +void register_html (const char *); +void register_css (const char *); void register_delete_file (const char *); void convert_all_links (void); void convert_cleanup (void); diff --git a/src/cookies.c b/src/cookies.c index 7c3fb1c..a10971c 100644 --- a/src/cookies.c +++ b/src/cookies.c @@ -391,6 +391,9 @@ parse_set_cookie (const char *set_cookie, bool silent) goto error; BOUNDED_TO_ALLOCA (value.b, value.e, value_copy); + /* Check if expiration spec is valid. + If not, assume default (cookie doesn't expire, but valid only for + this session.) */ expires = http_atotm (value_copy); if (expires != (time_t) -1) { @@ -402,10 +405,6 @@ parse_set_cookie (const char *set_cookie, bool silent) if (cookie->expiry_time < cookies_now) cookie->discard_requested = 1; } - else - /* Error in expiration spec. Assume default (cookie doesn't - expire, but valid only for this session.) */ - ; } else if (TOKEN_IS (name, "max-age")) { @@ -433,9 +432,7 @@ parse_set_cookie (const char *set_cookie, bool silent) /* ignore value completely */ cookie->secure = 1; } - else - /* Ignore unrecognized attribute. */ - ; + /* else: Ignore unrecognized attribute. */ } if (*ptr) /* extract_param has encountered a syntax error */ diff --git a/src/css-url.c b/src/css-url.c index de1caad..f97690d 100644 --- a/src/css-url.c +++ b/src/css-url.c @@ -55,6 +55,7 @@ as that of the covered work. 
*/ #include "convert.h" #include "html-url.h" #include "css-tokens.h" +#include "css-url.h" /* from lex.yy.c */ extern char *yytext; @@ -107,7 +108,7 @@ const char *token_names[] = { whitespace after the opening parenthesis and before the closing parenthesis. */ -char * +static char * get_uri_string (const char *at, int *pos, int *length) { char *uri; diff --git a/src/css-url.h b/src/css-url.h index 8d32c34..7f940e6 100644 --- a/src/css-url.h +++ b/src/css-url.h @@ -31,6 +31,7 @@ as that of the covered work. */ #define CSS_URL_H void get_urls_css (struct map_context *, int, int); +void get_urls_css (struct map_context *, int, int); struct urlpos *get_urls_css_file (const char *, const char *); #endif /* CSS_URL_H */ diff --git a/src/exits.c b/src/exits.c index 3d846b5..e23fc1c 100644 --- a/src/exits.c +++ b/src/exits.c @@ -1,7 +1,5 @@ -/* Command line parsing. - Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, - Inc. +/* Exit status handling. + Copyright (C) 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -60,7 +58,7 @@ get_status_for_err (uerr_t err) case RETROK: return WGET_EXIT_SUCCESS; case FOPENERR: case FOPEN_EXCL_ERR: case FWRITEERR: case WRITEFAILED: - case UNLINKERR: + case UNLINKERR: case CLOSEFAILED: return WGET_EXIT_IO_FAIL; case NOCONERROR: case HOSTERR: case CONSOCKERR: case CONERROR: case CONSSLERR: case CONIMPOSSIBLE: case FTPRERR: case FTPINVPASV: diff --git a/src/exits.h b/src/exits.h index dfe9516..98dde9a 100644 --- a/src/exits.h +++ b/src/exits.h @@ -1,5 +1,5 @@ -/* Internationalization related declarations. - Copyright (C) 2008, 2009, 2010, 2011 Free Software Foundation, Inc. +/* Exit status related declarations. + Copyright (C) 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. 
diff --git a/src/ftp-basic.c b/src/ftp-basic.c index 178fdfe..045d125 100644 --- a/src/ftp-basic.c +++ b/src/ftp-basic.c @@ -524,7 +524,10 @@ ftp_pasv (int csock, ip_address *addr, int *port) for (s += 4; *s && !c_isdigit (*s); s++) ; if (!*s) - return FTPINVPASV; + { + xfree (respline); + return FTPINVPASV; + } for (i = 0; i < 6; i++) { tmp[i] = 0; @@ -593,7 +596,10 @@ ftp_lpsv (int csock, ip_address *addr, int *port) for (s += 4; *s && !c_isdigit (*s); s++) ; if (!*s) - return FTPINVPASV; + { + xfree (respline); + return FTPINVPASV; + } /* First, get the address family */ af = 0; @@ -49,6 +49,7 @@ as that of the covered work. */ #include "netrc.h" #include "convert.h" /* for downloaded_file */ #include "recur.h" /* for INFINITE_RECURSION */ +#include "warc.h" #ifdef __VMS # include "vms.h" @@ -237,17 +238,17 @@ static uerr_t ftp_get_listing (struct url *, ccon *, struct fileinfo **); /* Retrieves a file with denoted parameters through opening an FTP connection to the server. It always closes the data connection, - and closes the control connection in case of error. */ + and closes the control connection in case of error. If warc_tmp + is non-NULL, the downloaded data will be written there as well. */ static uerr_t getftp (struct url *u, wgint passed_expected_bytes, wgint *qtyread, - wgint restval, ccon *con, int count) + wgint restval, ccon *con, int count, FILE *warc_tmp) { int csock, dtsock, local_sock, res; uerr_t err = RETROK; /* appease the compiler */ FILE *fp; - char *user, *passwd, *respline; - char *tms; - const char *tmrate; + char *respline, *tms; + const char *user, *passwd, *tmrate; int cmd = con->cmd; bool pasv_mode_open = false; wgint expected_bytes = 0; @@ -287,13 +288,6 @@ getftp (struct url *u, wgint passed_expected_bytes, wgint *qtyread, { char *host = con->proxy ? con->proxy->host : u->host; int port = con->proxy ? 
con->proxy->port : u->port; - char *logname = user; - - if (con->proxy) - { - /* If proxy is in use, log in as username@target-site. */ - logname = concat_strings (user, "@", u->host, (char *) 0); - } /* Login to the server: */ @@ -301,20 +295,10 @@ getftp (struct url *u, wgint passed_expected_bytes, wgint *qtyread, csock = connect_to_host (host, port); if (csock == E_HOST) - { - if (con->proxy) - xfree (logname); - return HOSTERR; - } else if (csock < 0) - { - if (con->proxy) - xfree (logname); - return (retryable_socket_connect_error (errno) ? CONERROR : CONIMPOSSIBLE); - } if (cmd & LEAVE_PENDING) con->csock = csock; @@ -326,10 +310,15 @@ getftp (struct url *u, wgint passed_expected_bytes, wgint *qtyread, quotearg_style (escape_quoting_style, user)); if (opt.server_response) logputs (LOG_ALWAYS, "\n"); - err = ftp_login (csock, logname, passwd); - if (con->proxy) - xfree (logname); + { + /* If proxy is in use, log in as username@target-site. */ + char *logname = concat_strings (user, "@", u->host, (char *) 0); + err = ftp_login (csock, logname, passwd); + xfree (logname); + } + else + err = ftp_login (csock, user, passwd); /* FTPRERR, FTPSRVERR, WRITEFAILED, FTPLOGREFUSED, FTPLOGINC */ switch (err) @@ -512,7 +501,7 @@ Error in server response, closing control connection.\n")); logputs (LOG_VERBOSE, _("==> CWD not needed.\n")); else { - char *targ = NULL; + const char *targ = NULL; int cwd_count; int cwd_end; int cwd_start; @@ -1152,13 +1141,25 @@ Error in server response, closing control connection.\n")); Elsewhere, define a constant "binary" flag. Isn't it nice to have distinct text and binary file types? */ -# define BIN_TYPE_TRANSFER (type_char != 'A') +/* 2011-09-30 SMS. + Added listing files to the set of non-"binary" (text, Stream_LF) + files. (Wget works either way, but other programs, like, say, text + editors, work better on listing files which have text attributes.) 
+ Now we use "binary" attributes for a binary ("IMAGE") transfer, + unless "--ftp-stmlf" was specified, and we always use non-"binary" + (text, Stream_LF) attributes for a listing file, or for an ASCII + transfer. + Tidied the VMS-specific BIN_TYPE_xxx macros, and changed the call to + fopen_excl() (restored?) to use BIN_TYPE_FILE instead of "true". +*/ #ifdef __VMS +# define BIN_TYPE_TRANSFER (type_char != 'A') +# define BIN_TYPE_FILE \ + ((!(cmd & DO_LIST)) && BIN_TYPE_TRANSFER && (opt.ftp_stmlf == 0)) # define FOPEN_OPT_ARGS "fop=sqo", "acc", acc_cb, &open_id # define FOPEN_OPT_ARGS_BIN "ctx=bin,stm", "rfm=fix", "mrs=512" FOPEN_OPT_ARGS -# define BIN_TYPE_FILE (BIN_TYPE_TRANSFER && (opt.ftp_stmlf == 0)) #else /* def __VMS */ -# define BIN_TYPE_FILE 1 +# define BIN_TYPE_FILE true #endif /* def __VMS [else] */ if (restval && !(con->cmd & DO_LIST)) @@ -1182,7 +1183,7 @@ Error in server response, closing control connection.\n")); } else if (opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct || opt.output_document || count > 0) - { + { if (opt.unlink && file_exists_p (con->target)) { int res = unlink (con->target); @@ -1217,7 +1218,7 @@ Error in server response, closing control connection.\n")); } else { - fp = fopen_excl (con->target, true); + fp = fopen_excl (con->target, BIN_TYPE_FILE); if (!fp && errno == EEXIST) { /* We cannot just invent a new name and use it (which is @@ -1262,7 +1263,7 @@ Error in server response, closing control connection.\n")); rd_size = 0; res = fd_read_body (dtsock, fp, expected_bytes ? expected_bytes - restval : 0, - restval, &rd_size, qtyread, &con->dltime, flags); + restval, &rd_size, qtyread, &con->dltime, flags, warc_tmp); tms = datetime_str (time (NULL)); tmrate = retr_rate (rd_size, con->dltime); @@ -1273,15 +1274,18 @@ Error in server response, closing control connection.\n")); if (!output_stream || con->cmd & DO_LIST) fclose (fp); - /* If fd_read_body couldn't write to fp, bail out. 
*/ - if (res == -2) + /* If fd_read_body couldn't write to fp or warc_tmp, bail out. */ + if (res == -2 || (warc_tmp != NULL && res == -3)) { logprintf (LOG_NOTQUIET, _("%s: %s, closing control connection.\n"), con->target, strerror (errno)); fd_close (csock); con->csock = -1; fd_close (dtsock); - return FWRITEERR; + if (res == -2) + return FWRITEERR; + else if (res == -3) + return WARC_TMP_FWRITEERR; } else if (res == -1) { @@ -1397,6 +1401,11 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi uerr_t err; struct_stat st; + /* Declare WARC variables. */ + bool warc_enabled = (opt.warc_filename != NULL); + FILE *warc_tmp = NULL; + ip_address *warc_ip = NULL; + /* Get the target, and set the name for the message accordingly. */ if ((f == NULL) && (con->target)) { @@ -1433,6 +1442,21 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi orig_lp = con->cmd & LEAVE_PENDING ? 1 : 0; + /* For file RETR requests, we can write a WARC record. + We record the file contents to a temporary file. */ + if (warc_enabled && (con->cmd & DO_RETR)) + { + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + return WARC_TMP_FOPENERR; + + if (!con->proxy && con->csock != -1) + { + warc_ip = (ip_address *) alloca (sizeof (ip_address)); + socket_ip_address (con->csock, warc_ip, ENDPOINT_PEER); + } + } + /* THE loop. */ do { @@ -1497,7 +1521,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi len = f->size; else len = 0; - err = getftp (u, len, &qtyread, restval, con, count); + + /* If we are working on a WARC record, getftp should also write + to the warc_tmp file. 
*/ + err = getftp (u, len, &qtyread, restval, con, count, warc_tmp); if (con->csock == -1) con->st &= ~DONE_CWD; @@ -1508,8 +1535,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi { case HOSTERR: case CONIMPOSSIBLE: case FWRITEERR: case FOPENERR: case FTPNSFOD: case FTPLOGINC: case FTPNOPASV: case CONTNOTSUPPORTED: - case UNLINKERR: + case UNLINKERR: case WARC_TMP_FWRITEERR: /* Fatal errors, give up. */ + if (warc_tmp != NULL) + fclose (warc_tmp); return err; case CONSOCKERR: case CONERROR: case FTPSRVERR: case FTPRERR: case WRITEFAILED: case FTPUNKNOWNTYPE: case FTPSYSERR: @@ -1577,6 +1606,19 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi xfree (hurl); } + if (warc_enabled && (con->cmd & DO_RETR)) + { + /* Create and store a WARC resource record for the retrieved file. */ + bool warc_res; + + warc_res = warc_write_resource_record (NULL, u->url, NULL, NULL, + warc_ip, NULL, warc_tmp, -1); + if (! warc_res) + return WARC_ERR; + + /* warc_write_resource_record has also closed warc_tmp. */ + } + if ((con->cmd & DO_LIST)) /* This is a directory listing file. */ { @@ -1880,8 +1922,10 @@ Already have correct symlink %s -> %s\n\n"), set_local_file (&actual_target, con->target); - /* If downloading a plain file, set valid (non-zero) permissions. */ - if (dlthis && (actual_target != NULL) && (f->type == FT_PLAINFILE)) + /* If downloading a plain file, and the user requested it, then + set valid (non-zero) permissions. */ + if (dlthis && (actual_target != NULL) && + (f->type == FT_PLAINFILE) && opt.preserve_perm) { if (f->perms) chmod (actual_target, f->perms); @@ -1914,7 +1958,9 @@ Already have correct symlink %s -> %s\n\n"), xfree (ofile); /* Break on fatals. 
*/ - if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR) + if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR + || err == WARC_ERR || err == WARC_TMP_FOPENERR + || err == WARC_TMP_FWRITEERR) break; con->cmd &= ~ (DO_CWD | DO_LOGIN); f = f->next; diff --git a/src/gnutls.c b/src/gnutls.c index 40a04ef..32c6d17 100644 --- a/src/gnutls.c +++ b/src/gnutls.c @@ -1,5 +1,5 @@ /* SSL support via GnuTLS library. - Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software + Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -54,15 +54,38 @@ as that of the covered work. */ # include "w32sock.h" #endif +#include "host.h" + +static int +key_type_to_gnutls_type (enum keyfile_type type) +{ + switch (type) + { + case keyfile_pem: + return GNUTLS_X509_FMT_PEM; + case keyfile_asn1: + return GNUTLS_X509_FMT_DER; + default: + abort (); + } +} + /* Note: some of the functions private to this file have names that begin with "wgnutls_" (e.g. wgnutls_read) so that they wouldn't be confused with actual gnutls functions -- such as the gnutls_read preprocessor macro. */ -static gnutls_certificate_credentials credentials; +static gnutls_certificate_credentials_t credentials; bool -ssl_init () +ssl_init (void) { + /* Becomes true if GnuTLS is initialized. */ + static bool ssl_initialized = false; + + /* GnuTLS should be initialized only once. */ + if (ssl_initialized) + return true; + const char *ca_directory; DIR *dir; @@ -101,15 +124,48 @@ ssl_init () closedir (dir); } + /* Use the private key from the cert file unless otherwise specified. */ + if (opt.cert_file && !opt.private_key) + { + opt.private_key = opt.cert_file; + opt.private_key_type = opt.cert_type; + } + /* Use the cert from the private key file unless otherwise specified. 
*/ + if (!opt.cert_file && opt.private_key) + { + opt.cert_file = opt.private_key; + opt.cert_type = opt.private_key_type; + } + + if (opt.cert_file && opt.private_key) + { + int type; + if (opt.private_key_type != opt.cert_type) + { + /* GnuTLS can't handle this */ + logprintf (LOG_NOTQUIET, _("ERROR: GnuTLS requires the key and the \ +cert to be of the same type.\n")); + } + + type = key_type_to_gnutls_type (opt.private_key_type); + + gnutls_certificate_set_x509_key_file (credentials, opt.cert_file, + opt.private_key, + type); + } + if (opt.ca_cert) gnutls_certificate_set_x509_trust_file (credentials, opt.ca_cert, GNUTLS_X509_FMT_PEM); + + ssl_initialized = true; + return true; } struct wgnutls_transport_context { - gnutls_session session; /* GnuTLS session handle */ + gnutls_session_t session; /* GnuTLS session handle */ int last_error; /* last error returned by read/write/... */ /* Since GnuTLS doesn't support the equivalent to recv(..., @@ -132,7 +188,7 @@ wgnutls_read_timeout (int fd, char *buf, int bufsize, void *arg, double timeout) int flags = 0; #endif int ret = 0; - struct ptimer *timer; + struct ptimer *timer = NULL; struct wgnutls_transport_context *ctx = arg; int timed_out = 0; @@ -142,64 +198,56 @@ wgnutls_read_timeout (int fd, char *buf, int bufsize, void *arg, double timeout) flags = fcntl (fd, F_GETFL, 0); if (flags < 0) return flags; + if (fcntl (fd, F_SETFL, flags | O_NONBLOCK)) + return -1; +#else + /* XXX: Assume it was blocking before. 
*/ + const int one = 1; + if (ioctl (fd, FIONBIO, &one) < 0) + return -1; #endif + timer = ptimer_new (); - if (timer == 0) + if (timer == NULL) return -1; } do { - double next_timeout = timeout - ptimer_measure (timer); - if (timeout && next_timeout < 0) - break; + double next_timeout = 0; + if (timeout) + { + next_timeout = timeout - ptimer_measure (timer); + if (next_timeout < 0) + break; + } ret = GNUTLS_E_AGAIN; if (timeout == 0 || gnutls_record_check_pending (ctx->session) || select_fd (fd, next_timeout, WAIT_FOR_READ)) { - if (timeout) - { -#ifdef F_GETFL - ret = fcntl (fd, F_SETFL, flags | O_NONBLOCK); - if (ret < 0) - return ret; -#else - /* XXX: Assume it was blocking before. */ - const int one = 1; - ret = ioctl (fd, FIONBIO, &one); - if (ret < 0) - return ret; -#endif - } - ret = gnutls_record_recv (ctx->session, buf, bufsize); - - if (timeout) - { - int status; -#ifdef F_GETFL - status = fcntl (fd, F_SETFL, flags); - if (status < 0) - return status; -#else - const int zero = 0; - status = ioctl (fd, FIONBIO, &zero); - if (status < 0) - return status; -#endif - } + timed_out = timeout && ptimer_measure (timer) >= timeout; } - - timed_out = timeout && ptimer_measure (timer) >= timeout; } while (ret == GNUTLS_E_INTERRUPTED || (ret == GNUTLS_E_AGAIN && !timed_out)); if (timeout) - ptimer_destroy (timer); + { + ptimer_destroy (timer); - if (timeout && timed_out && ret == GNUTLS_E_AGAIN) - errno = ETIMEDOUT; +#ifdef F_GETFL + if (fcntl (fd, F_SETFL, flags) < 0) + return -1; +#else + const int zero = 0; + if (ioctl (fd, FIONBIO, &zero) < 0) + return -1; +#endif + + if (timed_out && ret == GNUTLS_E_AGAIN) + errno = ETIMEDOUT; + } return ret; } @@ -207,11 +255,7 @@ wgnutls_read_timeout (int fd, char *buf, int bufsize, void *arg, double timeout) static int wgnutls_read (int fd, char *buf, int bufsize, void *arg) { -#ifdef F_GETFL - int flags = 0; -#endif int ret = 0; - struct ptimer *timer; struct wgnutls_transport_context *ctx = arg; if (ctx->peeklen) @@ -250,8 
+294,12 @@ static int wgnutls_poll (int fd, double timeout, int wait_for, void *arg) { struct wgnutls_transport_context *ctx = arg; - return ctx->peeklen || gnutls_record_check_pending (ctx->session) - || select_fd (fd, timeout, wait_for); + + if (timeout) + return ctx->peeklen || gnutls_record_check_pending (ctx->session) + || select_fd (fd, timeout, wait_for); + else + return ctx->peeklen || gnutls_record_check_pending (ctx->session); } static int @@ -260,15 +308,19 @@ wgnutls_peek (int fd, char *buf, int bufsize, void *arg) int read = 0; struct wgnutls_transport_context *ctx = arg; int offset = MIN (bufsize, ctx->peeklen); - if (bufsize > sizeof ctx->peekbuf) - bufsize = sizeof ctx->peekbuf; if (ctx->peeklen) - memcpy (buf, ctx->peekbuf, offset); + { + memcpy (buf, ctx->peekbuf, offset); + return offset; + } + + if (bufsize > sizeof ctx->peekbuf) + bufsize = sizeof ctx->peekbuf; if (bufsize > offset) { - if (gnutls_record_check_pending (ctx->session) <= 0 + if (opt.read_timeout && gnutls_record_check_pending (ctx->session) == 0 && select_fd (fd, 0.0, WAIT_FOR_READ) <= 0) read = 0; else @@ -320,18 +372,26 @@ static struct transport_implementation wgnutls_transport = }; bool -ssl_connect_wget (int fd) +ssl_connect_wget (int fd, const char *hostname) { struct wgnutls_transport_context *ctx; - gnutls_session session; + gnutls_session_t session; int err; gnutls_init (&session, GNUTLS_CLIENT); + + /* We set the server name but only if it's not an IP address. */ + if (! 
is_valid_ip_address (hostname)) + { + gnutls_server_name_set (session, GNUTLS_NAME_DNS, hostname, + strlen (hostname)); + } + gnutls_set_default_priority (session); gnutls_credentials_set (session, GNUTLS_CRD_CERTIFICATE, credentials); #ifndef FD_TO_SOCKET # define FD_TO_SOCKET(X) (X) #endif - gnutls_transport_set_ptr (session, (gnutls_transport_ptr) FD_TO_SOCKET (fd)); + gnutls_transport_set_ptr (session, (gnutls_transport_ptr_t) FD_TO_SOCKET (fd)); err = 0; #if HAVE_GNUTLS_PRIORITY_SET_DIRECT @@ -438,8 +498,8 @@ ssl_check_certificate (int fd, const char *host) if (gnutls_certificate_type_get (ctx->session) == GNUTLS_CRT_X509) { time_t now = time (NULL); - gnutls_x509_crt cert; - const gnutls_datum *cert_list; + gnutls_x509_crt_t cert; + const gnutls_datum_t *cert_list; unsigned int cert_list_size; if ((err = gnutls_x509_crt_init (&cert)) < 0) @@ -423,14 +423,14 @@ grow_hash_table (struct hash_table *ht) table if necessary. */ void -hash_table_put (struct hash_table *ht, const void *key, void *value) +hash_table_put (struct hash_table *ht, const void *key, const void *value) { struct cell *c = find_cell (ht, key); if (CELL_OCCUPIED (c)) { /* update existing item */ c->key = (void *)key; /* const? */ - c->value = value; + c->value = (void *)value; return; } @@ -445,7 +445,7 @@ hash_table_put (struct hash_table *ht, const void *key, void *value) /* add new item */ ++ht->count; c->key = (void *)key; /* const? */ - c->value = value; + c->value = (void *)value; } /* Remove KEY->value mapping from HT. 
Return 0 if there was no such @@ -42,7 +42,7 @@ int hash_table_get_pair (const struct hash_table *, const void *, void *, void *); int hash_table_contains (const struct hash_table *, const void *); -void hash_table_put (struct hash_table *, const void *, void *); +void hash_table_put (struct hash_table *, const void *, const void *); int hash_table_remove (struct hash_table *, const void *); void hash_table_clear (struct hash_table *); @@ -1,6 +1,6 @@ /* Host name resolution and matching. Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, + 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -914,3 +914,18 @@ host_cleanup (void) host_name_addresses_map = NULL; } } + +bool +is_valid_ip_address (const char *name) +{ + const char *endp; + + endp = name + strlen(name); + if (is_valid_ipv4_address (name, endp)) + return true; +#ifdef ENABLE_IPV6 + if (is_valid_ipv6_address (name, endp)) + return true; +#endif + return false; +} @@ -1,6 +1,6 @@ /* Declarations for host.c Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, + 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. 
@@ -98,6 +98,8 @@ const char *print_address (const ip_address *); bool is_valid_ipv6_address (const char *, const char *); #endif +bool is_valid_ip_address (const char *name); + bool accept_domain (struct url *); bool sufmatch (const char **, const char *); diff --git a/src/html-parse.c b/src/html-parse.c index 9fafd8f..20791cd 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -280,7 +280,7 @@ struct tagstack_item { struct tagstack_item *next; }; -struct tagstack_item * +static struct tagstack_item * tagstack_push (struct tagstack_item **head, struct tagstack_item **tail) { struct tagstack_item *ts = xmalloc(sizeof(struct tagstack_item)); @@ -301,7 +301,7 @@ tagstack_push (struct tagstack_item **head, struct tagstack_item **tail) } /* remove ts and everything after it from the stack */ -void +static void tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail, struct tagstack_item *ts) { @@ -343,7 +343,7 @@ tagstack_pop (struct tagstack_item **head, struct tagstack_item **tail, } } -struct tagstack_item * +static struct tagstack_item * tagstack_find (struct tagstack_item *tail, const char *tagname_begin, const char *tagname_end) { diff --git a/src/html-url.c b/src/html-url.c index f5ab293..55563e2 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -1,6 +1,6 @@ /* Collect URLs from HTML source. Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, - 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. + 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. 
@@ -675,8 +675,9 @@ collect_tags_mapper (struct taginfo *tag, void *arg) check_style_attr (tag, ctx); - if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) && - tag->contents_begin && tag->contents_end) + if (tag->end_tag_p && (0 == strcasecmp (tag->name, "style")) + && tag->contents_begin && tag->contents_end + && tag->contents_begin <= tag->contents_end) { /* parse contents */ get_urls_css (ctx, tag->contents_begin - ctx->text, @@ -829,7 +830,7 @@ get_urls_file (const char *file) return head; } -void +static void cleanup_html_url (void) { /* Destroy the hash tables. The hash table keys and values are not @@ -1,6 +1,6 @@ /* HTTP support. Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, + 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -58,6 +58,7 @@ as that of the covered work. */ #include "md5.h" #include "convert.h" #include "spider.h" +#include "warc.h" #ifdef TESTING #include "test.h" @@ -230,7 +231,7 @@ release_header (struct request_header *hdr) */ static void -request_set_header (struct request *req, char *name, char *value, +request_set_header (struct request *req, const char *name, const char *value, enum rp release_policy) { struct request_header *hdr; @@ -241,7 +242,7 @@ request_set_header (struct request *req, char *name, char *value, /* A NULL value is a no-op; if freeing the name is requested, free it now to avoid leaks. */ if (release_policy == rel_name || release_policy == rel_both) - xfree (name); + xfree ((void *)name); return; } @@ -252,8 +253,8 @@ request_set_header (struct request *req, char *name, char *value, { /* Replace existing header. 
*/ release_header (hdr); - hdr->name = name; - hdr->value = value; + hdr->name = (void *)name; + hdr->value = (void *)value; hdr->release_policy = release_policy; return; } @@ -267,8 +268,8 @@ request_set_header (struct request *req, char *name, char *value, req->headers = xrealloc (req->headers, req->hcapacity * sizeof (*hdr)); } hdr = &req->headers[req->hcount++]; - hdr->name = name; - hdr->value = value; + hdr->name = (void *)name; + hdr->value = (void *)value; hdr->release_policy = release_policy; } @@ -295,7 +296,7 @@ request_set_user_header (struct request *req, const char *header) the header was actually removed, false otherwise. */ static bool -request_remove_header (struct request *req, char *name) +request_remove_header (struct request *req, const char *name) { int i; for (i = 0; i < req->hcount; i++) @@ -320,10 +321,12 @@ request_remove_header (struct request *req, char *name) p += A_len; \ } while (0) -/* Construct the request and write it to FD using fd_write. */ +/* Construct the request and write it to FD using fd_write. + If warc_tmp is set to a file pointer, the request string will + also be written to that file. */ static int -request_send (const struct request *req, int fd) +request_send (const struct request *req, int fd, FILE *warc_tmp) { char *request_string, *p; int i, size, write_error; @@ -374,6 +377,13 @@ request_send (const struct request *req, int fd) if (write_error < 0) logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"), fd_errstr (fd)); + else if (warc_tmp != NULL) + { + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp); + if (warc_tmp_written != size - 1) + return -2; + } return write_error; } @@ -444,10 +454,12 @@ register_basic_auth_host (const char *hostname) /* Send the contents of FILE_NAME to SOCK. 
Make sure that exactly PROMISED_SIZE bytes are sent over the wire -- if the file is - longer, read only that much; if the file is shorter, report an error. */ + longer, read only that much; if the file is shorter, report an error. + If warc_tmp is set to a file pointer, the post data will + also be written to that file. */ static int -post_file (int sock, const char *file_name, wgint promised_size) +post_file (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp) { static char chunk[8192]; wgint written = 0; @@ -472,6 +484,16 @@ post_file (int sock, const char *file_name, wgint promised_size) fclose (fp); return -1; } + if (warc_tmp != NULL) + { + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp); + if (warc_tmp_written != towrite) + { + fclose (fp); + return -2; + } + } written += towrite; } fclose (fp); @@ -929,9 +951,12 @@ skip_short_body (int fd, wgint contlen, bool chunked) break; remaining_chunk_size = strtol (line, &endl, 16); + xfree (line); + if (remaining_chunk_size == 0) { - fd_read_line (fd); + line = fd_read_line (fd); + xfree_null (line); break; } } @@ -956,8 +981,13 @@ skip_short_body (int fd, wgint contlen, bool chunked) { remaining_chunk_size -= ret; if (remaining_chunk_size == 0) - if (fd_read_line (fd) == NULL) - return false; + { + char *line = fd_read_line (fd); + if (line == NULL) + return false; + else + xfree (line); + } } /* Safe even if %.*s bogusly expects terminating \0 because @@ -1462,6 +1492,135 @@ File %s already there; not retrieving.\n\n"), quote (filename)); *dt |= TEXTHTML; } +/* Download the response body from the socket and writes it to + an output file. The headers have already been read from the + socket. If WARC is enabled, the response body will also be + written to a WARC response record. + + hs, contlen, contrange, chunked_transfer_encoding and url are + parameters from the gethttp method. fp is a pointer to the + output file. 
+ + url, warc_timestamp_str, warc_request_uuid, warc_ip, type + and statcode will be saved in the headers of the WARC record. + The head parameter contains the HTTP headers of the response. + + If fp is NULL and WARC is enabled, the response body will be + written only to the WARC file. If WARC is disabled and fp + is a file pointer, the data will be written to the file. + If fp is a file pointer and WARC is enabled, the body will + be written to both destinations. + + Returns the error code. */ +static int +read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen, + wgint contrange, bool chunked_transfer_encoding, + char *url, char *warc_timestamp_str, char *warc_request_uuid, + ip_address *warc_ip, char *type, int statcode, char *head) +{ + int warc_payload_offset = 0; + FILE *warc_tmp = NULL; + int warcerr = 0; + + if (opt.warc_filename != NULL) + { + /* Open a temporary file where we can write the response before we + add it to the WARC record. */ + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + warcerr = WARC_TMP_FOPENERR; + + if (warcerr == 0) + { + /* We should keep the response headers for the WARC record. */ + int head_len = strlen (head); + int warc_tmp_written = fwrite (head, 1, head_len, warc_tmp); + if (warc_tmp_written != head_len) + warcerr = WARC_TMP_FWRITEERR; + warc_payload_offset = head_len; + } + + if (warcerr != 0) + { + if (warc_tmp != NULL) + fclose (warc_tmp); + return warcerr; + } + } + + if (fp != NULL) + { + /* This confuses the timestamping code that checks for file size. + #### The timestamping code should be smarter about file size. */ + if (opt.save_headers && hs->restval == 0) + fwrite (head, 1, strlen (head), fp); + } + + /* Read the response body. */ + int flags = 0; + if (contlen != -1) + /* If content-length is present, read that much; otherwise, read + until EOF. The HTTP spec doesn't require the server to + actually close the connection when it's done sending data. 
*/ + flags |= rb_read_exactly; + if (fp != NULL && hs->restval > 0 && contrange == 0) + /* If the server ignored our range request, instruct fd_read_body + to skip the first RESTVAL bytes of body. */ + flags |= rb_skip_startpos; + if (chunked_transfer_encoding) + flags |= rb_chunked_transfer_encoding; + + hs->len = hs->restval; + hs->rd_size = 0; + /* Download the response body and write it to fp. + If we are working on a WARC file, we simultaneously write the + response body to warc_tmp. */ + hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0, + hs->restval, &hs->rd_size, &hs->len, &hs->dltime, + flags, warc_tmp); + if (hs->res >= 0) + { + if (warc_tmp != NULL) + { + /* Create a response record and write it to the WARC file. + Note: per the WARC standard, the request and response should share + the same date header. We re-use the timestamp of the request. + The response record should also refer to the uuid of the request. */ + bool r = warc_write_response_record (url, warc_timestamp_str, + warc_request_uuid, warc_ip, + warc_tmp, warc_payload_offset, + type, statcode, hs->newloc); + + /* warc_write_response_record has closed warc_tmp. */ + + if (! r) + return WARC_ERR; + } + + return RETRFINISHED; + } + + if (warc_tmp != NULL) + fclose (warc_tmp); + + if (hs->res == -2) + { + /* Error while writing to fd. */ + return FWRITEERR; + } + else if (hs->res == -3) + { + /* Error while writing to warc_tmp. */ + return WARC_TMP_FWRITEERR; + } + else + { + /* A read error! 
*/ + hs->rderrmsg = xstrdup (fd_errstr (sock)); + return RETRFINISHED; + } +} + #define BEGINS_WITH(line, string_constant) \ (!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \ && (c_isspace (line[sizeof (string_constant) - 1]) \ @@ -1519,9 +1678,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, wgint contlen, contrange; struct url *conn; FILE *fp; + int err; int sock = -1; - int flags; /* Set to 1 when the authorization has already been sent and should not be tried again. */ @@ -1547,6 +1706,14 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, char hdrval[256]; char *message; + /* Declare WARC variables. */ + bool warc_enabled = (opt.warc_filename != NULL); + FILE *warc_tmp = NULL; + char warc_timestamp_str [21]; + char warc_request_uuid [48]; + ip_address *warc_ip = NULL; + off_t warc_payload_offset = -1; + /* Whether this connection will be kept alive after the HTTP request is done. */ bool keep_alive; @@ -1792,11 +1959,17 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, #endif &host_lookup_failed)) { + int family = socket_family (pconn.socket, ENDPOINT_PEER); sock = pconn.socket; using_ssl = pconn.ssl; - logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"), - quotearg_style (escape_quoting_style, pconn.host), - pconn.port); + if (family == AF_INET6) + logprintf (LOG_VERBOSE, _("Reusing existing connection to [%s]:%d.\n"), + quotearg_style (escape_quoting_style, pconn.host), + pconn.port); + else + logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"), + quotearg_style (escape_quoting_style, pconn.host), + pconn.port); DEBUGP (("Reusing fd %d.\n", sock)); if (pconn.authorized) /* If the connection is already authorized, the "Basic" @@ -1852,11 +2025,12 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, that the contents of Host would be exactly the same as the contents of CONNECT. 
*/ - write_error = request_send (connreq, sock); + write_error = request_send (connreq, sock, 0); request_free (connreq); if (write_error < 0) { CLOSE_INVALIDATE (sock); + request_free (req); return WRITEFAILED; } @@ -1866,6 +2040,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"), fd_errstr (sock)); CLOSE_INVALIDATE (sock); + request_free (req); return HERR; } message = NULL; @@ -1886,6 +2061,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, quotearg_style (escape_quoting_style, _("Malformed status line"))); xfree (head); + request_free (req); return HERR; } hs->message = xstrdup (message); @@ -1897,6 +2073,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"), message ? quotearg_style (escape_quoting_style, message) : "?"); xfree_null (message); + request_free (req); return CONSSLERR; } xfree_null (message); @@ -1909,14 +2086,16 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, if (conn->scheme == SCHEME_HTTPS) { - if (!ssl_connect_wget (sock)) + if (!ssl_connect_wget (sock, u->host)) { fd_close (sock); + request_free (req); return CONSSLERR; } else if (!ssl_check_certificate (sock, u->host)) { fd_close (sock); + request_free (req); return VERIFCERTERR; } using_ssl = true; @@ -1924,8 +2103,26 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, #endif /* HAVE_SSL */ } + /* Open the temporary file where we will write the request. */ + if (warc_enabled) + { + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + { + CLOSE_INVALIDATE (sock); + request_free (req); + return WARC_TMP_FOPENERR; + } + + if (! proxy) + { + warc_ip = (ip_address *) alloca (sizeof (ip_address)); + socket_ip_address (sock, warc_ip, ENDPOINT_PEER); + } + } + /* Send the request to server. 
*/ - write_error = request_send (req, sock); + write_error = request_send (req, sock, warc_tmp); if (write_error >= 0) { @@ -1933,16 +2130,39 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, { DEBUGP (("[POST data: %s]\n", opt.post_data)); write_error = fd_write (sock, opt.post_data, post_data_size, -1); + if (write_error >= 0 && warc_tmp != NULL) + { + /* Remember end of headers / start of payload. */ + warc_payload_offset = ftello (warc_tmp); + + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp); + if (warc_tmp_written != post_data_size) + write_error = -2; + } } else if (opt.post_file_name && post_data_size != 0) - write_error = post_file (sock, opt.post_file_name, post_data_size); + { + if (warc_tmp != NULL) + /* Remember end of headers / start of payload. */ + warc_payload_offset = ftello (warc_tmp); + + write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp); + } } if (write_error < 0) { CLOSE_INVALIDATE (sock); request_free (req); - return WRITEFAILED; + + if (warc_tmp != NULL) + fclose (warc_tmp); + + if (write_error == -2) + return WARC_TMP_FWRITEERR; + else + return WRITEFAILED; } logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "), proxy ? "Proxy" : "HTTP"); @@ -1950,6 +2170,29 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, contrange = 0; *dt &= ~RETROKF; + + if (warc_enabled) + { + bool warc_result; + /* Generate a timestamp and uuid for this request. */ + warc_timestamp (warc_timestamp_str); + warc_uuid_str (warc_request_uuid); + + /* Create a request record and store it in the WARC file. */ + warc_result = warc_write_request_record (u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, + warc_tmp, warc_payload_offset); + if (! warc_result) + { + CLOSE_INVALIDATE (sock); + request_free (req); + return WARC_ERR; + } + + /* warc_write_request_record has also closed warc_tmp. 
*/ + } + + read_header: head = read_http_response_head (sock); if (!head) @@ -1985,6 +2228,7 @@ read_header: quotearg_style (escape_quoting_style, _("Malformed status line"))); CLOSE_INVALIDATE (sock); + resp_free (resp); request_free (req); xfree (head); return HERR; @@ -1993,6 +2237,7 @@ read_header: if (H_10X (statcode)) { DEBUGP (("Ignoring response\n")); + resp_free (resp); xfree (head); goto read_header; } @@ -2073,11 +2318,42 @@ read_header: if (statcode == HTTP_STATUS_UNAUTHORIZED) { /* Authorization is required. */ - if (keep_alive && !head_only - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - CLOSE_FINISH (sock); + + /* Normally we are not interested in the response body. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err; + type = resp_header_strdup (resp, "Content-Type"); + err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + xfree_null (type); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + request_free (req); + xfree_null (message); + resp_free (resp); + xfree (head); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (keep_alive && !head_only + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + pconn.authorized = false; if (!auth_finished && (user && passwd)) { @@ -2182,6 +2458,8 @@ read_header: retrieve the file. But if the output_document was given, then this test was already done and the file didn't exist. 
Hence the !opt.output_document */ get_file_flags (hs->local_file, dt); + request_free (req); + resp_free (resp); xfree (head); xfree_null (message); return RETRUNNEEDED; @@ -2325,11 +2603,42 @@ read_header: _("Location: %s%s\n"), hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"), hs->newloc ? _(" [following]") : ""); - if (keep_alive && !head_only - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - CLOSE_FINISH (sock); + + /* In case the caller cares to look... */ + hs->len = 0; + hs->res = 0; + hs->restval = 0; + + /* Normally we are not interested in the response body of a redirect. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + xfree_null (type); + xfree (head); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (keep_alive && !head_only + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + xfree_null (type); xfree (head); /* From RFC2616: The status codes 303 and 307 have @@ -2447,30 +2756,55 @@ read_header: logputs (LOG_VERBOSE, "\n"); } } - xfree_null (type); - type = NULL; /* We don't need it any more. */ /* Return if we have no intention of further downloading. */ - if (!(*dt & RETROKF) || head_only) + if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only) { /* In case the caller cares to look... */ hs->len = 0; hs->res = 0; - xfree_null (type); - if (head_only) - /* Pre-1.10 Wget used CLOSE_INVALIDATE here. 
Now we trust the - servers not to send body in response to a HEAD request, and - those that do will likely be caught by test_socket_open. - If not, they can be worked around using - `--no-http-keep-alive'. */ - CLOSE_FINISH (sock); - else if (keep_alive - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - /* Successfully skipped the body; also keep using the socket. */ - CLOSE_FINISH (sock); + hs->restval = 0; + + /* Normally we are not interested in the response body of a error responses. + But if we are writing a WARC file we are: we like to keep everyting. */ + if (warc_enabled) + { + int err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + xfree (head); + xfree_null (type); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (head_only) + /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the + servers not to send body in response to a HEAD request, and + those that do will likely be caught by test_socket_open. + If not, they can be worked around using + `--no-http-keep-alive'. */ + CLOSE_FINISH (sock); + else if (keep_alive + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + /* Successfully skipped the body; also keep using the socket. 
*/ + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + xfree (head); + xfree_null (type); return RETRFINISHED; } @@ -2512,6 +2846,7 @@ read_header: strerror (errno)); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return UNLINKERR; } } @@ -2539,6 +2874,7 @@ read_header: hs->local_file); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return FOPEN_EXCL_ERR; } } @@ -2547,6 +2883,7 @@ read_header: logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno)); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return FOPENERR; } } @@ -2560,49 +2897,26 @@ read_header: HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file)); } - /* This confuses the timestamping code that checks for file size. - #### The timestamping code should be smarter about file size. */ - if (opt.save_headers && hs->restval == 0) - fwrite (head, 1, strlen (head), fp); + + err = read_response_body (hs, sock, fp, contlen, contrange, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); /* Now we no longer need to store the response header. */ xfree (head); - - /* Download the request body. */ - flags = 0; - if (contlen != -1) - /* If content-length is present, read that much; otherwise, read - until EOF. The HTTP spec doesn't require the server to - actually close the connection when it's done sending data. */ - flags |= rb_read_exactly; - if (hs->restval > 0 && contrange == 0) - /* If the server ignored our range request, instruct fd_read_body - to skip the first RESTVAL bytes of body. */ - flags |= rb_skip_startpos; - - if (chunked_transfer_encoding) - flags |= rb_chunked_transfer_encoding; - - hs->len = hs->restval; - hs->rd_size = 0; - hs->res = fd_read_body (sock, fp, contlen != -1 ? 
contlen : 0, - hs->restval, &hs->rd_size, &hs->len, &hs->dltime, - flags); + xfree_null (type); if (hs->res >= 0) CLOSE_FINISH (sock); else - { - if (hs->res < 0) - hs->rderrmsg = xstrdup (fd_errstr (sock)); - CLOSE_INVALIDATE (sock); - } + CLOSE_INVALIDATE (sock); if (!output_stream) fclose (fp); - if (hs->res == -2) - return FWRITEERR; - return RETRFINISHED; + + return err; } /* The genuine HTTP loop! This is the part where the retrieval is @@ -2626,6 +2940,12 @@ http_loop (struct url *u, struct url *original_url, char **newloc, char *file_name; bool force_full_retrieve = false; + + /* If we are writing to a WARC file: always retrieve the whole file. */ + if (opt.warc_filename != NULL) + force_full_retrieve = true; + + /* Assert that no value for *LOCAL_FILE was passed. */ assert (local_file == NULL || *local_file == NULL); @@ -2795,6 +3115,18 @@ Spider mode enabled. Check if remote file exists.\n")); /* Fatal errors just return from the function. */ ret = err; goto exit; + case WARC_ERR: + /* A fatal WARC error. */ + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, _("Cannot write to WARC file.\n")); + ret = err; + goto exit; + case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR: + /* A fatal WARC error. */ + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n")); + ret = err; + goto exit; case CONSSLERR: /* Another fatal error. 
*/ logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n")); @@ -3323,19 +3655,23 @@ digest_authentication_encode (const char *au, const char *user, const char *passwd, const char *method, const char *path) { - static char *realm, *opaque, *nonce; + static char *realm, *opaque, *nonce, *qop; static struct { const char *name; char **variable; } options[] = { { "realm", &realm }, { "opaque", &opaque }, - { "nonce", &nonce } + { "nonce", &nonce }, + { "qop", &qop } }; + char cnonce[16] = ""; char *res; + size_t res_size; param_token name, value; - realm = opaque = nonce = NULL; + + realm = opaque = nonce = qop = NULL; au += 6; /* skip over `Digest' */ while (extract_param (&au, &name, &value, ',')) @@ -3351,11 +3687,19 @@ digest_authentication_encode (const char *au, const char *user, break; } } + + if (qop != NULL && strcmp(qop,"auth")) + { + logprintf (LOG_NOTQUIET, _("Unsupported quality of protection '%s'.\n"), qop); + user = NULL; /* force freeing mem and return */ + } + if (!realm || !nonce || !user || !passwd || !path || !method) { xfree_null (realm); xfree_null (opaque); xfree_null (nonce); + xfree_null (qop); return NULL; } @@ -3384,27 +3728,69 @@ digest_authentication_encode (const char *au, const char *user, md5_finish_ctx (&ctx, hash); dump_hash (a2buf, hash); - /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */ - md5_init_ctx (&ctx); - md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx); - md5_process_bytes ((unsigned char *)":", 1, &ctx); - md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); - md5_process_bytes ((unsigned char *)":", 1, &ctx); - md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx); - md5_finish_ctx (&ctx, hash); + if (!strcmp(qop,"auth")) + { + /* RFC 2617 Digest Access Authentication */ + /* generate random hex string */ + snprintf(cnonce, sizeof(cnonce), "%08x", random_number(INT_MAX)); + + /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" noncecount ":" clientnonce ":" qop ": " 
A2BUF) */ + md5_init_ctx (&ctx); + md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)"00000001", 8, &ctx); /* TODO: keep track of server nonce values */ + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)cnonce, strlen(cnonce), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)qop, strlen(qop), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_finish_ctx (&ctx, hash); + } + else + { + /* RFC 2069 Digest Access Authentication */ + /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */ + md5_init_ctx (&ctx); + md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); + md5_process_bytes ((unsigned char *)":", 1, &ctx); + md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx); + md5_finish_ctx (&ctx, hash); + } + dump_hash (response_digest, hash); - res = xmalloc (strlen (user) - + strlen (user) - + strlen (realm) - + strlen (nonce) - + strlen (path) - + 2 * MD5_DIGEST_SIZE /*strlen (response_digest)*/ - + (opaque ? strlen (opaque) : 0) - + 128); - sprintf (res, "Digest \ -username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"", - user, realm, nonce, path, response_digest); + res_size = strlen (user) + + strlen (user) + + strlen (realm) + + strlen (nonce) + + strlen (path) + + 2 * MD5_DIGEST_SIZE /*strlen (response_digest)*/ + + (opaque ? strlen (opaque) : 0) + + (qop ? 
128: 0) + + 128; + + res = xmalloc (res_size); + + if (!strcmp(qop,"auth")) + { + snprintf (res, res_size, "Digest "\ + "username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\""\ + ", qop=auth, nc=00000001, cnonce=\"%s\"", + user, realm, nonce, path, response_digest, cnonce); + + } + else + { + snprintf (res, res_size, "Digest "\ + "username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"", + user, realm, nonce, path, response_digest); + } + if (opaque) { char *p = res + strlen (res); @@ -1,6 +1,6 @@ /* Reading/parsing the initialization file. Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, + 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -30,6 +30,7 @@ shall include the source code for the parts of OpenSSL used as well as that of the covered work. */ #include "wget.h" +#include "exits.h" #include <stdio.h> #include <stdlib.h> @@ -46,6 +47,10 @@ as that of the covered work. */ # endif #endif +#include <regex.h> +#ifdef HAVE_LIBPCRE +# include <pcre.h> +#endif #ifdef HAVE_PWD_H # include <pwd.h> @@ -62,6 +67,7 @@ as that of the covered work. 
*/ #include "res.h" /* for res_cleanup */ #include "http.h" /* for http_cleanup */ #include "retr.h" /* for output_stream */ +#include "warc.h" /* for warc_close */ #ifdef TESTING #include "test.h" @@ -88,12 +94,15 @@ CMD_DECLARE (cmd_vector); CMD_DECLARE (cmd_spec_dirstruct); CMD_DECLARE (cmd_spec_header); +CMD_DECLARE (cmd_spec_warc_header); CMD_DECLARE (cmd_spec_htmlify); CMD_DECLARE (cmd_spec_mirror); CMD_DECLARE (cmd_spec_prefer_family); CMD_DECLARE (cmd_spec_progress); CMD_DECLARE (cmd_spec_recursive); +CMD_DECLARE (cmd_spec_regex_type); CMD_DECLARE (cmd_spec_restrict_file_names); +CMD_DECLARE (cmd_spec_report_speed); #ifdef HAVE_SSL CMD_DECLARE (cmd_spec_secure_protocol); #endif @@ -115,6 +124,7 @@ static const struct { } commands[] = { /* KEEP THIS LIST ALPHABETICALLY SORTED */ { "accept", &opt.accepts, cmd_vector }, + { "acceptregex", &opt.acceptregex_s, cmd_string }, { "addhostdir", &opt.add_hostdir, cmd_boolean }, { "adjustextension", &opt.adjust_extension, cmd_boolean }, { "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */ @@ -139,6 +149,7 @@ static const struct { { "chooseconfig", &opt.choose_config, cmd_file }, { "connecttimeout", &opt.connect_timeout, cmd_time }, { "contentdisposition", &opt.content_disposition, cmd_boolean }, + { "contentonerror", &opt.content_on_error, cmd_boolean }, { "continue", &opt.always_rest, cmd_boolean }, { "convertlinks", &opt.convert_links, cmd_boolean }, { "cookies", &opt.cookies, cmd_boolean }, @@ -213,7 +224,7 @@ static const struct { { "postdata", &opt.post_data, cmd_string }, { "postfile", &opt.post_file_name, cmd_file }, { "preferfamily", NULL, cmd_spec_prefer_family }, - { "preservepermissions", &opt.preserve_perm, cmd_boolean },/* deprecated */ + { "preservepermissions", &opt.preserve_perm, cmd_boolean }, #ifdef HAVE_SSL { "privatekey", &opt.private_key, cmd_file }, { "privatekeytype", &opt.private_key_type, cmd_cert_type }, @@ -233,10 +244,13 @@ static const struct { { "reclevel", &opt.reclevel, 
cmd_number_inf }, { "recursive", NULL, cmd_spec_recursive }, { "referer", &opt.referer, cmd_string }, + { "regextype", &opt.regex_type, cmd_spec_regex_type }, { "reject", &opt.rejects, cmd_vector }, + { "rejectregex", &opt.rejectregex_s, cmd_string }, { "relativeonly", &opt.relative_only, cmd_boolean }, { "remoteencoding", &opt.encoding_remote, cmd_string }, { "removelisting", &opt.remove_listing, cmd_boolean }, + { "reportspeed", &opt.report_bps, cmd_spec_report_speed}, { "restrictfilenames", NULL, cmd_spec_restrict_file_names }, { "retrsymlinks", &opt.retr_symlinks, cmd_boolean }, { "retryconnrefused", &opt.retry_connrefused, cmd_boolean }, @@ -263,6 +277,17 @@ static const struct { { "verbose", NULL, cmd_spec_verbose }, { "wait", &opt.wait, cmd_time }, { "waitretry", &opt.waitretry, cmd_time }, + { "warccdx", &opt.warc_cdx_enabled, cmd_boolean }, + { "warccdxdedup", &opt.warc_cdx_dedup_filename, cmd_file }, +#ifdef HAVE_LIBZ + { "warccompression", &opt.warc_compression_enabled, cmd_boolean }, +#endif + { "warcdigests", &opt.warc_digests_enabled, cmd_boolean }, + { "warcfile", &opt.warc_filename, cmd_file }, + { "warcheader", NULL, cmd_spec_warc_header }, + { "warckeeplog", &opt.warc_keep_log, cmd_boolean }, + { "warcmaxsize", &opt.warc_maxsize, cmd_bytes }, + { "warctempdir", &opt.warc_tempdir, cmd_directory }, #ifdef USE_WATT32 { "wdebug", &opt.wdebug, cmd_boolean }, #endif @@ -347,6 +372,8 @@ defaults (void) opt.restrict_files_nonascii = false; opt.restrict_files_case = restrict_no_case_restriction; + opt.regex_type = regex_type_posix; + opt.max_redirect = 20; opt.waitretry = 10; @@ -361,6 +388,18 @@ defaults (void) opt.useservertimestamps = true; opt.show_all_dns_entries = false; + + opt.warc_maxsize = 0; /* 1024 * 1024 * 1024; */ +#ifdef HAVE_LIBZ + opt.warc_compression_enabled = true; +#else + opt.warc_compression_enabled = false; +#endif + opt.warc_digests_enabled = true; + opt.warc_cdx_enabled = false; + opt.warc_cdx_dedup_filename = NULL; + 
opt.warc_tempdir = NULL; + opt.warc_keep_log = true; } /* Return the user's home directory (strdup-ed), or NULL if none is @@ -443,7 +482,7 @@ wgetrc_env_file_name (void) return NULL; } -/* Check for the existance of '$HOME/.wgetrc' and return it's path +/* Check for the existance of '$HOME/.wgetrc' and return its path if it exists and is set. */ char * wgetrc_user_file_name (void) @@ -597,21 +636,34 @@ initialize (void) variable has been set. For internal testing purposes only! */ env_sysrc = getenv ("SYSTEM_WGETRC"); if (env_sysrc && file_exists_p (env_sysrc)) - ok &= run_wgetrc (env_sysrc); + { + ok &= run_wgetrc (env_sysrc); + /* If there are any problems parsing the system wgetrc file, tell + the user and exit */ + if (! ok) + { + fprintf (stderr, _("\ +Parsing system wgetrc file (env SYSTEM_WGETRC) failed. Please check\n\ +'%s',\n\ +or specify a different file using --config.\n"), env_sysrc); + exit (2); + } + } /* Otherwise, if SYSTEM_WGETRC is defined, use it. */ #ifdef SYSTEM_WGETRC else if (file_exists_p (SYSTEM_WGETRC)) ok &= run_wgetrc (SYSTEM_WGETRC); -#endif /* If there are any problems parsing the system wgetrc file, tell the user and exit */ if (! ok) { fprintf (stderr, _("\ -Parsing system wgetrc file failed, please check '%s'. \ -Or specify a different file using --config\n"), SYSTEM_WGETRC); +Parsing system wgetrc file failed. Please check\n\ +'%s',\n\ +or specify a different file using --config.\n"), SYSTEM_WGETRC); exit (2); } +#endif /* Override it with your own, if one exists. */ file = wgetrc_file_name (); if (!file) @@ -1222,6 +1274,27 @@ cmd_spec_header (const char *com, const char *val, void *place_ignored) } static bool +cmd_spec_warc_header (const char *com, const char *val, void *place_ignored) +{ + /* Empty value means reset the list of headers. 
*/ + if (*val == '\0') + { + free_vec (opt.warc_user_headers); + opt.warc_user_headers = NULL; + return true; + } + + if (!check_user_specified_header (val)) + { + fprintf (stderr, _("%s: %s: Invalid WARC header %s.\n"), + exec_name, com, quote (val)); + return false; + } + opt.warc_user_headers = vec_append (opt.warc_user_headers, val); + return true; +} + +static bool cmd_spec_htmlify (const char *com, const char *val, void *place_ignored) { int flag = cmd_boolean (com, val, &opt.htmlify); @@ -1308,6 +1381,25 @@ cmd_spec_recursive (const char *com, const char *val, void *place_ignored) return true; } +/* Validate --regex-type and set the choice. */ + +static bool +cmd_spec_regex_type (const char *com, const char *val, void *place_ignored) +{ + static const struct decode_item choices[] = { + { "posix", regex_type_posix }, +#ifdef HAVE_LIBPCRE + { "pcre", regex_type_pcre }, +#endif + }; + int regex_type = regex_type_posix; + int ok = decode_string (val, choices, countof (choices), ®ex_type); + if (!ok) + fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val)); + opt.regex_type = regex_type; + return ok; +} + static bool cmd_spec_restrict_file_names (const char *com, const char *val, void *place_ignored) { @@ -1362,6 +1454,15 @@ cmd_spec_restrict_file_names (const char *com, const char *val, void *place_igno return true; } +static bool +cmd_spec_report_speed (const char *com, const char *val, void *place_ignored) +{ + opt.report_bps = strcasecmp (val, "bits") == 0; + if (!opt.report_bps) + fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val)); + return opt.report_bps; +} + #ifdef HAVE_SSL static bool cmd_spec_secure_protocol (const char *com, const char *val, void *place) @@ -1576,8 +1677,16 @@ cleanup (void) { /* Free external resources, close files, etc. */ + /* Close WARC file. 
*/ + if (opt.warc_filename != 0) + warc_close (); + + log_close (); + if (output_stream) - fclose (output_stream); + if (fclose (output_stream) == EOF) + inform_exit_status (CLOSEFAILED); + /* No need to check for error because Wget flushes its output (and checks for errors) after any data arrives. */ @@ -1597,6 +1706,9 @@ cleanup (void) host_cleanup (); log_cleanup (); + for (i = 0; i < nurl; i++) + xfree (url[i]); + { extern acc_t *netrc_list; free_netrc (netrc_list); @@ -1625,6 +1737,7 @@ cleanup (void) xfree_null (opt.http_user); xfree_null (opt.http_passwd); free_vec (opt.user_headers); + free_vec (opt.warc_user_headers); # ifdef HAVE_SSL xfree_null (opt.cert_file); xfree_null (opt.private_key); @@ -79,6 +79,10 @@ as that of the covered work. */ logging is inhibited, logfp is set back to NULL. */ static FILE *logfp; +/* A second file descriptor pointing to the temporary log file for the + WARC writer. If WARC writing is disabled, this is NULL. */ +static FILE *warclogfp; + /* If true, it means logging is inhibited, i.e. nothing is printed or stored. */ static bool inhibit_logging; @@ -304,6 +308,31 @@ get_log_fp (void) return logfp; return stderr; } + +/* Returns the file descriptor for the secondary log file. This is + WARCLOGFP, except if called before log_init, in which case it + returns stderr. This is useful in case someone calls a logging + function before log_init. + + If logging is inhibited, return NULL. */ + +static FILE * +get_warc_log_fp (void) +{ + if (inhibit_logging) + return NULL; + if (warclogfp) + return warclogfp; + return NULL; +} + +/* Sets the file descriptor for the secondary log file. */ + +void +log_set_warc_log_fp (FILE * fp) +{ + warclogfp = fp; +} /* Log a literal string S. The string is logged as-is, without a newline appended. 
*/ @@ -312,13 +341,17 @@ void logputs (enum log_options o, const char *s) { FILE *fp; + FILE *warcfp; check_redirect_output (); if ((fp = get_log_fp ()) == NULL) return; + warcfp = get_warc_log_fp (); CHECK_VERBOSE (o); FPUTS (s, fp); + if (warcfp != NULL) + FPUTS (s, warcfp); if (save_context_p) saved_append (s); if (flush_log_p) @@ -356,8 +389,9 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt, int available_size = sizeof (smallmsg); int numwritten; FILE *fp = get_log_fp (); + FILE *warcfp = get_warc_log_fp (); - if (!save_context_p) + if (!save_context_p && warcfp == NULL) { /* In the simple case just call vfprintf(), to avoid needless allocation and games with vsnprintf(). */ @@ -407,8 +441,11 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt, } /* Writing succeeded. */ - saved_append (write_ptr); + if (save_context_p) + saved_append (write_ptr); FPUTS (write_ptr, fp); + if (warcfp != NULL) + FPUTS (write_ptr, warcfp); if (state->bigmsg) xfree (state->bigmsg); @@ -426,6 +463,7 @@ void logflush (void) { FILE *fp = get_log_fp (); + FILE *warcfp = get_warc_log_fp (); if (fp) { /* 2005-10-25 SMS. @@ -440,6 +478,10 @@ logflush (void) fflush (fp); #endif /* def __VMS [else] */ } + + if (warcfp != NULL) + fflush (warcfp); + needs_flushing = false; } @@ -573,14 +615,14 @@ log_init (const char *file, bool appendp) } } -/* Close LOGFP, inhibit further logging and free the memory associated - with it. */ +/* Close LOGFP (only if we opened it, not if it's stderr), inhibit + further logging and free the memory associated with it. 
*/ void log_close (void) { int i; - if (logfp) + if (logfp && (logfp != stderr)) fclose (logfp); logfp = NULL; inhibit_logging = true; @@ -598,6 +640,7 @@ log_dump_context (void) { int num = log_line_current; FILE *fp = get_log_fp (); + FILE *warcfp = get_warc_log_fp (); if (!fp) return; @@ -609,14 +652,23 @@ log_dump_context (void) { struct log_ln *ln = log_lines + num; if (ln->content) - FPUTS (ln->content, fp); + { + FPUTS (ln->content, fp); + if (warcfp != NULL) + FPUTS (ln->content, warcfp); + } ROT_ADVANCE (num); } while (num != log_line_current); if (trailing_line) if (log_lines[log_line_current].content) - FPUTS (log_lines[log_line_current].content, fp); + { + FPUTS (log_lines[log_line_current].content, fp); + if (warcfp != NULL) + FPUTS (log_lines[log_line_current].content, warcfp); + } fflush (fp); + fflush (warcfp); } /* String escape functions. */ @@ -34,8 +34,12 @@ as that of the covered work. */ /* The log file to which Wget writes to after HUP. */ #define DEFAULT_LOGFILE "wget-log" +#include <stdio.h> + enum log_options { LOG_VERBOSE, LOG_NOTQUIET, LOG_NONVERBOSE, LOG_ALWAYS }; +void log_set_warc_log_fp (FILE *); + void logprintf (enum log_options, const char *, ...) GCC_FORMAT_ATTR (2, 3); void debug_logprintf (const char *, ...) GCC_FORMAT_ATTR (1, 2); @@ -1,6 +1,6 @@ /* Command line parsing. Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, - 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, + 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This file is part of GNU Wget. @@ -55,7 +55,7 @@ as that of the covered work. 
*/ #include "spider.h" #include "http.h" /* for save_cookies */ #include "ptimer.h" - +#include "warc.h" #include <getopt.h> #include <getpass.h> #include <quote.h> @@ -157,6 +157,7 @@ struct cmdline_option { static struct cmdline_option option_data[] = { { "accept", 'A', OPT_VALUE, "accept", -1 }, + { "accept-regex", 0, OPT_VALUE, "acceptregex", -1 }, { "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 }, { "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument }, { "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 }, @@ -178,6 +179,7 @@ static struct cmdline_option option_data[] = { "continue", 'c', OPT_BOOLEAN, "continue", -1 }, { "convert-links", 'k', OPT_BOOLEAN, "convertlinks", -1 }, { "content-disposition", 0, OPT_BOOLEAN, "contentdisposition", -1 }, + { "content-on-error", 0, OPT_BOOLEAN, "contentonerror", -1 }, { "cookies", 0, OPT_BOOLEAN, "cookies", -1 }, { "cut-dirs", 0, OPT_VALUE, "cutdirs", -1 }, { WHEN_DEBUG ("debug"), 'd', OPT_BOOLEAN, "debug", -1 }, @@ -242,7 +244,7 @@ static struct cmdline_option option_data[] = { "post-data", 0, OPT_VALUE, "postdata", -1 }, { "post-file", 0, OPT_VALUE, "postfile", -1 }, { "prefer-family", 0, OPT_VALUE, "preferfamily", -1 }, - { "preserve-permissions", 0, OPT_BOOLEAN, "preservepermissions", -1 }, /* deprecated */ + { "preserve-permissions", 0, OPT_BOOLEAN, "preservepermissions", -1 }, { IF_SSL ("private-key"), 0, OPT_VALUE, "privatekey", -1 }, { IF_SSL ("private-key-type"), 0, OPT_VALUE, "privatekeytype", -1 }, { "progress", 0, OPT_VALUE, "progress", -1 }, @@ -259,10 +261,13 @@ static struct cmdline_option option_data[] = { "read-timeout", 0, OPT_VALUE, "readtimeout", -1 }, { "recursive", 'r', OPT_BOOLEAN, "recursive", -1 }, { "referer", 0, OPT_VALUE, "referer", -1 }, + { "regex-type", 0, OPT_VALUE, "regextype", -1 }, { "reject", 'R', OPT_VALUE, "reject", -1 }, + { "reject-regex", 0, OPT_VALUE, "rejectregex", -1 }, { "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 }, { "remote-encoding", 0, 
OPT_VALUE, "remoteencoding", -1 }, { "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 }, + { "report-speed", 0, OPT_BOOLEAN, "reportspeed", -1 }, { "restrict-file-names", 0, OPT_BOOLEAN, "restrictfilenames", -1 }, { "retr-symlinks", 0, OPT_BOOLEAN, "retrsymlinks", -1 }, { "retry-connrefused", 0, OPT_BOOLEAN, "retryconnrefused", -1 }, @@ -286,6 +291,17 @@ static struct cmdline_option option_data[] = { "version", 'V', OPT_FUNCALL, (void *) print_version, no_argument }, { "wait", 'w', OPT_VALUE, "wait", -1 }, { "waitretry", 0, OPT_VALUE, "waitretry", -1 }, + { "warc-cdx", 0, OPT_BOOLEAN, "warccdx", -1 }, +#ifdef HAVE_LIBZ + { "warc-compression", 0, OPT_BOOLEAN, "warccompression", -1 }, +#endif + { "warc-dedup", 0, OPT_VALUE, "warccdxdedup", -1 }, + { "warc-digests", 0, OPT_BOOLEAN, "warcdigests", -1 }, + { "warc-file", 0, OPT_VALUE, "warcfile", -1 }, + { "warc-header", 0, OPT_VALUE, "warcheader", -1 }, + { "warc-keep-log", 0, OPT_BOOLEAN, "warckeeplog", -1 }, + { "warc-max-size", 0, OPT_VALUE, "warcmaxsize", -1 }, + { "warc-tempdir", 0, OPT_VALUE, "warctempdir", -1 }, #ifdef USE_WATT32 { "wdebug", 0, OPT_BOOLEAN, "wdebug", -1 }, #endif @@ -444,6 +460,8 @@ Logging and input file:\n"), N_("\ -nv, --no-verbose turn off verboseness, without being quiet.\n"), N_("\ + --report-speed=TYPE Output bandwidth as TYPE. 
TYPE can be bits.\n"), + N_("\ -i, --input-file=FILE download URLs found in local or external FILE.\n"), N_("\ -F, --force-html treat input file as HTML.\n"), @@ -595,6 +613,8 @@ HTTP options:\n"), --content-disposition honor the Content-Disposition header when\n\ choosing local file names (EXPERIMENTAL).\n"), N_("\ + --content-on-error output the received content on server errors.\n"), + N_("\ --auth-no-challenge send Basic HTTP authentication information\n\ without first waiting for the server's\n\ challenge.\n"), @@ -644,10 +664,37 @@ FTP options:\n"), N_("\ --no-passive-ftp disable the \"passive\" transfer mode.\n"), N_("\ + --preserve-permissions preserve remote file permissions.\n"), + N_("\ --retr-symlinks when recursing, get linked-to files (not dir).\n"), "\n", N_("\ +WARC options:\n"), + N_("\ + --warc-file=FILENAME save request/response data to a .warc.gz file.\n"), + N_("\ + --warc-header=STRING insert STRING into the warcinfo record.\n"), + N_("\ + --warc-max-size=NUMBER set maximum size of WARC files to NUMBER.\n"), + N_("\ + --warc-cdx write CDX index files.\n"), + N_("\ + --warc-dedup=FILENAME do not store records listed in this CDX file.\n"), +#ifdef HAVE_LIBZ + N_("\ + --no-warc-compression do not compress WARC files with GZIP.\n"), +#endif + N_("\ + --no-warc-digests do not calculate SHA1 digests.\n"), + N_("\ + --no-warc-keep-log do not store the log file in a WARC record.\n"), + N_("\ + --warc-tempdir=DIRECTORY location for temporary files created by the\n\ + WARC writer.\n"), + "\n", + + N_("\ Recursive download:\n"), N_("\ -r, --recursive specify recursive download.\n"), @@ -680,6 +727,17 @@ Recursive accept/reject:\n"), N_("\ -R, --reject=LIST comma-separated list of rejected extensions.\n"), N_("\ + --accept-regex=REGEX regex matching accepted URLs.\n"), + N_("\ + --reject-regex=REGEX regex matching rejected URLs.\n"), +#ifdef HAVE_LIBPCRE + N_("\ + --regex-type=TYPE regex type (posix|pcre).\n"), +#else + N_("\ + --regex-type=TYPE regex 
type (posix).\n"), +#endif + N_("\ -D, --domains=LIST comma-separated list of accepted domains.\n"), N_("\ --exclude-domains=LIST comma-separated list of rejected domains.\n"), @@ -703,7 +761,6 @@ Recursive accept/reject:\n"), N_("\ -np, --no-parent don't ascend to the parent directory.\n"), "\n", - N_("Mail bug reports and suggestions to <bug-wget@gnu.org>.\n") }; @@ -882,9 +939,9 @@ print_version (void) exit (3); /* TRANSLATORS: When available, an actual copyright character - (cirle-c) should be used in preference to "(C)". */ + (circle-c) should be used in preference to "(C)". */ if (fputs (_("\ -Copyright (C) 2009 Free Software Foundation, Inc.\n"), stdout) < 0) +Copyright (C) 2011 Free Software Foundation, Inc.\n"), stdout) < 0) exit (3); if (fputs (_("\ License GPLv3+: GNU GPL version 3 or later\n\ @@ -905,6 +962,7 @@ There is NO WARRANTY, to the extent permitted by law.\n"), stdout) < 0) } char *program_name; /* Needed by lib/error.c. */ +char *program_argstring; /* Needed by wget_warc.c. */ int main (int argc, char **argv) @@ -940,13 +998,34 @@ main (int argc, char **argv) windows_main ((char **) &exec_name); #endif + /* Construct the arguments string. */ + int argstring_length = 1; + for (i = 1; i < argc; i++) + argstring_length += strlen (argv[i]) + 2 + 1; + char *p = program_argstring = malloc (argstring_length * sizeof (char)); + if (p == NULL) + { + fprintf (stderr, _("Memory allocation problem\n")); + exit (2); + } + for (i = 1; i < argc; i++) + { + *p++ = '"'; + int arglen = strlen (argv[i]); + memcpy (p, argv[i], arglen); + p += arglen; + *p++ = '"'; + *p++ = ' '; + } + *p = '\0'; + /* Load the hard-coded defaults. */ defaults (); init_switches (); - /* This seperate getopt_long is needed to find the user config - and parse it before the other user options. */ + /* This separate getopt_long is needed to find the user config file + option ("--config") and parse it before the other user options. 
*/ longindex = -1; int retconf; bool use_userconfig = false; @@ -957,20 +1036,25 @@ main (int argc, char **argv) int confval; bool userrc_ret = true; struct cmdline_option *config_opt; - confval = long_options[longindex].val; - config_opt = &option_data[confval & ~BOOLEAN_NEG_MARKER]; - if (strcmp (config_opt->long_name, "config") == 0) - { - userrc_ret &= run_wgetrc (optarg); - use_userconfig = true; - } - if (!userrc_ret) + + /* There is no short option for "--config". */ + if (longindex >= 0) { - printf ("Exiting due to error in %s\n", optarg); - exit (2); + confval = long_options[longindex].val; + config_opt = &option_data[confval & ~BOOLEAN_NEG_MARKER]; + if (strcmp (config_opt->long_name, "config") == 0) + { + userrc_ret &= run_wgetrc (optarg); + use_userconfig = true; + } + if (!userrc_ret) + { + fprintf (stderr, "Exiting due to error in %s\n", optarg); + exit (2); + } + else + break; } - else - break; } /* If the user did not specify a config, read the system wgetrc and ~/.wgetrc. */ @@ -993,9 +1077,10 @@ main (int argc, char **argv) { if (ret == '?') { - print_usage (0); - printf ("\n"); - printf (_("Try `%s --help' for more options.\n"), exec_name); + print_usage (1); + fprintf (stderr, "\n"); + fprintf (stderr, _("Try `%s --help' for more options.\n"), + exec_name); exit (2); } /* Find the short option character in the mapping. 
*/ @@ -1103,7 +1188,7 @@ main (int argc, char **argv) { fprintf (stderr, _("Both --no-clobber and --convert-links were specified," - "only --convert-links will be used.\n")); + " only --convert-links will be used.\n")); opt.noclobber = false; } @@ -1184,6 +1269,47 @@ for details.\n\n")); } } + if (opt.warc_filename != 0) + { + if (opt.noclobber) + { + fprintf (stderr, + _("WARC output does not work with --no-clobber, " + "--no-clobber will be disabled.\n")); + opt.noclobber = false; + } + if (opt.timestamping) + { + fprintf (stderr, + _("WARC output does not work with timestamping, " + "timestamping will be disabled.\n")); + opt.timestamping = false; + } + if (opt.spider) + { + fprintf (stderr, + _("WARC output does not work with --spider.\n")); + exit (1); + } + if (opt.always_rest) + { + fprintf (stderr, + _("WARC output does not work with --continue, " + "--continue will be disabled.\n")); + opt.always_rest = false; + } + if (opt.warc_cdx_dedup_filename != 0 && !opt.warc_digests_enabled) + { + fprintf (stderr, + _("Digests are disabled; WARC deduplication will " + "not find duplicate records.\n")); + } + if (opt.warc_keep_log) + { + opt.progress_type = xstrdup ("dot"); + } + } + if (opt.ask_passwd && opt.passwd) { fprintf (stderr, @@ -1197,13 +1323,42 @@ for details.\n\n")); /* No URL specified. */ fprintf (stderr, _("%s: missing URL\n"), exec_name); print_usage (1); - printf ("\n"); + fprintf (stderr, "\n"); /* #### Something nicer should be printed here -- similar to the pre-1.5 `--help' page. */ fprintf (stderr, _("Try `%s --help' for more options.\n"), exec_name); exit (1); } + /* Compile the regular expressions. 
*/ + switch (opt.regex_type) + { +#ifdef HAVE_LIBPCRE + case regex_type_pcre: + opt.regex_compile_fun = compile_pcre_regex; + opt.regex_match_fun = match_pcre_regex; + break; +#endif + + case regex_type_posix: + default: + opt.regex_compile_fun = compile_posix_regex; + opt.regex_match_fun = match_posix_regex; + break; + } + if (opt.acceptregex_s) + { + opt.acceptregex = opt.regex_compile_fun (opt.acceptregex_s); + if (!opt.acceptregex) + exit (1); + } + if (opt.rejectregex_s) + { + opt.rejectregex = opt.regex_compile_fun (opt.rejectregex_s); + if (!opt.rejectregex) + exit (1); + } + #ifdef ENABLE_IRI if (opt.enable_iri) { @@ -1250,6 +1405,11 @@ for details.\n\n")); /* Fill in the arguments. */ url = alloca_array (char *, nurl + 1); + if (url == NULL) + { + fprintf (stderr, _("Memory allocation problem\n")); + exit (2); + } for (i = 0; i < nurl; i++, optind++) { char *rewritten = rewrite_shorthand_url (argv[optind]); @@ -1263,6 +1423,10 @@ for details.\n\n")); /* Initialize logging. */ log_init (opt.lfilename, append_to_log); + /* Open WARC file. */ + if (opt.warc_filename != 0) + warc_init (); + DEBUGP (("DEBUG output created by Wget %s on %s.\n\n", version_string, OS_TYPE)); @@ -1395,7 +1559,7 @@ outputting to a regular file.\n")); &dt, opt.recursive, iri, true); } - if (opt.delete_after && file_exists_p(filename)) + if (opt.delete_after && filename != NULL && file_exists_p (filename)) { DEBUGP (("Removing file due to --delete-after in main():\n")); logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename); @@ -1462,12 +1626,9 @@ outputting to a regular file.\n")); if (opt.convert_links && !opt.delete_after) convert_all_links (); - log_close (); - for (i = 0; i < nurl; i++) - xfree (url[i]); cleanup (); - return get_exit_status (); + exit (get_exit_status ()); } #endif /* TESTING */ diff --git a/src/openssl.c b/src/openssl.c index 2e23669..3924e41 100644 --- a/src/openssl.c +++ b/src/openssl.c @@ -1,6 +1,6 @@ /* SSL support via OpenSSL library. 
Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, - 2009, 2010, 2011 Free Software Foundation, Inc. + 2009, 2010, 2011, 2012 Free Software Foundation, Inc. Originally contributed by Christian Fraenkel. This file is part of GNU Wget. @@ -159,7 +159,7 @@ key_type_to_ssl_type (enum keyfile_type type) Returns true on success, false otherwise. */ bool -ssl_init () +ssl_init (void) { SSL_METHOD const *meth; @@ -201,7 +201,9 @@ ssl_init () abort (); } - ssl_ctx = SSL_CTX_new (meth); + /* The type cast below accommodates older OpenSSL versions (0.9.8) + where SSL_CTX_new() is declared without a "const" argument. */ + ssl_ctx = SSL_CTX_new ((SSL_METHOD *)meth); if (!ssl_ctx) goto error; @@ -393,7 +395,7 @@ static struct transport_implementation openssl_transport = { Returns true on success, false on failure. */ bool -ssl_connect_wget (int fd) +ssl_connect_wget (int fd, const char *hostname) { SSL *conn; struct openssl_transport_context *ctx; @@ -404,6 +406,19 @@ ssl_connect_wget (int fd) conn = SSL_new (ssl_ctx); if (!conn) goto error; +#if OPENSSL_VERSION_NUMBER >= 0x0090806fL && !defined(OPENSSL_NO_TLSEXT) + /* If the SSL library was build with support for ServerNameIndication + then use it whenever we have a hostname. If not, don't, ever. */ + if (! is_valid_ip_address (hostname)) + { + if (! SSL_set_tlsext_host_name (conn, hostname)) + { + DEBUGP (("Failed to set TLS server-name indication.")); + goto error; + } + } +#endif + #ifndef FD_TO_SOCKET # define FD_TO_SOCKET(X) (X) #endif diff --git a/src/options.h b/src/options.h index 252bf81..44e0a70 100644 --- a/src/options.h +++ b/src/options.h @@ -74,6 +74,19 @@ struct options bool ignore_case; /* Whether to ignore case when matching dirs and files */ + char *acceptregex_s; /* Patterns to accept (a regex string). */ + char *rejectregex_s; /* Patterns to reject (a regex string). */ + void *acceptregex; /* Patterns to accept (a regex struct). */ + void *rejectregex; /* Patterns to reject (a regex struct). 
*/ + enum { +#ifdef HAVE_LIBPCRE + regex_type_pcre, +#endif + regex_type_posix + } regex_type; /* The regex library. */ + void *(*regex_compile_fun)(const char *); /* Function to compile a regex. */ + bool (*regex_match_fun)(const void *, const char *); /* Function to match a string to a regex. */ + char **domains; /* See host.c */ char **exclude_domains; bool dns_cache; /* whether we cache DNS lookups. */ @@ -87,6 +100,15 @@ struct options FTP. */ char *output_document; /* The output file to which the documents will be printed. */ + char *warc_filename; /* WARC output filename */ + char *warc_tempdir; /* WARC temp dir */ + char *warc_cdx_dedup_filename; /* CDX file to be used for deduplication. */ + wgint warc_maxsize; /* WARC max archive size */ + bool warc_compression_enabled; /* For GZIP compression. */ + bool warc_digests_enabled; /* For SHA1 digests. */ + bool warc_cdx_enabled; /* Create CDX files? */ + bool warc_keep_log; /* Store the log file in a WARC record. */ + char **warc_user_headers; /* User-defined WARC header(s). */ char *user; /* Generic username */ char *passwd; /* Generic password */ @@ -130,6 +152,8 @@ struct options bool server_response; /* Do we print server response? */ bool save_headers; /* Do we save headers together with file? */ + bool content_on_error; /* Do we output the content when the HTTP + status code indicates a server error */ #ifdef ENABLE_DEBUG bool debug; /* Debugging on/off */ @@ -255,6 +279,7 @@ struct options bool show_all_dns_entries; /* Show all the DNS entries when resolving a name. 
*/ + bool report_bps; /*Output bandwidth in bits format*/ }; extern struct options opt; diff --git a/src/progress.c b/src/progress.c index 219b5be..2e888a9 100644 --- a/src/progress.c +++ b/src/progress.c @@ -766,7 +766,7 @@ update_speed_ring (struct bar_progress *bp, wgint howmuch, double dltime) } #if USE_NLS_PROGRESS_BAR -int +static int count_cols (const char *mbs) { wchar_t wc; @@ -795,7 +795,7 @@ count_cols (const char *mbs) # define count_cols(mbs) ((int)(strlen(mbs))) #endif -const char * +static const char * get_eta (int *bcd) { /* TRANSLATORS: "ETA" is English-centric, but this must @@ -861,7 +861,7 @@ create_image (struct bar_progress *bp, double dl_total_time, bool done) struct bar_progress_hist *hist = &bp->hist; /* The progress bar should look like this: - xx% [=======> ] nn,nnn 12.34K/s eta 36m 51s + xx% [=======> ] nn,nnn 12.34KB/s eta 36m 51s Calculate the geometry. The idea is to assign as much room as possible to the progress bar. The other idea is to never let @@ -873,7 +873,7 @@ create_image (struct bar_progress *bp, double dl_total_time, bool done) "xx% " or "100%" - percentage - 4 chars "[]" - progress bar decorations - 2 chars " nnn,nnn,nnn" - downloaded bytes - 12 chars or very rarely more - " 12.5K/s" - download rate - 8 chars + " 12.5KB/s" - download rate - 9 chars " eta 36m 51s" - ETA - 14 chars "=====>..." - progress bar - the rest @@ -977,10 +977,11 @@ create_image (struct bar_progress *bp, double dl_total_time, bool done) *p++ = ' '; } - /* " 12.52K/s" */ + /* " 12.52Kb/s or 12.52KB/s" */ if (hist->total_time > 0 && hist->total_bytes) { - static const char *short_units[] = { "B/s", "K/s", "M/s", "G/s" }; + static const char *short_units[] = { "B/s", "KB/s", "MB/s", "GB/s" }; + static const char *short_units_bits[] = { "b/s", "Kb/s", "Mb/s", "Gb/s" }; int units = 0; /* Calculate the download speed using the history ring and recent data that hasn't made it to the ring yet. 
*/ @@ -988,7 +989,7 @@ create_image (struct bar_progress *bp, double dl_total_time, bool done) double dltime = hist->total_time + (dl_total_time - bp->recent_start); double dlspeed = calc_rate (dlquant, dltime, &units); sprintf (p, " %4.*f%s", dlspeed >= 99.95 ? 0 : dlspeed >= 9.995 ? 1 : 2, - dlspeed, short_units[units]); + dlspeed, !opt.report_bps ? short_units[units] : short_units_bits[units]); move_to_end (p); } else diff --git a/src/ptimer.c b/src/ptimer.c index c06e8b9..c53b5e7 100644 --- a/src/ptimer.c +++ b/src/ptimer.c @@ -59,9 +59,7 @@ as that of the covered work. */ #include <errno.h> #include <unistd.h> #include <time.h> -#ifdef HAVE_SYS_TIME_H -# include <sys/time.h> -#endif +#include <sys/time.h> /* Cygwin currently (as of 2005-04-08, Cygwin 1.5.14) lacks clock_getres, but still defines _POSIX_TIMERS! Because of that we simply use the diff --git a/src/recur.c b/src/recur.c index 139fe2e..72274fb 100644 --- a/src/recur.c +++ b/src/recur.c @@ -586,6 +586,11 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, goto out; } } + if (!accept_url (url)) + { + DEBUGP (("%s is excluded/not-included through regex.\n", url)); + goto out; + } /* 6. Check for acceptance/rejection rules. We ignore these rules for directories (no file name to match) and for non-leaf HTMLs, @@ -139,13 +139,16 @@ limit_bandwidth (wgint bytes, struct ptimer *timer) /* Write data in BUF to OUT. However, if *SKIP is non-zero, skip that amount of data and decrease SKIP. Increment *TOTAL by the amount - of data written. */ + of data written. If OUT2 is not NULL, also write BUF to OUT2. + In case of error writing to OUT, -1 is returned. In case of error + writing to OUT2, -2 is returned. In case of any other error, + 1 is returned. 
*/ static int -write_data (FILE *out, const char *buf, int bufsize, wgint *skip, - wgint *written) +write_data (FILE *out, FILE *out2, const char *buf, int bufsize, + wgint *skip, wgint *written) { - if (!out) + if (out == NULL && out2 == NULL) return 1; if (*skip > bufsize) { @@ -161,7 +164,10 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip, return 1; } - fwrite (buf, 1, bufsize, out); + if (out != NULL) + fwrite (buf, 1, bufsize, out); + if (out2 != NULL) + fwrite (buf, 1, bufsize, out2); *written += bufsize; /* Immediately flush the downloaded data. This should not hinder @@ -178,9 +184,17 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip, actual justification. (Also, why 16K? Anyone test other values?) */ #ifndef __VMS - fflush (out); + if (out != NULL) + fflush (out); + if (out2 != NULL) + fflush (out2); #endif /* ndef __VMS */ - return !ferror (out); + if (out != NULL && ferror (out)) + return -1; + else if (out2 != NULL && ferror (out2)) + return -2; + else + return 0; } /* Read the contents of file descriptor FD until it the connection @@ -198,13 +212,20 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip, the amount of data written to disk. The time it took to download the data is stored to ELAPSED. + If OUT2 is non-NULL, the contents is also written to OUT2. + OUT2 will get an exact copy of the response: if this is a chunked + response, everything -- including the chunk headers -- is written + to OUT2. (OUT will only get the unchunked response.) + The function exits and returns the amount of data read. In case of error while reading data, -1 is returned. In case of error while - writing data, -2 is returned. */ + writing data to OUT, -2 is returned. In case of error while writing + data to OUT2, -3 is returned. 
*/ int fd_read_body (int fd, FILE *out, wgint toread, wgint startpos, - wgint *qtyread, wgint *qtywritten, double *elapsed, int flags) + wgint *qtyread, wgint *qtywritten, double *elapsed, int flags, + FILE *out2) { int ret = 0; #undef max @@ -287,13 +308,24 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos, ret = -1; break; } + else if (out2 != NULL) + fwrite (line, 1, strlen (line), out2); remaining_chunk_size = strtol (line, &endl, 16); + xfree (line); + if (remaining_chunk_size == 0) { ret = 0; - if (fd_read_line (fd) == NULL) + line = fd_read_line (fd); + if (line == NULL) ret = -1; + else + { + if (out2 != NULL) + fwrite (line, 1, strlen (line), out2); + xfree (line); + } break; } } @@ -343,20 +375,30 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos, if (ret > 0) { sum_read += ret; - if (!write_data (out, dlbuf, ret, &skip, &sum_written)) + int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written); + if (write_res != 0) { - ret = -2; + ret = (write_res == -3) ? -3 : -2; goto out; } if (chunked) { remaining_chunk_size -= ret; if (remaining_chunk_size == 0) - if (fd_read_line (fd) == NULL) - { - ret = -1; - break; - } + { + char *line = fd_read_line (fd); + if (line == NULL) + { + ret = -1; + break; + } + else + { + if (out2 != NULL) + fwrite (line, 1, strlen (line), out2); + xfree (line); + } + } } } @@ -578,6 +620,7 @@ retr_rate (wgint bytes, double secs) { static char res[20]; static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" }; + static const char *rate_names_bits[] = {"b/s", "Kb/s", "Mb/s", "Gb/s" }; int units; double dlrate = calc_rate (bytes, secs, &units); @@ -585,7 +628,7 @@ retr_rate (wgint bytes, double secs) e.g. "1022", "247", "12.5", "2.38". */ sprintf (res, "%.*f %s", dlrate >= 99.95 ? 0 : dlrate >= 9.995 ? 1 : 2, - dlrate, rate_names[units]); + dlrate, !opt.report_bps ? 
rate_names[units]: rate_names_bits[units]); return res; } @@ -602,6 +645,11 @@ double calc_rate (wgint bytes, double secs, int *units) { double dlrate; + double bibyte = 1000.0; + + if (!opt.report_bps) + bibyte = 1024.0; + assert (secs >= 0); assert (bytes >= 0); @@ -613,16 +661,17 @@ calc_rate (wgint bytes, double secs, int *units) 0 and the timer's resolution, assume half the resolution. */ secs = ptimer_resolution () / 2.0; - dlrate = bytes / secs; - if (dlrate < 1024.0) + dlrate = convert_to_bits (bytes) / secs; + if (dlrate < bibyte) *units = 0; - else if (dlrate < 1024.0 * 1024.0) - *units = 1, dlrate /= 1024.0; - else if (dlrate < 1024.0 * 1024.0 * 1024.0) - *units = 2, dlrate /= (1024.0 * 1024.0); + else if (dlrate < (bibyte * bibyte)) + *units = 1, dlrate /= bibyte; + else if (dlrate < (bibyte * bibyte * bibyte)) + *units = 2, dlrate /= (bibyte * bibyte); + else /* Maybe someone will need this, one day. */ - *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0); + *units = 3, dlrate /= (bibyte * bibyte * bibyte); return dlrate; } @@ -883,10 +932,10 @@ retrieve_url (struct url * orig_parsed, const char *origurl, char **file, register_redirection (origurl, u->url); if (*dt & TEXTHTML) - register_html (u->url, local_file); + register_html (local_file); if (*dt & TEXTCSS) - register_css (u->url, local_file); + register_css (local_file); } if (file) @@ -50,7 +50,7 @@ enum { rb_chunked_transfer_encoding = 4 }; -int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int); +int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int, FILE *); typedef const char *(*hunk_terminator_t) (const char *, const char *, int); diff --git a/src/spider.c b/src/spider.c index ae2f392..dad9a23 100644 --- a/src/spider.c +++ b/src/spider.c @@ -45,7 +45,7 @@ static struct hash_table *nonexisting_urls_set; /* Cleanup the data structures associated with this file. 
*/ -void +static void spider_cleanup (void) { if (nonexisting_urls_set) @@ -1,6 +1,6 @@ /* SSL support. Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, - 2009, 2010, 2011 Free Software Foundation, Inc. + 2009, 2010, 2011, 2012 Free Software Foundation, Inc. Originally contributed by Christian Fraenkel. This file is part of GNU Wget. @@ -33,7 +33,7 @@ as that of the covered work. */ #define GEN_SSLFUNC_H bool ssl_init (void); -bool ssl_connect_wget (int); +bool ssl_connect_wget (int, const char *); bool ssl_check_certificate (int, const char *); #endif /* GEN_SSLFUNC_H */ @@ -46,6 +46,8 @@ const char *test_append_uri_pathel(); const char *test_are_urls_equal(); const char *test_is_robots_txt_url(); +const char *program_argstring = "TEST"; + int tests_run; static const char * @@ -1502,9 +1502,9 @@ url_file_name (const struct url *u, char *replaced_filename) { struct growable fnres; /* stands for "file name result" */ - const char *u_file, *u_query; + const char *u_file; char *fname, *unique; - char *index_filename = "index.html"; /* The default index file is index.html */ + const char *index_filename = "index.html"; /* The default index file is index.html */ fnres.base = NULL; fnres.size = 0; @@ -1561,12 +1561,11 @@ url_file_name (const struct url *u, char *replaced_filename) u_file = *u->file ? u->file : index_filename; append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres); - /* Append "?query" to the file name. */ - u_query = u->query && *u->query ? u->query : NULL; - if (u_query) + /* Append "?query" to the file name, even if empty */ + if (u->query) { append_char (FN_QUERY_SEP, &fnres); - append_uri_pathel (u_query, u_query + strlen (u_query), + append_uri_pathel (u->query, u->query + strlen (u->query), true, &fnres); } } diff --git a/src/utils.c b/src/utils.c index 4950ab2..567dc35 100644 --- a/src/utils.c +++ b/src/utils.c @@ -59,12 +59,12 @@ as that of the covered work. 
*/ # endif #endif +#include <sys/time.h> + #include <sys/stat.h> /* For TIOCGWINSZ and friends: */ -#ifdef HAVE_SYS_IOCTL_H -# include <sys/ioctl.h> -#endif +#include <sys/ioctl.h> #ifdef HAVE_TERMIOS_H # include <termios.h> #endif @@ -73,6 +73,11 @@ as that of the covered work. */ #include <signal.h> #include <setjmp.h> +#include <regex.h> +#ifdef HAVE_LIBPCRE +# include <pcre.h> +#endif + #ifndef HAVE_SIGSETJMP /* If sigsetjmp is a macro, configure won't pick it up. */ # ifdef sigsetjmp @@ -769,8 +774,7 @@ fopen_excl (const char *fname, int binary) open_id = 13; fd = open( fname, /* File name. */ flags, /* Flags. */ - 0777, /* Mode for default protection. -*/ + 0777, /* Mode for default protection. */ "rfm=stmlf", /* Stream_LF. */ OPEN_OPT_ARGS); /* Access callback. */ } @@ -918,6 +922,19 @@ acceptable (const char *s) return true; } +/* Determine whether an URL is acceptable to be followed, according to + regex patterns to accept/reject. */ +bool +accept_url (const char *s) +{ + if (opt.acceptregex && !opt.regex_match_fun (opt.acceptregex, s)) + return false; + if (opt.rejectregex && opt.regex_match_fun (opt.rejectregex, s)) + return false; + + return true; +} + /* Check if D2 is a subdirectory of D1. E.g. if D1 is `/something', subdir_p() will return true if and only if D2 begins with `/something/' or is exactly '/something'. */ @@ -1826,6 +1843,17 @@ number_to_static_string (wgint number) ringpos = (ringpos + 1) % RING_SIZE; return buf; } + +/* Converts the byte to bits format if --report-bps option is enabled + */ +wgint +convert_to_bits (wgint num) +{ + if (opt.report_bps) + return num * 8; + return num; +} + /* Determine the width of the terminal we're running on. If that's not possible, return 0. */ @@ -2299,6 +2327,89 @@ base64_decode (const char *base64, void *dest) return q - (char *) dest; } +#ifdef HAVE_LIBPCRE +/* Compiles the PCRE regex. 
*/
+/* Returns the compiled pattern for STR, or NULL (after printing a
+   message to stderr) when STR is not a valid PCRE expression.  */
+void *
+compile_pcre_regex (const char *str)
+{
+  const char *errbuf;
+  int erroffset;
+  pcre *regex = pcre_compile (str, 0, &errbuf, &erroffset, 0);
+  if (! regex)
+    {
+      fprintf (stderr, _("Invalid regular expression %s, %s\n"),
+               quote (str), errbuf);
+      /* The return type is a pointer, so failure is NULL, not `false'.  */
+      return NULL;
+    }
+  return regex;
+}
+#endif
+
+/* Compiles the POSIX regex.
+   Returns a newly allocated regex_t for STR, compiled with
+   REG_EXTENDED | REG_NOSUB, or NULL (after printing a message to
+   stderr) when STR cannot be compiled.  */
+void *
+compile_posix_regex (const char *str)
+{
+  regex_t *regex = xmalloc (sizeof (regex_t));
+  int errcode = regcomp (regex, str, REG_EXTENDED | REG_NOSUB);
+  if (errcode != 0)
+    {
+      /* The first regerror call only asks how large the message is.  */
+      size_t errbuf_size = regerror (errcode, regex, NULL, 0);
+      char *errbuf = xmalloc (errbuf_size);
+      regerror (errcode, regex, errbuf, errbuf_size);
+      fprintf (stderr, _("Invalid regular expression %s, %s\n"),
+               quote (str), errbuf);
+      xfree (errbuf);
+      /* Nothing was compiled, so only the container has to be released;
+         without this the regex_t allocated above is leaked.  */
+      xfree (regex);
+      return NULL;
+    }
+
+  return regex;
+}
+
+#ifdef HAVE_LIBPCRE
+#define OVECCOUNT 30
+/* Matches a PCRE regex.
+   Returns true when STR matches REGEX.  Both "no match" and a matching
+   error (which is logged at LOG_VERBOSE) return false.  */
+bool
+match_pcre_regex (const void *regex, const char *str)
+{
+  int l = strlen (str);
+  int ovector[OVECCOUNT];
+
+  int rc = pcre_exec ((const pcre *) regex, 0, str, l, 0, 0, ovector,
+                      OVECCOUNT);
+  if (rc == PCRE_ERROR_NOMATCH)
+    return false;
+  else if (rc < 0)
+    {
+      logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
+                 quote (str), rc);
+      return false;
+    }
+  else
+    return true;
+}
+#undef OVECCOUNT
+#endif
+
+/* Matches a POSIX regex. 
*/
+/* Returns true when STR matches REGEX (a regex_t as produced by
+   compile_posix_regex).  Both "no match" and a matching error (which
+   is logged at LOG_VERBOSE) return false.  */
+bool
+match_posix_regex (const void *regex, const char *str)
+{
+  const regex_t *compiled = (const regex_t *) regex;
+  int rc = regexec (compiled, str, 0, NULL, 0);
+  if (rc == REG_NOMATCH)
+    return false;
+  else if (rc == 0)
+    return true;
+  else
+    {
+      /* Describe the failure using the regex that was actually matched.
+         This function is also called with opt.rejectregex, so
+         opt.acceptregex is not necessarily the right object and may
+         even be unset.  */
+      size_t errbuf_size = regerror (rc, compiled, NULL, 0);
+      char *errbuf = xmalloc (errbuf_size);
+      regerror (rc, compiled, errbuf, errbuf_size);
+      /* Report the message text that was just retrieved rather than
+         only the numeric code.  */
+      logprintf (LOG_VERBOSE, _("Error while matching %s: %s\n"),
+                 quote (str), errbuf);
+      xfree (errbuf);
+      return false;
+    }
+}
+
 #undef IS_ASCII #undef NEXT_CHAR diff --git a/src/utils.h b/src/utils.h index 8b1a8a1..409cdc5 100644 --- a/src/utils.h +++ b/src/utils.h @@ -90,6 +90,7 @@ char *file_merge (const char *, const char *); int fnmatch_nocase (const char *, const char *, int); bool acceptable (const char *); +bool accept_url (const char *); bool accdir (const char *s); char *suffix (const char *s); bool match_tail (const char *, const char *, bool); @@ -127,6 +128,7 @@ char *human_readable (HR_NUMTYPE); int numdigit (wgint); char *number_to_string (char *, wgint); char *number_to_static_string (wgint); +wgint convert_to_bits (wgint); int determine_screen_width (void); int random_number (int); @@ -141,6 +143,14 @@ void xsleep (double); int base64_encode (const void *, int, char *); int base64_decode (const char *, void *); +#ifdef HAVE_LIBPCRE +void *compile_pcre_regex (const char *); +bool match_pcre_regex (const void *, const char *); +#endif + +void *compile_posix_regex (const char *); +bool match_posix_regex (const void *, const char *); + void stable_sort (void *, size_t, size_t, int (*) (const void *, const void *)); const char *print_decimal (double); diff --git a/src/warc.c b/src/warc.c new file mode 100644 index 0000000..69f80be --- /dev/null +++ b/src/warc.c @@ -0,0 +1,1440 @@ +/* Utility functions for writing WARC files. + Copyright (C) 2011, 2012 Free Software Foundation, Inc. + +This file is part of GNU Wget. 
+ +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or (at +your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget. If not, see <http://www.gnu.org/licenses/>. + +Additional permission under GNU GPL version 3 section 7 + +If you modify this program, or any covered work, by linking or +combining it with the OpenSSL project's OpenSSL library (or a +modified version of that library), containing parts covered by the +terms of the OpenSSL or SSLeay licenses, the Free Software Foundation +grants you additional permission to convey the resulting work. +Corresponding Source for a non-source form of such a combination +shall include the source code for the parts of OpenSSL used as well +as that of the covered work. */ + +#define _GNU_SOURCE + +#include "wget.h" +#include "hash.h" +#include "utils.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <time.h> +#include <tmpdir.h> +#include <sha1.h> +#include <base32.h> +#include <unistd.h> +#ifdef HAVE_LIBZ +#include <zlib.h> +#endif +#ifdef HAVE_LIBUUID +#include <uuid/uuid.h> +#endif + +#ifndef WINDOWS +#include <libgen.h> +#endif + +#include "warc.h" + +extern char *version_string; + +/* Set by main in main.c */ +extern char *program_argstring; + + +/* The log file (a temporary file that contains a copy + of the wget log). */ +static FILE *warc_log_fp; + +/* The manifest file (a temporary file that contains the + warcinfo uuid of every file in this crawl). 
*/ +static FILE *warc_manifest_fp; + +/* The current WARC file (or NULL, if WARC is disabled). */ +static FILE *warc_current_file; + +#ifdef HAVE_LIBZ +/* The gzip stream for the current WARC file + (or NULL, if WARC or gzip is disabled). */ +static gzFile warc_current_gzfile; + +/* The offset of the current gzip record in the WARC file. */ +static off_t warc_current_gzfile_offset; + +/* The uncompressed size (so far) of the current record. */ +static off_t warc_current_gzfile_uncompressed_size; +# endif + +/* This is true until a warc_write_* method fails. */ +static bool warc_write_ok; + +/* The current CDX file (or NULL, if CDX is disabled). */ +static FILE *warc_current_cdx_file; + +/* The record id of the warcinfo record of the current WARC file. */ +static char *warc_current_warcinfo_uuid_str; + +/* The file name of the current WARC file. */ +static char *warc_current_filename; + +/* The serial number of the current WARC file. This number is + incremented each time a new file is opened and is used in the + WARC file's filename. */ +static int warc_current_file_number; + +/* The table of CDX records, if deduplication is enabled. */ +struct hash_table * warc_cdx_dedup_table; + +static bool warc_start_new_file (bool meta); + + +struct warc_cdx_record +{ + char *url; + char *uuid; + char digest[SHA1_DIGEST_SIZE]; +}; + +static unsigned long +warc_hash_sha1_digest (const void *key) +{ + /* We just use some of the first bytes of the digest. */ + unsigned long v = 0; + memcpy (&v, key, sizeof (unsigned long)); + return v; +} + +static int +warc_cmp_sha1_digest (const void *digest1, const void *digest2) +{ + return !memcmp (digest1, digest2, SHA1_DIGEST_SIZE); +} + + + +/* Writes SIZE bytes from BUFFER to the current WARC file, + through gzwrite if compression is enabled. + Returns the number of uncompressed bytes written. 
*/
+static size_t
+warc_write_buffer (const char *buffer, size_t size)
+{
+#ifdef HAVE_LIBZ
+  if (warc_current_gzfile)
+    {
+      /* Counted before the write, whether or not gzwrite succeeds.  */
+      warc_current_gzfile_uncompressed_size += size;
+      return gzwrite (warc_current_gzfile, buffer, size);
+    }
+  else
+#endif
+    return fwrite (buffer, 1, size, warc_current_file);
+}
+
+/* Writes STR to the current WARC file.
+   Returns false and sets warc_write_ok to false if there
+   is an error.  Does nothing once warc_write_ok is false.  */
+static bool
+warc_write_string (const char *str)
+{
+  if (!warc_write_ok)
+    return false;
+
+  size_t n = strlen (str);
+  if (n != warc_write_buffer (str, n))
+    warc_write_ok = false;
+
+  return warc_write_ok;
+}
+
+
+#define EXTRA_GZIP_HEADER_SIZE 12
+#define GZIP_STATIC_HEADER_SIZE 10
+#define FLG_FEXTRA 0x04
+#define OFF_FLG 3
+
+/* Starts a new WARC record.  Writes the version header.
+   If opt.warc_maxsize is set and the current file is becoming
+   too large, this will open a new WARC file.
+
+   If compression is enabled, this will start a new
+   gzip stream in the current WARC file.
+
+   Returns false and sets warc_write_ok to false if there
+   is an error.  */
+static bool
+warc_write_start_record (void)
+{
+  if (!warc_write_ok)
+    return false;
+
+  fflush (warc_current_file);
+  if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
+    {
+      /* If the next file cannot be opened, warc_current_file may no
+         longer be usable, so stop instead of writing to it below.  */
+      if (! warc_start_new_file (false))
+        {
+          warc_write_ok = false;
+          return false;
+        }
+    }
+
+#ifdef HAVE_LIBZ
+  /* Start a GZIP stream, if required. */
+  if (opt.warc_compression_enabled)
+    {
+      /* Record the starting offset of the new record. */
+      warc_current_gzfile_offset = ftello (warc_current_file);
+
+      /* Reserve space for the extra GZIP header field.
+         In warc_write_end_record we will fill this space
+         with information about the uncompressed and
+         compressed size of the record. */
+      fprintf (warc_current_file, "XXXXXXXXXXXX");
+      fflush (warc_current_file);
+
+      /* Start a new GZIP stream on a duplicate of the descriptor, so
+         that closing the stream leaves warc_current_file open.  */
+      int gz_fd = dup (fileno (warc_current_file));
+      warc_current_gzfile = (gz_fd < 0) ? NULL : gzdopen (gz_fd, "wb9");
+      warc_current_gzfile_uncompressed_size = 0;
+
+      if (warc_current_gzfile == NULL)
+        {
+          /* gzdopen did not take ownership of the duplicate.  */
+          if (gz_fd >= 0)
+            close (gz_fd);
+          logprintf (LOG_NOTQUIET,
+_("Error opening GZIP stream to WARC file.\n"));
+          warc_write_ok = false;
+          return false;
+        }
+    }
+#endif
+
+  warc_write_string ("WARC/1.0\r\n");
+  return warc_write_ok;
+}
+
+/* Writes a WARC header to the current WARC record.
+   This method may be run after warc_write_start_record and
+   before warc_write_block_from_file.
+   Nothing is written when VALUE is NULL.  */
+static bool
+warc_write_header (const char *name, const char *value)
+{
+  if (value)
+    {
+      warc_write_string (name);
+      warc_write_string (": ");
+      warc_write_string (value);
+      warc_write_string ("\r\n");
+    }
+  return warc_write_ok;
+}
+
+/* Copies the contents of DATA_IN to the WARC record.
+   Adds a Content-Length header to the WARC record.
+   Run this method after warc_write_header,
+   then run warc_write_end_record.
+   Returns false and sets warc_write_ok to false if there
+   is an error.  */
+static bool
+warc_write_block_from_file (FILE *data_in)
+{
+  /* Add the Content-Length header. */
+  char *content_length;
+  off_t data_size;
+
+  if (fseeko (data_in, 0L, SEEK_END) != 0
+      || (data_size = ftello (data_in)) < 0)
+    {
+      warc_write_ok = false;
+      return false;
+    }
+
+  /* asprintf reports failure with a negative value, and leaves
+     CONTENT_LENGTH undefined in that case.  off_t may be wider than
+     long, hence the explicit long long.  */
+  if (asprintf (&content_length, "%lld", (long long) data_size) < 0)
+    {
+      warc_write_ok = false;
+      return false;
+    }
+  warc_write_header ("Content-Length", content_length);
+  free (content_length);
+
+  /* End of the WARC header section. */
+  warc_write_string ("\r\n");
+
+  if (fseeko (data_in, 0L, SEEK_SET) != 0)
+    warc_write_ok = false;
+
+  /* Copy the data in the file to the WARC record. */
+  char buffer[BUFSIZ];
+  size_t s;
+  while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
+    {
+      if (warc_write_buffer (buffer, s) < s)
+        warc_write_ok = false;
+    }
+
+  return warc_write_ok;
+}
+
+/* Run this method to close the current WARC record.
+
+   If compression is enabled, this method closes the
+   current GZIP stream and fills the extra GZIP header
+   with the uncompressed and compressed length of the
+   record. 
*/
+static bool
+warc_write_end_record (void)
+{
+  warc_write_buffer ("\r\n\r\n", 4);
+
+#ifdef HAVE_LIBZ
+  /* We start a new gzip stream for each record.  */
+  if (warc_write_ok && warc_current_gzfile)
+    {
+      if (gzclose (warc_current_gzfile) != Z_OK)
+        {
+          warc_write_ok = false;
+          return false;
+        }
+
+      fflush (warc_current_file);
+      fseeko (warc_current_file, 0, SEEK_END);
+
+      /* The WARC standard suggests that we add 'skip length' data in the
+         extra header field of the GZIP stream.
+
+         In warc_write_start_record we reserved space for this extra header.
+         This extra space starts at warc_current_gzfile_offset and fills
+         EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at
+         warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
+
+         We need to do three things:
+         1. Move the static GZIP header to warc_current_gzfile_offset;
+         2. Set the FEXTRA flag in the GZIP header;
+         3. Write the extra GZIP header after the static header, that is,
+            starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
+      */
+
+      /* Calculate the uncompressed and compressed sizes.
+         NOTE(review): current_offset - warc_current_gzfile_offset is the
+         number of bytes this record occupies in the file, while
+         warc_current_gzfile_uncompressed_size counts the bytes handed to
+         gzwrite, so the two names below look swapped.  The values are
+         deliberately left as they are so the bytes written do not
+         change; confirm the field order against the WARC specification
+         before renaming or swapping.  */
+      off_t current_offset = ftello (warc_current_file);
+      off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
+      off_t compressed_size = warc_current_gzfile_uncompressed_size;
+
+      /* Go back to the static GZIP header. */
+      fseeko (warc_current_file, warc_current_gzfile_offset
+              + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
+
+      /* Read the header. */
+      char static_header[GZIP_STATIC_HEADER_SIZE];
+      size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
+                             warc_current_file);
+      if (result != GZIP_STATIC_HEADER_SIZE)
+        {
+          warc_write_ok = false;
+          return false;
+        }
+
+      /* Set the FEXTRA flag in the flags byte of the header. */
+      static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
+
+      /* Write the header back to the file, but starting at
+         warc_current_gzfile_offset.  A short write is recorded in
+         warc_write_ok, but we carry on so that the file position is
+         still restored to the end of the file below.  */
+      fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
+      if (fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE,
+                  warc_current_file) != GZIP_STATIC_HEADER_SIZE)
+        warc_write_ok = false;
+
+      /* Prepare the extra GZIP header. */
+      char extra_header[EXTRA_GZIP_HEADER_SIZE];
+      /* XLEN, the length of the extra header fields.  */
+      extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
+      extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
+      /* The extra header field identifier for the WARC skip length. */
+      extra_header[2] = 's';
+      extra_header[3] = 'l';
+      /* The size of the uncompressed record, low byte first.  */
+      extra_header[4] = (uncompressed_size & 255);
+      extra_header[5] = (uncompressed_size >> 8) & 255;
+      extra_header[6] = (uncompressed_size >> 16) & 255;
+      extra_header[7] = (uncompressed_size >> 24) & 255;
+      /* The size of the compressed record, low byte first.  */
+      extra_header[8] = (compressed_size & 255);
+      extra_header[9] = (compressed_size >> 8) & 255;
+      extra_header[10] = (compressed_size >> 16) & 255;
+      extra_header[11] = (compressed_size >> 24) & 255;
+
+      /* Write the extra header after the static header. */
+      fseeko (warc_current_file, warc_current_gzfile_offset
+              + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
+      if (fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE,
+                  warc_current_file) != EXTRA_GZIP_HEADER_SIZE)
+        warc_write_ok = false;
+
+      /* Done, move back to the end of the file. */
+      fflush (warc_current_file);
+      fseeko (warc_current_file, 0, SEEK_END);
+    }
+#endif /* HAVE_LIBZ */
+
+  return warc_write_ok;
+}
+
+
+/* Writes the WARC-Date header for the given timestamp to
+   the current WARC record.
+   If timestamp is NULL, the current time will be used.  */
+static bool
+warc_write_date_header (const char *timestamp)
+{
+  /* Declared at function scope on purpose: TIMESTAMP may point into
+     this buffer until warc_write_header has returned.  Declaring it
+     inside the `if' block would leave TIMESTAMP dangling.  */
+  char current_timestamp[21];
+
+  if (timestamp == NULL)
+    {
+      warc_timestamp (current_timestamp);
+      timestamp = current_timestamp;
+    }
+  return warc_write_header ("WARC-Date", timestamp);
+}
+
+/* Writes the WARC-IP-Address header for the given IP to
+   the current WARC record.  If IP is NULL, no header will
+   be written. 
*/ +static bool +warc_write_ip_header (ip_address *ip) +{ + if (ip != NULL) + return warc_write_header ("WARC-IP-Address", print_address (ip)); + else + return warc_write_ok; +} + + +/* warc_sha1_stream_with_payload is a modified copy of sha1_stream + from gnulib/sha1.c. This version calculates two digests in one go. + + Compute SHA1 message digests for bytes read from STREAM. The + digest of the complete file will be written into the 16 bytes + beginning at RES_BLOCK. + + If payload_offset >= 0, a second digest will be calculated of the + portion of the file starting at payload_offset and continuing to + the end of the file. The digest number will be written into the + 16 bytes beginning ad RES_PAYLOAD. */ +static int +warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, + off_t payload_offset) +{ +#define BLOCKSIZE 32768 + + struct sha1_ctx ctx_block; + struct sha1_ctx ctx_payload; + off_t pos; + off_t sum; + + char *buffer = malloc (BLOCKSIZE + 72); + if (!buffer) + return 1; + + /* Initialize the computation context. */ + sha1_init_ctx (&ctx_block); + if (payload_offset >= 0) + sha1_init_ctx (&ctx_payload); + + pos = 0; + + /* Iterate over full file contents. */ + while (1) + { + /* We read the file in blocks of BLOCKSIZE bytes. One call of the + computation function processes the whole buffer so that with the + next round of the loop another block can be read. */ + off_t n; + sum = 0; + + /* Read block. Take care for partial reads. */ + while (1) + { + n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream); + + sum += n; + pos += n; + + if (sum == BLOCKSIZE) + break; + + if (n == 0) + { + /* Check for the error flag IFF N == 0, so that we don't + exit the loop after a partial read due to e.g., EAGAIN + or EWOULDBLOCK. */ + if (ferror (stream)) + { + free (buffer); + return 1; + } + goto process_partial_block; + } + + /* We've read at least one byte, so ignore errors. 
But always + check for EOF, since feof may be true even though N > 0. + Otherwise, we could end up calling fread after EOF. */ + if (feof (stream)) + goto process_partial_block; + } + + /* Process buffer with BLOCKSIZE bytes. Note that + BLOCKSIZE % 64 == 0 + */ + sha1_process_block (buffer, BLOCKSIZE, &ctx_block); + if (payload_offset >= 0 && payload_offset < pos) + { + /* At least part of the buffer contains data from payload. */ + off_t start_of_payload = payload_offset - (pos - BLOCKSIZE); + if (start_of_payload <= 0) + /* All bytes in the buffer belong to the payload. */ + start_of_payload = 0; + + /* Process the payload part of the buffer. + Note: we can't use sha1_process_block here even if we + process the complete buffer. Because the payload doesn't + have to start with a full block, there may still be some + bytes left from the previous buffer. Therefore, we need + to continue with sha1_process_bytes. */ + sha1_process_bytes (buffer + start_of_payload, + BLOCKSIZE - start_of_payload, &ctx_payload); + } + } + + process_partial_block:; + + /* Process any remaining bytes. */ + if (sum > 0) + { + sha1_process_bytes (buffer, sum, &ctx_block); + if (payload_offset >= 0 && payload_offset < pos) + { + /* At least part of the buffer contains data from payload. */ + off_t start_of_payload = payload_offset - (pos - sum); + if (start_of_payload <= 0) + /* All bytes in the buffer belong to the payload. */ + start_of_payload = 0; + + /* Process the payload part of the buffer. */ + sha1_process_bytes (buffer + start_of_payload, + sum - start_of_payload, &ctx_payload); + } + } + + /* Construct result in desired memory. */ + sha1_finish_ctx (&ctx_block, res_block); + if (payload_offset >= 0) + sha1_finish_ctx (&ctx_payload, res_payload); + free (buffer); + return 0; + +#undef BLOCKSIZE +} + +/* Converts the SHA1 digest to a base32-encoded string. + "sha1:DIGEST\0" (Allocates a new string for the response.) 
*/
+/* Returns NULL if the string cannot be allocated.  The caller frees
+   the result.  */
+static char *
+warc_base32_sha1_digest (char *sha1_digest)
+{
+  // length: "sha1:" + digest + "\0"
+  char *sha1_base32 = malloc (BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1 + 5 );
+  if (sha1_base32 == NULL)
+    return NULL;
+
+  /* Encode after the 5-byte prefix, then fill in the prefix itself.  */
+  base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + 5,
+                 BASE32_LENGTH(SHA1_DIGEST_SIZE) + 1);
+  memcpy (sha1_base32, "sha1:", 5);
+  sha1_base32[BASE32_LENGTH(SHA1_DIGEST_SIZE) + 5] = '\0';
+  return sha1_base32;
+}
+
+
+/* Sets the digest headers of the record.
+   This method will calculate the block digest and, if payload_offset >= 0,
+   will also calculate the payload digest of the payload starting at the
+   provided offset.
+   Does nothing unless opt.warc_digests_enabled is set.  FILE is rewound
+   and read to its end.  */
+static void
+warc_write_digest_headers (FILE *file, long payload_offset)
+{
+  if (opt.warc_digests_enabled)
+    {
+      /* Calculate the block and payload digests. */
+      char sha1_res_block[SHA1_DIGEST_SIZE];
+      char sha1_res_payload[SHA1_DIGEST_SIZE];
+
+      rewind (file);
+      if (warc_sha1_stream_with_payload (file, sha1_res_block,
+          sha1_res_payload, payload_offset) == 0)
+        {
+          char *digest;
+
+          /* A NULL digest (allocation failure) is tolerated: the header
+             is written only for a non-NULL value and free (NULL) is a
+             no-op.  */
+          digest = warc_base32_sha1_digest (sha1_res_block);
+          warc_write_header ("WARC-Block-Digest", digest);
+          free (digest);
+
+          if (payload_offset >= 0)
+            {
+              digest = warc_base32_sha1_digest (sha1_res_payload);
+              warc_write_header ("WARC-Payload-Digest", digest);
+              free (digest);
+            }
+        }
+    }
+}
+
+
+/* Fills timestamp with the current time and date.
+   The UTC time is formatted following ISO 8601, as required
+   for use in the WARC-Date header.
+   The timestamp is 20 characters long; TIMESTAMP must have room for
+   21 bytes, including the terminating NUL.  If the time cannot be
+   formatted, TIMESTAMP is set to the empty string.  */
+void
+warc_timestamp (char *timestamp)
+{
+  time_t rawtime = time (NULL);
+  struct tm *timeinfo = gmtime (&rawtime);
+
+  /* gmtime may return NULL, and strftime returns 0 (leaving the buffer
+     contents unspecified) when the result does not fit.  */
+  if (timeinfo == NULL
+      || strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo) == 0)
+    timestamp[0] = '\0';
+}
+
+#ifdef HAVE_LIBUUID
+/* Fills urn_str with a UUID in the format required
+   for the WARC-Record-Id header.
+   The string will be 47 characters long. 
*/ +void +warc_uuid_str (char *urn_str) +{ + char uuid_str[37]; + + uuid_t record_id; + uuid_generate (record_id); + uuid_unparse (record_id, uuid_str); + + sprintf (urn_str, "<urn:uuid:%s>", uuid_str); +} +#else +/* Fills urn_str with a UUID based on random numbers in the format + required for the WARC-Record-Id header. + (See RFC 4122, UUID version 4.) + + Note: this is a fallback method, it is much better to use the + methods provided by libuuid. + + The string will be 47 characters long. */ +void +warc_uuid_str (char *urn_str) +{ + // RFC 4122, a version 4 UUID with only random numbers + + unsigned char uuid_data[16]; + int i; + for (i=0; i<16; i++) + uuid_data[i] = random_number (255); + + // Set the four most significant bits (bits 12 through 15) of the + // time_hi_and_version field to the 4-bit version number + uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40; + + // Set the two most significant bits (bits 6 and 7) of the + // clock_seq_hi_and_reserved to zero and one, respectively. + uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80; + + sprintf (urn_str, + "<urn:uuid:%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x>", + uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4], + uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9], + uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14], + uuid_data[15]); +} +#endif + +/* Write a warcinfo record to the current file. + Updates warc_current_warcinfo_uuid_str. */ +static bool +warc_write_warcinfo_record (char *filename) +{ + /* Write warc-info record as the first record of the file. */ + /* We add the record id of this info record to the other records in the + file. 
*/ + warc_current_warcinfo_uuid_str = (char *) malloc (48); + warc_uuid_str (warc_current_warcinfo_uuid_str); + + char timestamp[22]; + warc_timestamp (timestamp); + + char *filename_copy, *filename_basename; + filename_copy = strdup (filename); + filename_basename = strdup (basename (filename_copy)); + + warc_write_start_record (); + warc_write_header ("WARC-Type", "warcinfo"); + warc_write_header ("Content-Type", "application/warc-fields"); + warc_write_header ("WARC-Date", timestamp); + warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str); + warc_write_header ("WARC-Filename", filename_basename); + + /* Create content. */ + FILE *warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + { + free (filename_copy); + free (filename_basename); + return false; + } + + fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE); + fprintf (warc_tmp, "format: WARC File Format 1.0\r\n"); + fprintf (warc_tmp, +"conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"); + fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off")); + fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring); + /* Add the user headers, if any. */ + if (opt.warc_user_headers) + { + int i; + for (i = 0; opt.warc_user_headers[i]; i++) + fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]); + } + fprintf(warc_tmp, "\r\n"); + + warc_write_digest_headers (warc_tmp, -1); + warc_write_block_from_file (warc_tmp); + warc_write_end_record (); + + if (! warc_write_ok) + logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n")); + + free (filename_copy); + free (filename_basename); + fclose (warc_tmp); + return warc_write_ok; +} + +/* Opens a new WARC file. + If META is true, generates a filename ending with 'meta.warc.gz'. + + This method will: + 1. close the current WARC file (if there is one); + 2. increment warc_current_file_number; + 3. open a new WARC file; + 4. write the initial warcinfo record. 
+ + Returns true on success, false otherwise. + */ +static bool +warc_start_new_file (bool meta) +{ + if (opt.warc_filename == NULL) + return false; + + if (warc_current_file != NULL) + fclose (warc_current_file); + if (warc_current_warcinfo_uuid_str) + free (warc_current_warcinfo_uuid_str); + if (warc_current_filename) + free (warc_current_filename); + + warc_current_file_number++; + + int base_filename_length = strlen (opt.warc_filename); + /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */ + char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1); + warc_current_filename = new_filename; + +#ifdef HAVE_LIBZ + const char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc"); +#else + const char *extension = "warc"; +#endif + + /* If max size is enabled, we add a serial number to the file names. */ + if (meta) + sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension); + else if (opt.warc_maxsize > 0) + { + sprintf (new_filename, "%s-%05d.%s", opt.warc_filename, + warc_current_file_number, extension); + } + else + sprintf (new_filename, "%s.%s", opt.warc_filename, extension); + + logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename)); + + /* Open the WARC file. */ + warc_current_file = fopen (new_filename, "wb+"); + if (warc_current_file == NULL) + { + logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), + quote (new_filename)); + return false; + } + + if (! warc_write_warcinfo_record (new_filename)) + return false; + + /* Add warcinfo uuid to manifest. */ + if (warc_manifest_fp) + fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str); + + return true; +} + +/* Opens the CDX file for output. 
*/ +static bool +warc_start_cdx_file (void) +{ + int filename_length = strlen (opt.warc_filename); + char *cdx_filename = alloca (filename_length + 4 + 1); + memcpy (cdx_filename, opt.warc_filename, filename_length); + memcpy (cdx_filename + filename_length, ".cdx", 5); + warc_current_cdx_file = fopen (cdx_filename, "a+"); + if (warc_current_cdx_file == NULL) + return false; + + /* Print the CDX header. + * + * a - original url + * b - date + * m - mime type + * s - response code + * k - new style checksum + * r - redirect + * M - meta tags + * V - compressed arc file offset + * g - file name + * u - record-id + */ + fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n"); + fflush (warc_current_cdx_file); + + return true; +} + +#define CDX_FIELDSEP " \t\r\n" + +/* Parse the CDX header and find the field numbers of the original url, + checksum and record ID fields. */ +static bool +warc_parse_cdx_header (char *lineptr, int *field_num_original_url, + int *field_num_checksum, int *field_num_record_id) +{ + *field_num_original_url = -1; + *field_num_checksum = -1; + *field_num_record_id = -1; + + char *token; + char *save_ptr; + token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr); + + if (token != NULL && strcmp (token, "CDX") == 0) + { + int field_num = 0; + while (token != NULL) + { + token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr); + if (token != NULL) + { + switch (token[0]) + { + case 'a': + *field_num_original_url = field_num; + break; + case 'k': + *field_num_checksum = field_num; + break; + case 'u': + *field_num_record_id = field_num; + break; + } + } + field_num++; + } + } + + return *field_num_original_url != -1 + && *field_num_checksum != -1 + && *field_num_record_id != -1; +} + +/* Parse the CDX record and add it to the warc_cdx_dedup_table hash table. 
*/ +static void +warc_process_cdx_line (char *lineptr, int field_num_original_url, + int field_num_checksum, int field_num_record_id) +{ + char *original_url = NULL; + char *checksum = NULL; + char *record_id = NULL; + + char *token; + char *save_ptr; + token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr); + + /* Read this line to get the fields we need. */ + int field_num = 0; + while (token != NULL) + { + char **val; + if (field_num == field_num_original_url) + val = &original_url; + else if (field_num == field_num_checksum) + val = &checksum; + else if (field_num == field_num_record_id) + val = &record_id; + else + val = NULL; + + if (val != NULL) + *val = strdup (token); + + token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr); + field_num++; + } + + if (original_url != NULL && checksum != NULL && record_id != NULL) + { + /* For some extra efficiency, we decode the base32 encoded + checksum value. This should produce exactly SHA1_DIGEST_SIZE + bytes. */ + size_t checksum_l; + char * checksum_v; + base32_decode_alloc (checksum, strlen (checksum), &checksum_v, + &checksum_l); + free (checksum); + + if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE) + { + /* This is a valid line with a valid checksum. */ + struct warc_cdx_record *rec; + rec = malloc (sizeof (struct warc_cdx_record)); + rec->url = original_url; + rec->uuid = record_id; + memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE); + hash_table_put (warc_cdx_dedup_table, rec->digest, rec); + free (checksum_v); + } + else + { + free (original_url); + if (checksum_v != NULL) + free (checksum_v); + free (record_id); + } + } + else + { + xfree_null(checksum); + xfree_null(original_url); + xfree_null(record_id); + } +} + +/* Loads the CDX file from opt.warc_cdx_dedup_filename and fills + the warc_cdx_dedup_table. 
*/ +static bool +warc_load_cdx_dedup_file (void) +{ + FILE *f = fopen (opt.warc_cdx_dedup_filename, "r"); + if (f == NULL) + return false; + + int field_num_original_url = -1; + int field_num_checksum = -1; + int field_num_record_id = -1; + + char *lineptr = NULL; + size_t n = 0; + ssize_t line_length; + + /* The first line should contain the CDX header. + Format: " CDX x x x x x" + where x are field type indicators. For our purposes, we only + need 'a' (the original url), 'k' (the SHA1 checksum) and + 'u' (the WARC record id). */ + line_length = getline (&lineptr, &n, f); + if (line_length != -1) + warc_parse_cdx_header (lineptr, &field_num_original_url, + &field_num_checksum, &field_num_record_id); + + /* If the file contains all three fields, read the complete file. */ + if (field_num_original_url == -1 + || field_num_checksum == -1 + || field_num_record_id == -1) + { + if (field_num_original_url == -1) + logprintf (LOG_NOTQUIET, +_("CDX file does not list original urls. (Missing column 'a'.)\n")); + if (field_num_checksum == -1) + logprintf (LOG_NOTQUIET, +_("CDX file does not list checksums. (Missing column 'k'.)\n")); + if (field_num_record_id == -1) + logprintf (LOG_NOTQUIET, +_("CDX file does not list record ids. (Missing column 'u'.)\n")); + } + else + { + /* Initialize the table. */ + warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, + warc_cmp_sha1_digest); + + do + { + line_length = getline (&lineptr, &n, f); + if (line_length != -1) + { + warc_process_cdx_line (lineptr, field_num_original_url, + field_num_checksum, field_num_record_id); + } + + } + while (line_length != -1); + + /* Print results. 
*/ + int nrecords = hash_table_count (warc_cdx_dedup_table); + logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n", + "Loaded %d records from CDX.\n\n", + nrecords), + nrecords); + } + + free (lineptr); + fclose (f); + + return true; +} +#undef CDX_FIELDSEP + +/* Returns the existing duplicate CDX record for the given url and payload + digest. Returns NULL if the url is not found or if the payload digest + does not match, or if CDX deduplication is disabled. */ +static struct warc_cdx_record * +warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload) +{ + if (warc_cdx_dedup_table == NULL) + return NULL; + + char *key; + struct warc_cdx_record *rec_existing; + int found = hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, + &key, &rec_existing); + + if (found && strcmp (rec_existing->url, url) == 0) + return rec_existing; + else + return NULL; +} + +/* Initializes the WARC writer (if opt.warc_filename is set). + This should be called before any WARC record is written. */ +void +warc_init (void) +{ + warc_write_ok = true; + + if (opt.warc_filename != NULL) + { + if (opt.warc_cdx_dedup_filename != NULL) + { + if (! warc_load_cdx_dedup_file ()) + { + logprintf (LOG_NOTQUIET, + _("Could not read CDX file %s for deduplication.\n"), + quote (opt.warc_cdx_dedup_filename)); + exit(1); + } + } + + warc_manifest_fp = warc_tempfile (); + if (warc_manifest_fp == NULL) + { + logprintf (LOG_NOTQUIET, + _("Could not open temporary WARC manifest file.\n")); + exit(1); + } + + if (opt.warc_keep_log) + { + warc_log_fp = warc_tempfile (); + if (warc_log_fp == NULL) + { + logprintf (LOG_NOTQUIET, + _("Could not open temporary WARC log file.\n")); + exit(1); + } + log_set_warc_log_fp (warc_log_fp); + } + + warc_current_file_number = -1; + if (! warc_start_new_file (false)) + { + logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n")); + exit(1); + } + + if (opt.warc_cdx_enabled) + { + if (! 
warc_start_cdx_file ()) + { + logprintf (LOG_NOTQUIET, + _("Could not open CDX file for output.\n")); + exit(1); + } + } + } +} + +/* Writes metadata (manifest, configuration, log file) to the WARC file. */ +static void +warc_write_metadata (void) +{ + /* If there are multiple WARC files, the metadata should be written to a separate file. */ + if (opt.warc_maxsize > 0) + warc_start_new_file (true); + + char manifest_uuid [48]; + warc_uuid_str (manifest_uuid); + + fflush (warc_manifest_fp); + warc_write_resource_record (manifest_uuid, + "metadata://gnu.org/software/wget/warc/MANIFEST.txt", + NULL, NULL, NULL, "text/plain", + warc_manifest_fp, -1); + /* warc_write_resource_record has closed warc_manifest_fp. */ + + FILE * warc_tmp_fp = warc_tempfile (); + if (warc_tmp_fp == NULL) + { + logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n")); + exit(1); + } + fflush (warc_tmp_fp); + fprintf (warc_tmp_fp, "%s\n", program_argstring); + + warc_write_resource_record (manifest_uuid, + "metadata://gnu.org/software/wget/warc/wget_arguments.txt", + NULL, NULL, NULL, "text/plain", + warc_tmp_fp, -1); + /* warc_write_resource_record has closed warc_tmp_fp. */ + + if (warc_log_fp != NULL) + { + warc_write_resource_record (NULL, + "metadata://gnu.org/software/wget/warc/wget.log", + NULL, manifest_uuid, NULL, "text/plain", + warc_log_fp, -1); + /* warc_write_resource_record has closed warc_log_fp. */ + + warc_log_fp = NULL; + log_set_warc_log_fp (NULL); + } +} + +/* Finishes the WARC writing. + This should be called at the end of the program. */ +void +warc_close (void) +{ + if (warc_current_file != NULL) + { + warc_write_metadata (); + free (warc_current_warcinfo_uuid_str); + fclose (warc_current_file); + } + if (warc_current_cdx_file != NULL) + fclose (warc_current_cdx_file); + if (warc_log_fp != NULL) + { + fclose (warc_log_fp); + log_set_warc_log_fp (NULL); + } +} + +/* Creates a temporary file for writing WARC output. 
+ The temporary file will be created in opt.warc_tempdir. + Returns the pointer to the temporary file, or NULL. */ +FILE * +warc_tempfile (void) +{ + char filename[100]; + if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1) + return NULL; + + int fd = mkstemp (filename); + if (fd < 0) + return NULL; + + if (unlink (filename) < 0) + return NULL; + + return fdopen (fd, "wb+"); +} + + +/* Writes a request record to the WARC file. + url is the target uri of the request, + timestamp_str is the timestamp of the request (generated with warc_timestamp), + record_uuid is the uuid of the request (generated with warc_uuid_str), + body is a pointer to a file containing the request headers and body. + ip is the ip address of the server (or NULL), + Calling this function will close body. + Returns true on success, false on error. */ +bool +warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, + ip_address *ip, FILE *body, off_t payload_offset) +{ + warc_write_start_record (); + warc_write_header ("WARC-Type", "request"); + warc_write_header ("WARC-Target-URI", url); + warc_write_header ("Content-Type", "application/http;msgtype=request"); + warc_write_date_header (timestamp_str); + warc_write_header ("WARC-Record-ID", record_uuid); + warc_write_ip_header (ip); + warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str); + warc_write_digest_headers (body, payload_offset); + warc_write_block_from_file (body); + warc_write_end_record (); + + fclose (body); + + return warc_write_ok; +} + +/* Writes a response record to the CDX file. 
+ url is the target uri of the request/response, + timestamp_str is the timestamp of the request that generated this response, + (generated with warc_timestamp), + mime_type is the mime type of the response body (will be printed to CDX), + response_code is the HTTP response code (will be printed to CDX), + payload_digest is the sha1 digest of the payload, + redirect_location is the contents of the Location: header, or NULL (will be printed to CDX), + offset is the position of the WARC record in the WARC file, + warc_filename is the filename of the WARC, + response_uuid is the uuid of the response. + Returns true on success, false on error. */ +static bool +warc_write_cdx_record (const char *url, const char *timestamp_str, + const char *mime_type, int response_code, + const char *payload_digest, const char *redirect_location, + off_t offset, const char *warc_filename, + const char *response_uuid) +{ + /* Transform the timestamp. */ + char timestamp_str_cdx [15]; + memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */ + memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */ + memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */ + memcpy (timestamp_str_cdx + 8, timestamp_str + 11, 2); /* "HH" ":" */ + memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM" ":" */ + memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS" "Z" */ + timestamp_str_cdx[14] = '\0'; + + /* Rewrite the checksum. */ + const char *checksum; + if (payload_digest != NULL) + checksum = payload_digest + 5; /* Skip the "sha1:" */ + else + checksum = "-"; + + if (mime_type == NULL || strlen(mime_type) == 0) + mime_type = "-"; + if (redirect_location == NULL || strlen(redirect_location) == 0) + redirect_location = "-"; + + /* Print the CDX line. 
*/ + fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, + timestamp_str_cdx, url, mime_type, response_code, checksum, + redirect_location, offset, warc_current_filename, response_uuid); + fflush (warc_current_cdx_file); + + return true; +} + +/* Writes a revisit record to the WARC file. + url is the target uri of the request/response, + timestamp_str is the timestamp of the request that generated this response + (generated with warc_timestamp), + concurrent_to_uuid is the uuid of the request for that generated this response + (generated with warc_uuid_str), + refers_to_uuid is the uuid of the original response + (generated with warc_uuid_str), + payload_digest is the sha1 digest of the payload, + ip is the ip address of the server (or NULL), + body is a pointer to a file containing the response headers (without payload). + Calling this function will close body. + Returns true on success, false on error. */ +static bool +warc_write_revisit_record (char *url, char *timestamp_str, + char *concurrent_to_uuid, char *payload_digest, + char *refers_to, ip_address *ip, FILE *body) +{ + char revisit_uuid [48]; + warc_uuid_str (revisit_uuid); + + char *block_digest = NULL; + char sha1_res_block[SHA1_DIGEST_SIZE]; + sha1_stream (body, sha1_res_block); + block_digest = warc_base32_sha1_digest (sha1_res_block); + + warc_write_start_record (); + warc_write_header ("WARC-Type", "revisit"); + warc_write_header ("WARC-Record-ID", revisit_uuid); + warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str); + warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid); + warc_write_header ("WARC-Refers-To", refers_to); + warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest"); + warc_write_header ("WARC-Truncated", "length"); + warc_write_header ("WARC-Target-URI", url); + warc_write_date_header (timestamp_str); + warc_write_ip_header (ip); + warc_write_header ("Content-Type", 
"application/http;msgtype=response"); + warc_write_header ("WARC-Block-Digest", block_digest); + warc_write_header ("WARC-Payload-Digest", payload_digest); + warc_write_block_from_file (body); + warc_write_end_record (); + + fclose (body); + free (block_digest); + + return warc_write_ok; +} + +/* Writes a response record to the WARC file. + url is the target uri of the request/response, + timestamp_str is the timestamp of the request that generated this response + (generated with warc_timestamp), + concurrent_to_uuid is the uuid of the request for that generated this response + (generated with warc_uuid_str), + ip is the ip address of the server (or NULL), + body is a pointer to a file containing the response headers and body. + mime_type is the mime type of the response body (will be printed to CDX), + response_code is the HTTP response code (will be printed to CDX), + redirect_location is the contents of the Location: header, or NULL (will be printed to CDX), + Calling this function will close body. + Returns true on success, false on error. */ +bool +warc_write_response_record (char *url, char *timestamp_str, + char *concurrent_to_uuid, ip_address *ip, + FILE *body, off_t payload_offset, char *mime_type, + int response_code, char *redirect_location) +{ + char *block_digest = NULL; + char *payload_digest = NULL; + char sha1_res_block[SHA1_DIGEST_SIZE]; + char sha1_res_payload[SHA1_DIGEST_SIZE]; + + if (opt.warc_digests_enabled) + { + /* Calculate the block and payload digests. */ + rewind (body); + if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, + payload_offset) == 0) + { + /* Decide (based on url + payload digest) if we have seen this + data before. */ + struct warc_cdx_record *rec_existing; + rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload); + if (rec_existing != NULL) + { + bool result; + + /* Found an existing record. */ + logprintf (LOG_VERBOSE, + _("Found exact match in CDX file. 
Saving revisit record to WARC.\n")); + + /* Remove the payload from the file. */ + if (payload_offset > 0) + { + if (ftruncate (fileno (body), payload_offset) == -1) + return false; + } + + /* Send the original payload digest. */ + payload_digest = warc_base32_sha1_digest (sha1_res_payload); + result = warc_write_revisit_record (url, timestamp_str, + concurrent_to_uuid, payload_digest, rec_existing->uuid, + ip, body); + free (payload_digest); + + return result; + } + + block_digest = warc_base32_sha1_digest (sha1_res_block); + payload_digest = warc_base32_sha1_digest (sha1_res_payload); + } + } + + /* Not a revisit, just store the record. */ + + char response_uuid [48]; + warc_uuid_str (response_uuid); + + fseeko (warc_current_file, 0L, SEEK_END); + off_t offset = ftello (warc_current_file); + + warc_write_start_record (); + warc_write_header ("WARC-Type", "response"); + warc_write_header ("WARC-Record-ID", response_uuid); + warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str); + warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid); + warc_write_header ("WARC-Target-URI", url); + warc_write_date_header (timestamp_str); + warc_write_ip_header (ip); + warc_write_header ("WARC-Block-Digest", block_digest); + warc_write_header ("WARC-Payload-Digest", payload_digest); + warc_write_header ("Content-Type", "application/http;msgtype=response"); + warc_write_block_from_file (body); + warc_write_end_record (); + + fclose (body); + + if (warc_write_ok && opt.warc_cdx_enabled) + { + /* Add this record to the CDX. */ + warc_write_cdx_record (url, timestamp_str, mime_type, response_code, + payload_digest, redirect_location, offset, warc_current_filename, + response_uuid); + } + + if (block_digest) + free (block_digest); + if (payload_digest) + free (payload_digest); + + return warc_write_ok; +} + +/* Writes a resource record to the WARC file. 
+ resource_uuid is the uuid of the resource (or NULL), + url is the target uri of the resource, + timestamp_str is the timestamp (generated with warc_timestamp), + concurrent_to_uuid is the uuid of the request for that generated this + resource (generated with warc_uuid_str) or NULL, + ip is the ip address of the server (or NULL), + content_type is the mime type of the body (or NULL), + body is a pointer to a file containing the resource data. + Calling this function will close body. + Returns true on success, false on error. */ +bool +warc_write_resource_record (char *resource_uuid, const char *url, + const char *timestamp_str, const char *concurrent_to_uuid, + ip_address *ip, const char *content_type, FILE *body, + off_t payload_offset) +{ + if (resource_uuid == NULL) + { + resource_uuid = alloca (48); + warc_uuid_str (resource_uuid); + } + + if (content_type == NULL) + content_type = "application/octet-stream"; + + warc_write_start_record (); + warc_write_header ("WARC-Type", "resource"); + warc_write_header ("WARC-Record-ID", resource_uuid); + warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str); + warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid); + warc_write_header ("WARC-Target-URI", url); + warc_write_date_header (timestamp_str); + warc_write_ip_header (ip); + warc_write_digest_headers (body, payload_offset); + warc_write_header ("Content-Type", content_type); + warc_write_block_from_file (body); + warc_write_end_record (); + + fclose (body); + + return warc_write_ok; +} diff --git a/src/warc.h b/src/warc.h new file mode 100644 index 0000000..eba640d --- /dev/null +++ b/src/warc.h @@ -0,0 +1,23 @@ +/* Declarations of WARC helper methods. 
*/ +#ifndef WARC_H +#define WARC_H + +#include "host.h" + +void warc_init (void); +void warc_close (void); +void warc_timestamp (char *timestamp); +void warc_uuid_str (char *id_str); + +FILE * warc_tempfile (void); + +bool warc_write_request_record (char *url, char *timestamp_str, + char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset); +bool warc_write_response_record (char *url, char *timestamp_str, + char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset, + char *mime_type, int response_code, char *redirect_location); +bool warc_write_resource_record (char *resource_uuid, const char *url, + const char *timestamp_str, const char *concurrent_to_uuid, ip_address *ip, + const char *content_type, FILE *body, off_t payload_offset); + +#endif /* WARC_H */ @@ -353,7 +353,9 @@ typedef enum PROXERR, /* 50 */ AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED, VERIFCERTERR, - UNLINKERR, NEWLOCATION_KEEP_POST + UNLINKERR, NEWLOCATION_KEEP_POST, CLOSEFAILED, + + WARC_ERR, WARC_TMP_FOPENERR, WARC_TMP_FWRITEERR } uerr_t; /* 2005-02-19 SMS. |