diff options
author | DongHun Kwak <dh0128.kwak@samsung.com> | 2021-03-05 10:08:13 +0900 |
---|---|---|
committer | DongHun Kwak <dh0128.kwak@samsung.com> | 2021-03-05 10:08:13 +0900 |
commit | 61fb427c104d6f2334aaee9c7d90d8287d521b17 (patch) | |
tree | e4caa30fa69cb51705f22392cab8cf9c04dffb94 /src/warc.c | |
parent | 1478f6ab011981e9a986c0c30da680635d3e77bf (diff) | |
download | wget-61fb427c104d6f2334aaee9c7d90d8287d521b17.tar.gz wget-61fb427c104d6f2334aaee9c7d90d8287d521b17.tar.bz2 wget-61fb427c104d6f2334aaee9c7d90d8287d521b17.zip |
Imported Upstream version 1.16.1 (tag: upstream/1.16.1)
Diffstat (limited to 'src/warc.c')
-rw-r--r-- | src/warc.c | 251 |
1 files changed, 140 insertions, 111 deletions
@@ -27,16 +27,18 @@ Corresponding Source for a non-source form of such a combination shall include the source code for the parts of OpenSSL used as well as that of the covered work. */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif #include "wget.h" #include "hash.h" #include "utils.h" +#include "version.h" #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <strings.h> #include <time.h> #include <tmpdir.h> #include <sha1.h> @@ -45,8 +47,11 @@ as that of the covered work. */ #ifdef HAVE_LIBZ #include <zlib.h> #endif + #ifdef HAVE_LIBUUID #include <uuid/uuid.h> +#elif HAVE_UUID_CREATE +#include <uuid.h> #endif #ifndef WINDOWS @@ -56,16 +61,12 @@ as that of the covered work. */ #endif #include "warc.h" +#include "exits.h" #ifndef O_TEMPORARY #define O_TEMPORARY 0 #endif -extern char *version_string; - -/* Set by main in main.c */ -extern char *program_argstring; - /* The log file (a temporary file that contains a copy of the wget log). */ @@ -108,7 +109,7 @@ static char *warc_current_filename; static int warc_current_file_number; /* The table of CDX records, if deduplication is enabled. */ -struct hash_table * warc_cdx_dedup_table; +static struct hash_table * warc_cdx_dedup_table; static bool warc_start_new_file (bool meta); @@ -160,10 +161,12 @@ warc_write_buffer (const char *buffer, size_t size) static bool warc_write_string (const char *str) { + size_t n; + if (!warc_write_ok) return false; - size_t n = strlen (str); + n = strlen (str); if (n != warc_write_buffer (str, n)) warc_write_ok = false; @@ -252,6 +255,9 @@ warc_write_block_from_file (FILE *data_in) { /* Add the Content-Length header. 
*/ char content_length[MAX_INT_TO_STRING_LEN(off_t)]; + char buffer[BUFSIZ]; + size_t s; + fseeko (data_in, 0L, SEEK_END); number_to_string (content_length, ftello (data_in)); warc_write_header ("Content-Length", content_length); @@ -263,8 +269,6 @@ warc_write_block_from_file (FILE *data_in) warc_write_ok = false; /* Copy the data in the file to the WARC record. */ - char buffer[BUFSIZ]; - size_t s; while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0) { if (warc_write_buffer (buffer, s) < s) @@ -289,6 +293,11 @@ warc_write_end_record (void) /* We start a new gzip stream for each record. */ if (warc_write_ok && warc_current_gzfile) { + char extra_header[EXTRA_GZIP_HEADER_SIZE]; + char static_header[GZIP_STATIC_HEADER_SIZE]; + off_t current_offset, uncompressed_size, compressed_size; + size_t result; + if (gzclose (warc_current_gzfile) != Z_OK) { warc_write_ok = false; @@ -314,17 +323,16 @@ warc_write_end_record (void) */ /* Calculate the uncompressed and compressed sizes. */ - off_t current_offset = ftello (warc_current_file); - off_t uncompressed_size = current_offset - warc_current_gzfile_offset; - off_t compressed_size = warc_current_gzfile_uncompressed_size; + current_offset = ftello (warc_current_file); + uncompressed_size = current_offset - warc_current_gzfile_offset; + compressed_size = warc_current_gzfile_uncompressed_size; /* Go back to the static GZIP header. */ fseeko (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET); /* Read the header. */ - char static_header[GZIP_STATIC_HEADER_SIZE]; - size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, + result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file); if (result != GZIP_STATIC_HEADER_SIZE) { @@ -341,7 +349,6 @@ warc_write_end_record (void) fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file); /* Prepare the extra GZIP header. 
*/ - char extra_header[EXTRA_GZIP_HEADER_SIZE]; /* XLEN, the length of the extra header fields. */ extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255); extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255; @@ -383,13 +390,10 @@ warc_write_end_record (void) static bool warc_write_date_header (const char *timestamp) { - if (timestamp == NULL) - { - char current_timestamp[21]; - warc_timestamp (current_timestamp); - timestamp = current_timestamp; - } - return warc_write_header ("WARC-Date", timestamp); + char current_timestamp[21]; + + return warc_write_header ("WARC-Date", timestamp ? timestamp : + warc_timestamp (current_timestamp, sizeof(current_timestamp))); } /* Writes the WARC-IP-Address header for the given IP to @@ -465,7 +469,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, or EWOULDBLOCK. */ if (ferror (stream)) { - free (buffer); + xfree (buffer); return 1; } goto process_partial_block; @@ -525,7 +529,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, sha1_finish_ctx (&ctx_block, res_block); if (payload_offset >= 0) sha1_finish_ctx (&ctx_payload, res_payload); - free (buffer); + xfree (buffer); return 0; #undef BLOCKSIZE @@ -567,13 +571,13 @@ warc_write_digest_headers (FILE *file, long payload_offset) digest = warc_base32_sha1_digest (sha1_res_block); warc_write_header ("WARC-Block-Digest", digest); - free (digest); + xfree (digest); if (payload_offset >= 0) { digest = warc_base32_sha1_digest (sha1_res_payload); warc_write_header ("WARC-Payload-Digest", digest); - free (digest); + xfree (digest); } } } @@ -584,17 +588,19 @@ warc_write_digest_headers (FILE *file, long payload_offset) The UTC time is formatted following ISO 8601, as required for use in the WARC-Date header. The timestamp will be 21 characters long. 
*/ -void -warc_timestamp (char *timestamp) +char * +warc_timestamp (char *timestamp, size_t timestamp_size) { - time_t rawtime; - struct tm * timeinfo; - time ( &rawtime ); - timeinfo = gmtime (&rawtime); - strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo); + time_t rawtime = time (NULL); + struct tm * timeinfo = gmtime (&rawtime); + + if (strftime (timestamp, timestamp_size, "%Y-%m-%dT%H:%M:%SZ", timeinfo) == 0 && timestamp_size > 0) + *timestamp = 0; + + return timestamp; } -#ifdef HAVE_LIBUUID +#if HAVE_LIBUUID || HAVE_UUID_CREATE /* Fills urn_str with a UUID in the format required for the WARC-Record-Id header. The string will be 47 characters long. */ @@ -604,8 +610,13 @@ warc_uuid_str (char *urn_str) char uuid_str[37]; uuid_t record_id; +#if HAVE_UUID_CREATE + uuid_create (&record_id, NULL); + uuid_to_string (&record_id, &uuid_str, NULL); +#else uuid_generate (record_id); uuid_unparse (record_id, uuid_str); +#endif sprintf (urn_str, "<urn:uuid:%s>", uuid_str); } @@ -621,19 +632,19 @@ warc_uuid_str (char *urn_str) void warc_uuid_str (char *urn_str) { - // RFC 4122, a version 4 UUID with only random numbers + /* RFC 4122, a version 4 UUID with only random numbers */ unsigned char uuid_data[16]; int i; for (i=0; i<16; i++) uuid_data[i] = random_number (255); - // Set the four most significant bits (bits 12 through 15) of the - // time_hi_and_version field to the 4-bit version number + /* Set the four most significant bits (bits 12 through 15) of the + * time_hi_and_version field to the 4-bit version number */ uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40; - // Set the two most significant bits (bits 6 and 7) of the - // clock_seq_hi_and_reserved to zero and one, respectively. + /* Set the two most significant bits (bits 6 and 7) of the + * clock_seq_hi_and_reserved to zero and one, respectively. 
*/ uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80; sprintf (urn_str, @@ -650,16 +661,18 @@ warc_uuid_str (char *urn_str) static bool warc_write_warcinfo_record (char *filename) { + FILE *warc_tmp; + char timestamp[22]; + char *filename_copy, *filename_basename; + /* Write warc-info record as the first record of the file. */ /* We add the record id of this info record to the other records in the file. */ warc_current_warcinfo_uuid_str = (char *) malloc (48); warc_uuid_str (warc_current_warcinfo_uuid_str); - char timestamp[22]; - warc_timestamp (timestamp); + warc_timestamp (timestamp, sizeof(timestamp)); - char *filename_copy, *filename_basename; filename_copy = strdup (filename); filename_basename = strdup (basename (filename_copy)); @@ -671,11 +684,11 @@ warc_write_warcinfo_record (char *filename) warc_write_header ("WARC-Filename", filename_basename); /* Create content. */ - FILE *warc_tmp = warc_tempfile (); + warc_tmp = warc_tempfile (); if (warc_tmp == NULL) { - free (filename_copy); - free (filename_basename); + xfree (filename_copy); + xfree (filename_basename); return false; } @@ -701,8 +714,8 @@ warc_write_warcinfo_record (char *filename) if (! 
warc_write_ok) logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n")); - free (filename_copy); - free (filename_basename); + xfree (filename_copy); + xfree (filename_basename); fclose (warc_tmp); return warc_write_ok; } @@ -721,22 +734,6 @@ warc_write_warcinfo_record (char *filename) static bool warc_start_new_file (bool meta) { - if (opt.warc_filename == NULL) - return false; - - if (warc_current_file != NULL) - fclose (warc_current_file); - - free (warc_current_warcinfo_uuid_str); - free (warc_current_filename); - - warc_current_file_number++; - - int base_filename_length = strlen (opt.warc_filename); - /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */ - char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1); - warc_current_filename = new_filename; - #ifdef __VMS # define WARC_GZ "warc-gz" #else /* def __VMS */ @@ -749,6 +746,25 @@ warc_start_new_file (bool meta) const char *extension = "warc"; #endif + int base_filename_length; + char *new_filename; + + if (opt.warc_filename == NULL) + return false; + + if (warc_current_file != NULL) + fclose (warc_current_file); + + xfree (warc_current_warcinfo_uuid_str); + xfree (warc_current_filename); + + warc_current_file_number++; + + base_filename_length = strlen (opt.warc_filename); + /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */ + new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1); + warc_current_filename = new_filename; + /* If max size is enabled, we add a serial number to the file names. 
*/ if (meta) sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension); @@ -820,12 +836,13 @@ static bool warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id) { + char *token; + char *save_ptr; + *field_num_original_url = -1; *field_num_checksum = -1; *field_num_record_id = -1; - char *token; - char *save_ptr; token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr); if (token != NULL && strcmp (token, "CDX") == 0) @@ -866,13 +883,12 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, char *original_url = NULL; char *checksum = NULL; char *record_id = NULL; - char *token; char *save_ptr; - token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr); + int field_num = 0; /* Read this line to get the fields we need. */ - int field_num = 0; + token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr); while (token != NULL) { char **val; @@ -901,7 +917,7 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, char * checksum_v; base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l); - free (checksum); + xfree (checksum); if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE) { @@ -912,20 +928,20 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, rec->uuid = record_id; memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE); hash_table_put (warc_cdx_dedup_table, rec->digest, rec); - free (checksum_v); + xfree (checksum_v); } else { - free (original_url); - free (checksum_v); - free (record_id); + xfree (original_url); + xfree (checksum_v); + xfree (record_id); } } else { - xfree_null(checksum); - xfree_null(original_url); - xfree_null(record_id); + xfree(checksum); + xfree(original_url); + xfree(record_id); } } @@ -934,17 +950,17 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url, static bool warc_load_cdx_dedup_file (void) { - FILE *f = fopen (opt.warc_cdx_dedup_filename, "r"); - if (f == NULL) - return false; - + FILE *f; + char 
*lineptr = NULL; + size_t n = 0; + ssize_t line_length; int field_num_original_url = -1; int field_num_checksum = -1; int field_num_record_id = -1; - char *lineptr = NULL; - size_t n = 0; - ssize_t line_length; + f = fopen (opt.warc_cdx_dedup_filename, "r"); + if (f == NULL) + return false; /* The first line should contain the CDX header. Format: " CDX x x x x x" @@ -973,6 +989,8 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n")); } else { + int nrecords; + /* Initialize the table. */ warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest); @@ -990,14 +1008,14 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n")); while (line_length != -1); /* Print results. */ - int nrecords = hash_table_count (warc_cdx_dedup_table); + nrecords = hash_table_count (warc_cdx_dedup_table); logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n", "Loaded %d records from CDX.\n\n", nrecords), nrecords); } - free (lineptr); + xfree (lineptr); fclose (f); return true; @@ -1010,11 +1028,12 @@ _("CDX file does not list record ids. 
(Missing column 'u'.)\n")); static struct warc_cdx_record * warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload) { + struct warc_cdx_record *rec_existing; + if (warc_cdx_dedup_table == NULL) return NULL; - struct warc_cdx_record *rec_existing - = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload); + rec_existing = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload); if (rec_existing && strcmp (rec_existing->url, url) == 0) return rec_existing; @@ -1038,7 +1057,7 @@ warc_init (void) logprintf (LOG_NOTQUIET, _("Could not read CDX file %s for deduplication.\n"), quote (opt.warc_cdx_dedup_filename)); - exit(1); + exit (WGET_EXIT_GENERIC_ERROR); } } @@ -1047,7 +1066,7 @@ warc_init (void) { logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n")); - exit(1); + exit (WGET_EXIT_GENERIC_ERROR); } if (opt.warc_keep_log) @@ -1057,7 +1076,7 @@ warc_init (void) { logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n")); - exit(1); + exit (WGET_EXIT_GENERIC_ERROR); } log_set_warc_log_fp (warc_log_fp); } @@ -1066,7 +1085,7 @@ warc_init (void) if (! warc_start_new_file (false)) { logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n")); - exit(1); + exit (WGET_EXIT_GENERIC_ERROR); } if (opt.warc_cdx_enabled) @@ -1075,7 +1094,7 @@ warc_init (void) { logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n")); - exit(1); + exit (WGET_EXIT_GENERIC_ERROR); } } } @@ -1085,11 +1104,13 @@ warc_init (void) static void warc_write_metadata (void) { + char manifest_uuid[48]; + FILE *warc_tmp_fp; + /* If there are multiple WARC files, the metadata should be written to a separate file. */ if (opt.warc_maxsize > 0) warc_start_new_file (true); - char manifest_uuid [48]; warc_uuid_str (manifest_uuid); fflush (warc_manifest_fp); @@ -1099,11 +1120,11 @@ warc_write_metadata (void) warc_manifest_fp, -1); /* warc_write_resource_record has closed warc_manifest_fp. 
*/ - FILE * warc_tmp_fp = warc_tempfile (); + warc_tmp_fp = warc_tempfile (); if (warc_tmp_fp == NULL) { logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n")); - exit(1); + exit (WGET_EXIT_GENERIC_ERROR); } fflush (warc_tmp_fp); fprintf (warc_tmp_fp, "%s\n", program_argstring); @@ -1135,7 +1156,7 @@ warc_close (void) if (warc_current_file != NULL) { warc_write_metadata (); - free (warc_current_warcinfo_uuid_str); + xfree (warc_current_warcinfo_uuid_str); fclose (warc_current_file); } if (warc_current_cdx_file != NULL) @@ -1154,6 +1175,8 @@ FILE * warc_tempfile (void) { char filename[100]; + int fd; + if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1) return NULL; @@ -1172,13 +1195,16 @@ warc_tempfile (void) return fopen (tfn, "w+", "fop=tmd"); /* Create auto-delete temp file. */ } #else /* def __VMS */ - int fd = mkostemp (filename, O_TEMPORARY); + fd = mkostemp (filename, O_TEMPORARY); if (fd < 0) return NULL; #if !O_TEMPORARY if (unlink (filename) < 0) - return NULL; + { + close(fd); + return NULL; + } #endif return fdopen (fd, "wb+"); @@ -1231,11 +1257,14 @@ static bool warc_write_cdx_record (const char *url, const char *timestamp_str, const char *mime_type, int response_code, const char *payload_digest, const char *redirect_location, - off_t offset, const char *warc_filename, + off_t offset, const char *warc_filename _GL_UNUSED, const char *response_uuid) { /* Transform the timestamp. */ - char timestamp_str_cdx [15]; + char timestamp_str_cdx[15]; + char offset_string[MAX_INT_TO_STRING_LEN(off_t)]; + const char *checksum; + memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */ memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */ memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */ @@ -1245,7 +1274,6 @@ warc_write_cdx_record (const char *url, const char *timestamp_str, timestamp_str_cdx[14] = '\0'; /* Rewrite the checksum. 
*/ - const char *checksum; if (payload_digest != NULL) checksum = payload_digest + 5; /* Skip the "sha1:" */ else @@ -1256,7 +1284,6 @@ warc_write_cdx_record (const char *url, const char *timestamp_str, if (redirect_location == NULL || strlen(redirect_location) == 0) redirect_location = "-"; - char offset_string[MAX_INT_TO_STRING_LEN(off_t)]; number_to_string (offset_string, offset); /* Print the CDX line. */ @@ -1288,10 +1315,11 @@ warc_write_revisit_record (char *url, char *timestamp_str, char *refers_to, ip_address *ip, FILE *body) { char revisit_uuid [48]; - warc_uuid_str (revisit_uuid); - char *block_digest = NULL; char sha1_res_block[SHA1_DIGEST_SIZE]; + + warc_uuid_str (revisit_uuid); + sha1_stream (body, sha1_res_block); block_digest = warc_base32_sha1_digest (sha1_res_block); @@ -1313,7 +1341,7 @@ warc_write_revisit_record (char *url, char *timestamp_str, warc_write_end_record (); fclose (body); - free (block_digest); + xfree (block_digest); return warc_write_ok; } @@ -1341,6 +1369,8 @@ warc_write_response_record (char *url, char *timestamp_str, char *payload_digest = NULL; char sha1_res_block[SHA1_DIGEST_SIZE]; char sha1_res_payload[SHA1_DIGEST_SIZE]; + char response_uuid [48]; + off_t offset; if (opt.warc_digests_enabled) { @@ -1373,7 +1403,7 @@ warc_write_response_record (char *url, char *timestamp_str, result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body); - free (payload_digest); + xfree (payload_digest); return result; } @@ -1385,11 +1415,10 @@ warc_write_response_record (char *url, char *timestamp_str, /* Not a revisit, just store the record. 
*/ - char response_uuid [48]; warc_uuid_str (response_uuid); fseeko (warc_current_file, 0L, SEEK_END); - off_t offset = ftello (warc_current_file); + offset = ftello (warc_current_file); warc_write_start_record (); warc_write_header ("WARC-Type", "response"); @@ -1415,8 +1444,8 @@ warc_write_response_record (char *url, char *timestamp_str, response_uuid); } - free (block_digest); - free (payload_digest); + xfree (block_digest); + xfree (payload_digest); return warc_write_ok; } |