summary | refs | log | tree | commit | diff
path: root/src/warc.c
diff options
context:
space:
mode:
author DongHun Kwak <dh0128.kwak@samsung.com> 2021-03-05 10:08:13 +0900
committer DongHun Kwak <dh0128.kwak@samsung.com> 2021-03-05 10:08:13 +0900
commit 61fb427c104d6f2334aaee9c7d90d8287d521b17 (patch)
tree e4caa30fa69cb51705f22392cab8cf9c04dffb94 /src/warc.c
parent 1478f6ab011981e9a986c0c30da680635d3e77bf (diff)
download wget-61fb427c104d6f2334aaee9c7d90d8287d521b17.tar.gz
wget-61fb427c104d6f2334aaee9c7d90d8287d521b17.tar.bz2
wget-61fb427c104d6f2334aaee9c7d90d8287d521b17.zip
Imported Upstream version 1.16.1 (upstream/1.16.1)
Diffstat (limited to 'src/warc.c')
-rw-r--r-- src/warc.c 251
1 files changed, 140 insertions, 111 deletions
diff --git a/src/warc.c b/src/warc.c
index c3a5857..5bdda1b 100644
--- a/src/warc.c
+++ b/src/warc.c
@@ -27,16 +27,18 @@ Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
+#ifndef _GNU_SOURCE
#define _GNU_SOURCE
+#endif
#include "wget.h"
#include "hash.h"
#include "utils.h"
+#include "version.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <strings.h>
#include <time.h>
#include <tmpdir.h>
#include <sha1.h>
@@ -45,8 +47,11 @@ as that of the covered work. */
#ifdef HAVE_LIBZ
#include <zlib.h>
#endif
+
#ifdef HAVE_LIBUUID
#include <uuid/uuid.h>
+#elif HAVE_UUID_CREATE
+#include <uuid.h>
#endif
#ifndef WINDOWS
@@ -56,16 +61,12 @@ as that of the covered work. */
#endif
#include "warc.h"
+#include "exits.h"
#ifndef O_TEMPORARY
#define O_TEMPORARY 0
#endif
-extern char *version_string;
-
-/* Set by main in main.c */
-extern char *program_argstring;
-
/* The log file (a temporary file that contains a copy
of the wget log). */
@@ -108,7 +109,7 @@ static char *warc_current_filename;
static int warc_current_file_number;
/* The table of CDX records, if deduplication is enabled. */
-struct hash_table * warc_cdx_dedup_table;
+static struct hash_table * warc_cdx_dedup_table;
static bool warc_start_new_file (bool meta);
@@ -160,10 +161,12 @@ warc_write_buffer (const char *buffer, size_t size)
static bool
warc_write_string (const char *str)
{
+ size_t n;
+
if (!warc_write_ok)
return false;
- size_t n = strlen (str);
+ n = strlen (str);
if (n != warc_write_buffer (str, n))
warc_write_ok = false;
@@ -252,6 +255,9 @@ warc_write_block_from_file (FILE *data_in)
{
/* Add the Content-Length header. */
char content_length[MAX_INT_TO_STRING_LEN(off_t)];
+ char buffer[BUFSIZ];
+ size_t s;
+
fseeko (data_in, 0L, SEEK_END);
number_to_string (content_length, ftello (data_in));
warc_write_header ("Content-Length", content_length);
@@ -263,8 +269,6 @@ warc_write_block_from_file (FILE *data_in)
warc_write_ok = false;
/* Copy the data in the file to the WARC record. */
- char buffer[BUFSIZ];
- size_t s;
while (warc_write_ok && (s = fread (buffer, 1, BUFSIZ, data_in)) > 0)
{
if (warc_write_buffer (buffer, s) < s)
@@ -289,6 +293,11 @@ warc_write_end_record (void)
/* We start a new gzip stream for each record. */
if (warc_write_ok && warc_current_gzfile)
{
+ char extra_header[EXTRA_GZIP_HEADER_SIZE];
+ char static_header[GZIP_STATIC_HEADER_SIZE];
+ off_t current_offset, uncompressed_size, compressed_size;
+ size_t result;
+
if (gzclose (warc_current_gzfile) != Z_OK)
{
warc_write_ok = false;
@@ -314,17 +323,16 @@ warc_write_end_record (void)
*/
/* Calculate the uncompressed and compressed sizes. */
- off_t current_offset = ftello (warc_current_file);
- off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
- off_t compressed_size = warc_current_gzfile_uncompressed_size;
+ current_offset = ftello (warc_current_file);
+ uncompressed_size = current_offset - warc_current_gzfile_offset;
+ compressed_size = warc_current_gzfile_uncompressed_size;
/* Go back to the static GZIP header. */
fseeko (warc_current_file, warc_current_gzfile_offset
+ EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
/* Read the header. */
- char static_header[GZIP_STATIC_HEADER_SIZE];
- size_t result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
+ result = fread (static_header, 1, GZIP_STATIC_HEADER_SIZE,
warc_current_file);
if (result != GZIP_STATIC_HEADER_SIZE)
{
@@ -341,7 +349,6 @@ warc_write_end_record (void)
fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
/* Prepare the extra GZIP header. */
- char extra_header[EXTRA_GZIP_HEADER_SIZE];
/* XLEN, the length of the extra header fields. */
extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
@@ -383,13 +390,10 @@ warc_write_end_record (void)
static bool
warc_write_date_header (const char *timestamp)
{
- if (timestamp == NULL)
- {
- char current_timestamp[21];
- warc_timestamp (current_timestamp);
- timestamp = current_timestamp;
- }
- return warc_write_header ("WARC-Date", timestamp);
+ char current_timestamp[21];
+
+ return warc_write_header ("WARC-Date", timestamp ? timestamp :
+ warc_timestamp (current_timestamp, sizeof(current_timestamp)));
}
/* Writes the WARC-IP-Address header for the given IP to
@@ -465,7 +469,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
or EWOULDBLOCK. */
if (ferror (stream))
{
- free (buffer);
+ xfree (buffer);
return 1;
}
goto process_partial_block;
@@ -525,7 +529,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
sha1_finish_ctx (&ctx_block, res_block);
if (payload_offset >= 0)
sha1_finish_ctx (&ctx_payload, res_payload);
- free (buffer);
+ xfree (buffer);
return 0;
#undef BLOCKSIZE
@@ -567,13 +571,13 @@ warc_write_digest_headers (FILE *file, long payload_offset)
digest = warc_base32_sha1_digest (sha1_res_block);
warc_write_header ("WARC-Block-Digest", digest);
- free (digest);
+ xfree (digest);
if (payload_offset >= 0)
{
digest = warc_base32_sha1_digest (sha1_res_payload);
warc_write_header ("WARC-Payload-Digest", digest);
- free (digest);
+ xfree (digest);
}
}
}
@@ -584,17 +588,19 @@ warc_write_digest_headers (FILE *file, long payload_offset)
The UTC time is formatted following ISO 8601, as required
for use in the WARC-Date header.
The timestamp will be 21 characters long. */
-void
-warc_timestamp (char *timestamp)
+char *
+warc_timestamp (char *timestamp, size_t timestamp_size)
{
- time_t rawtime;
- struct tm * timeinfo;
- time ( &rawtime );
- timeinfo = gmtime (&rawtime);
- strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", timeinfo);
+ time_t rawtime = time (NULL);
+ struct tm * timeinfo = gmtime (&rawtime);
+
+ if (strftime (timestamp, timestamp_size, "%Y-%m-%dT%H:%M:%SZ", timeinfo) == 0 && timestamp_size > 0)
+ *timestamp = 0;
+
+ return timestamp;
}
-#ifdef HAVE_LIBUUID
+#if HAVE_LIBUUID || HAVE_UUID_CREATE
/* Fills urn_str with a UUID in the format required
for the WARC-Record-Id header.
The string will be 47 characters long. */
@@ -604,8 +610,13 @@ warc_uuid_str (char *urn_str)
char uuid_str[37];
uuid_t record_id;
+#if HAVE_UUID_CREATE
+ uuid_create (&record_id, NULL);
+ uuid_to_string (&record_id, &uuid_str, NULL);
+#else
uuid_generate (record_id);
uuid_unparse (record_id, uuid_str);
+#endif
sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
}
@@ -621,19 +632,19 @@ warc_uuid_str (char *urn_str)
void
warc_uuid_str (char *urn_str)
{
- // RFC 4122, a version 4 UUID with only random numbers
+ /* RFC 4122, a version 4 UUID with only random numbers */
unsigned char uuid_data[16];
int i;
for (i=0; i<16; i++)
uuid_data[i] = random_number (255);
- // Set the four most significant bits (bits 12 through 15) of the
- // time_hi_and_version field to the 4-bit version number
+ /* Set the four most significant bits (bits 12 through 15) of the
+ * time_hi_and_version field to the 4-bit version number */
uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
- // Set the two most significant bits (bits 6 and 7) of the
- // clock_seq_hi_and_reserved to zero and one, respectively.
+ /* Set the two most significant bits (bits 6 and 7) of the
+ * clock_seq_hi_and_reserved to zero and one, respectively. */
uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
sprintf (urn_str,
@@ -650,16 +661,18 @@ warc_uuid_str (char *urn_str)
static bool
warc_write_warcinfo_record (char *filename)
{
+ FILE *warc_tmp;
+ char timestamp[22];
+ char *filename_copy, *filename_basename;
+
/* Write warc-info record as the first record of the file. */
/* We add the record id of this info record to the other records in the
file. */
warc_current_warcinfo_uuid_str = (char *) malloc (48);
warc_uuid_str (warc_current_warcinfo_uuid_str);
- char timestamp[22];
- warc_timestamp (timestamp);
+ warc_timestamp (timestamp, sizeof(timestamp));
- char *filename_copy, *filename_basename;
filename_copy = strdup (filename);
filename_basename = strdup (basename (filename_copy));
@@ -671,11 +684,11 @@ warc_write_warcinfo_record (char *filename)
warc_write_header ("WARC-Filename", filename_basename);
/* Create content. */
- FILE *warc_tmp = warc_tempfile ();
+ warc_tmp = warc_tempfile ();
if (warc_tmp == NULL)
{
- free (filename_copy);
- free (filename_basename);
+ xfree (filename_copy);
+ xfree (filename_basename);
return false;
}
@@ -701,8 +714,8 @@ warc_write_warcinfo_record (char *filename)
if (! warc_write_ok)
logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
- free (filename_copy);
- free (filename_basename);
+ xfree (filename_copy);
+ xfree (filename_basename);
fclose (warc_tmp);
return warc_write_ok;
}
@@ -721,22 +734,6 @@ warc_write_warcinfo_record (char *filename)
static bool
warc_start_new_file (bool meta)
{
- if (opt.warc_filename == NULL)
- return false;
-
- if (warc_current_file != NULL)
- fclose (warc_current_file);
-
- free (warc_current_warcinfo_uuid_str);
- free (warc_current_filename);
-
- warc_current_file_number++;
-
- int base_filename_length = strlen (opt.warc_filename);
- /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */
- char *new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
- warc_current_filename = new_filename;
-
#ifdef __VMS
# define WARC_GZ "warc-gz"
#else /* def __VMS */
@@ -749,6 +746,25 @@ warc_start_new_file (bool meta)
const char *extension = "warc";
#endif
+ int base_filename_length;
+ char *new_filename;
+
+ if (opt.warc_filename == NULL)
+ return false;
+
+ if (warc_current_file != NULL)
+ fclose (warc_current_file);
+
+ xfree (warc_current_warcinfo_uuid_str);
+ xfree (warc_current_filename);
+
+ warc_current_file_number++;
+
+ base_filename_length = strlen (opt.warc_filename);
+ /* filename format: base + "-" + 5 digit serial number + ".warc.gz" */
+ new_filename = malloc (base_filename_length + 1 + 5 + 8 + 1);
+ warc_current_filename = new_filename;
+
/* If max size is enabled, we add a serial number to the file names. */
if (meta)
sprintf (new_filename, "%s-meta.%s", opt.warc_filename, extension);
@@ -820,12 +836,13 @@ static bool
warc_parse_cdx_header (char *lineptr, int *field_num_original_url,
int *field_num_checksum, int *field_num_record_id)
{
+ char *token;
+ char *save_ptr;
+
*field_num_original_url = -1;
*field_num_checksum = -1;
*field_num_record_id = -1;
- char *token;
- char *save_ptr;
token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
if (token != NULL && strcmp (token, "CDX") == 0)
@@ -866,13 +883,12 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
char *original_url = NULL;
char *checksum = NULL;
char *record_id = NULL;
-
char *token;
char *save_ptr;
- token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
+ int field_num = 0;
/* Read this line to get the fields we need. */
- int field_num = 0;
+ token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
while (token != NULL)
{
char **val;
@@ -901,7 +917,7 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
char * checksum_v;
base32_decode_alloc (checksum, strlen (checksum), &checksum_v,
&checksum_l);
- free (checksum);
+ xfree (checksum);
if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
{
@@ -912,20 +928,20 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
rec->uuid = record_id;
memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
- free (checksum_v);
+ xfree (checksum_v);
}
else
{
- free (original_url);
- free (checksum_v);
- free (record_id);
+ xfree (original_url);
+ xfree (checksum_v);
+ xfree (record_id);
}
}
else
{
- xfree_null(checksum);
- xfree_null(original_url);
- xfree_null(record_id);
+ xfree(checksum);
+ xfree(original_url);
+ xfree(record_id);
}
}
@@ -934,17 +950,17 @@ warc_process_cdx_line (char *lineptr, int field_num_original_url,
static bool
warc_load_cdx_dedup_file (void)
{
- FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
- if (f == NULL)
- return false;
-
+ FILE *f;
+ char *lineptr = NULL;
+ size_t n = 0;
+ ssize_t line_length;
int field_num_original_url = -1;
int field_num_checksum = -1;
int field_num_record_id = -1;
- char *lineptr = NULL;
- size_t n = 0;
- ssize_t line_length;
+ f = fopen (opt.warc_cdx_dedup_filename, "r");
+ if (f == NULL)
+ return false;
/* The first line should contain the CDX header.
Format: " CDX x x x x x"
@@ -973,6 +989,8 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
}
else
{
+ int nrecords;
+
/* Initialize the table. */
warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest,
warc_cmp_sha1_digest);
@@ -990,14 +1008,14 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
while (line_length != -1);
/* Print results. */
- int nrecords = hash_table_count (warc_cdx_dedup_table);
+ nrecords = hash_table_count (warc_cdx_dedup_table);
logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
"Loaded %d records from CDX.\n\n",
nrecords),
nrecords);
}
- free (lineptr);
+ xfree (lineptr);
fclose (f);
return true;
@@ -1010,11 +1028,12 @@ _("CDX file does not list record ids. (Missing column 'u'.)\n"));
static struct warc_cdx_record *
warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
{
+ struct warc_cdx_record *rec_existing;
+
if (warc_cdx_dedup_table == NULL)
return NULL;
- struct warc_cdx_record *rec_existing
- = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
+ rec_existing = hash_table_get (warc_cdx_dedup_table, sha1_digest_payload);
if (rec_existing && strcmp (rec_existing->url, url) == 0)
return rec_existing;
@@ -1038,7 +1057,7 @@ warc_init (void)
logprintf (LOG_NOTQUIET,
_("Could not read CDX file %s for deduplication.\n"),
quote (opt.warc_cdx_dedup_filename));
- exit(1);
+ exit (WGET_EXIT_GENERIC_ERROR);
}
}
@@ -1047,7 +1066,7 @@ warc_init (void)
{
logprintf (LOG_NOTQUIET,
_("Could not open temporary WARC manifest file.\n"));
- exit(1);
+ exit (WGET_EXIT_GENERIC_ERROR);
}
if (opt.warc_keep_log)
@@ -1057,7 +1076,7 @@ warc_init (void)
{
logprintf (LOG_NOTQUIET,
_("Could not open temporary WARC log file.\n"));
- exit(1);
+ exit (WGET_EXIT_GENERIC_ERROR);
}
log_set_warc_log_fp (warc_log_fp);
}
@@ -1066,7 +1085,7 @@ warc_init (void)
if (! warc_start_new_file (false))
{
logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
- exit(1);
+ exit (WGET_EXIT_GENERIC_ERROR);
}
if (opt.warc_cdx_enabled)
@@ -1075,7 +1094,7 @@ warc_init (void)
{
logprintf (LOG_NOTQUIET,
_("Could not open CDX file for output.\n"));
- exit(1);
+ exit (WGET_EXIT_GENERIC_ERROR);
}
}
}
@@ -1085,11 +1104,13 @@ warc_init (void)
static void
warc_write_metadata (void)
{
+ char manifest_uuid[48];
+ FILE *warc_tmp_fp;
+
/* If there are multiple WARC files, the metadata should be written to a separate file. */
if (opt.warc_maxsize > 0)
warc_start_new_file (true);
- char manifest_uuid [48];
warc_uuid_str (manifest_uuid);
fflush (warc_manifest_fp);
@@ -1099,11 +1120,11 @@ warc_write_metadata (void)
warc_manifest_fp, -1);
/* warc_write_resource_record has closed warc_manifest_fp. */
- FILE * warc_tmp_fp = warc_tempfile ();
+ warc_tmp_fp = warc_tempfile ();
if (warc_tmp_fp == NULL)
{
logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
- exit(1);
+ exit (WGET_EXIT_GENERIC_ERROR);
}
fflush (warc_tmp_fp);
fprintf (warc_tmp_fp, "%s\n", program_argstring);
@@ -1135,7 +1156,7 @@ warc_close (void)
if (warc_current_file != NULL)
{
warc_write_metadata ();
- free (warc_current_warcinfo_uuid_str);
+ xfree (warc_current_warcinfo_uuid_str);
fclose (warc_current_file);
}
if (warc_current_cdx_file != NULL)
@@ -1154,6 +1175,8 @@ FILE *
warc_tempfile (void)
{
char filename[100];
+ int fd;
+
if (path_search (filename, 100, opt.warc_tempdir, "wget", true) == -1)
return NULL;
@@ -1172,13 +1195,16 @@ warc_tempfile (void)
return fopen (tfn, "w+", "fop=tmd"); /* Create auto-delete temp file. */
}
#else /* def __VMS */
- int fd = mkostemp (filename, O_TEMPORARY);
+ fd = mkostemp (filename, O_TEMPORARY);
if (fd < 0)
return NULL;
#if !O_TEMPORARY
if (unlink (filename) < 0)
- return NULL;
+ {
+ close(fd);
+ return NULL;
+ }
#endif
return fdopen (fd, "wb+");
@@ -1231,11 +1257,14 @@ static bool
warc_write_cdx_record (const char *url, const char *timestamp_str,
const char *mime_type, int response_code,
const char *payload_digest, const char *redirect_location,
- off_t offset, const char *warc_filename,
+ off_t offset, const char *warc_filename _GL_UNUSED,
const char *response_uuid)
{
/* Transform the timestamp. */
- char timestamp_str_cdx [15];
+ char timestamp_str_cdx[15];
+ char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
+ const char *checksum;
+
memcpy (timestamp_str_cdx , timestamp_str , 4); /* "YYYY" "-" */
memcpy (timestamp_str_cdx + 4, timestamp_str + 5, 2); /* "mm" "-" */
memcpy (timestamp_str_cdx + 6, timestamp_str + 8, 2); /* "dd" "T" */
@@ -1245,7 +1274,6 @@ warc_write_cdx_record (const char *url, const char *timestamp_str,
timestamp_str_cdx[14] = '\0';
/* Rewrite the checksum. */
- const char *checksum;
if (payload_digest != NULL)
checksum = payload_digest + 5; /* Skip the "sha1:" */
else
@@ -1256,7 +1284,6 @@ warc_write_cdx_record (const char *url, const char *timestamp_str,
if (redirect_location == NULL || strlen(redirect_location) == 0)
redirect_location = "-";
- char offset_string[MAX_INT_TO_STRING_LEN(off_t)];
number_to_string (offset_string, offset);
/* Print the CDX line. */
@@ -1288,10 +1315,11 @@ warc_write_revisit_record (char *url, char *timestamp_str,
char *refers_to, ip_address *ip, FILE *body)
{
char revisit_uuid [48];
- warc_uuid_str (revisit_uuid);
-
char *block_digest = NULL;
char sha1_res_block[SHA1_DIGEST_SIZE];
+
+ warc_uuid_str (revisit_uuid);
+
sha1_stream (body, sha1_res_block);
block_digest = warc_base32_sha1_digest (sha1_res_block);
@@ -1313,7 +1341,7 @@ warc_write_revisit_record (char *url, char *timestamp_str,
warc_write_end_record ();
fclose (body);
- free (block_digest);
+ xfree (block_digest);
return warc_write_ok;
}
@@ -1341,6 +1369,8 @@ warc_write_response_record (char *url, char *timestamp_str,
char *payload_digest = NULL;
char sha1_res_block[SHA1_DIGEST_SIZE];
char sha1_res_payload[SHA1_DIGEST_SIZE];
+ char response_uuid [48];
+ off_t offset;
if (opt.warc_digests_enabled)
{
@@ -1373,7 +1403,7 @@ warc_write_response_record (char *url, char *timestamp_str,
result = warc_write_revisit_record (url, timestamp_str,
concurrent_to_uuid, payload_digest, rec_existing->uuid,
ip, body);
- free (payload_digest);
+ xfree (payload_digest);
return result;
}
@@ -1385,11 +1415,10 @@ warc_write_response_record (char *url, char *timestamp_str,
/* Not a revisit, just store the record. */
- char response_uuid [48];
warc_uuid_str (response_uuid);
fseeko (warc_current_file, 0L, SEEK_END);
- off_t offset = ftello (warc_current_file);
+ offset = ftello (warc_current_file);
warc_write_start_record ();
warc_write_header ("WARC-Type", "response");
@@ -1415,8 +1444,8 @@ warc_write_response_record (char *url, char *timestamp_str,
response_uuid);
}
- free (block_digest);
- free (payload_digest);
+ xfree (block_digest);
+ xfree (payload_digest);
return warc_write_ok;
}