diff options
Diffstat (limited to 'tsp')
-rwxr-xr-x | tsp/jobs/trigger_downloads.sh | 31 |
1 files changed, 14 insertions, 17 deletions
diff --git a/tsp/jobs/trigger_downloads.sh b/tsp/jobs/trigger_downloads.sh
index d765e4a..6e7824a 100755
--- a/tsp/jobs/trigger_downloads.sh
+++ b/tsp/jobs/trigger_downloads.sh
@@ -29,28 +29,25 @@ test -n "${TIMESTAMP}" || die "Missing argument: timestamp"
 mkdir -p "${WS_WATCHER}"
 cd "${WS_WATCHER}"
 
-touch dispatched_urls
-touch "${TIMESTAMP}"
-
-TREE="${TSP_DIR}/tree"
-test -f "${TREE}" || TREE="${TSP_DIR}/tree.example"
-
-# Crawl given url
-case "$URL" in
-    *unified*)
-        timeout 600 "${TSP_DIR}/scripts/crawler.py" --timestamp "${TIMESTAMP}" --tree "${TSP_DIR}/tree_unified" --log DEBUG "${URL}" || die "Crawl failed" "$?"
-        ;;
-    *)
-        timeout 600 "${TSP_DIR}/scripts/crawler.py" --timestamp "${TIMESTAMP}" --tree "${TREE}" --log DEBUG "${URL}" || die "Crawl failed" "$?"
-        ;;
-esac
-
 # Read next buid_nr
 touch next_dwn
 next=$(cat next_dwn)
 test -n "$next" || next=1
 initial="$next"
 
+# Crawl given url
+DB="timestamp.db3"
+TEMP_DB="$DB.$$"
+cp "$DB" "$TEMP_DB" # Save database in the current state
+test -d json || mkdir json
+test -d log || mkdir log
+timeout 1200 gorawler -res "json/$TIMESTAMP.$next.$$.json" -db "$TEMP_DB" -log "log/$TIMESTAMP.$next.$$.log" \
+        -url "$URL" > modified_urls || {
+    rm -v "$TEMP_DB"
+    die "Crawl failed!"
+}
+mv "$TEMP_DB" "$DB" # Update database if it succeeded
+
 # Allocate 300 numbers in case this script terminates during dispatch
 echo $((next+300)) > "${WS_WATCHER}/next_dwn" || die "Write failed: ${WS_WATCHER}/next_dwn"
 # Append to unique_dispatch in case previous job failed
@@ -58,7 +55,7 @@ if [ -f manual_urls ]; then
 
     sort -u modified_urls manual_urls >> unique_dispatch
 else
-    cat modified_urls >> unique_dispatch
+    sort -u modified_urls >> unique_dispatch
 fi
 
 # Choose image_map