summary | refs | log | tree | commit | diff
path: root/tsp
diff options
context:
space:
mode:
Diffstat (limited to 'tsp')
-rwxr-xr-x tsp/jobs/trigger_downloads.sh 31
1 files changed, 14 insertions, 17 deletions
diff --git a/tsp/jobs/trigger_downloads.sh b/tsp/jobs/trigger_downloads.sh
index d765e4a..6e7824a 100755
--- a/tsp/jobs/trigger_downloads.sh
+++ b/tsp/jobs/trigger_downloads.sh
@@ -29,28 +29,25 @@ test -n "${TIMESTAMP}" || die "Missing argument: timestamp"
mkdir -p "${WS_WATCHER}"
cd "${WS_WATCHER}"
-touch dispatched_urls
-touch "${TIMESTAMP}"
-
-TREE="${TSP_DIR}/tree"
-test -f "${TREE}" || TREE="${TSP_DIR}/tree.example"
-
-# Crawl given url
-case "$URL" in
- *unified*)
- timeout 600 "${TSP_DIR}/scripts/crawler.py" --timestamp "${TIMESTAMP}" --tree "${TSP_DIR}/tree_unified" --log DEBUG "${URL}" || die "Crawl failed" "$?"
- ;;
- *)
- timeout 600 "${TSP_DIR}/scripts/crawler.py" --timestamp "${TIMESTAMP}" --tree "${TREE}" --log DEBUG "${URL}" || die "Crawl failed" "$?"
- ;;
-esac
-
# Read next buid_nr
touch next_dwn
next=$(cat next_dwn)
test -n "$next" || next=1
initial="$next"
+# Crawl given url
+DB="timestamp.db3"
+TEMP_DB="$DB.$$"
+cp "$DB" "$TEMP_DB" # Save database in the current state
+test -d json || mkdir json
+test -d log || mkdir log
+timeout 1200 gorawler -res "json/$TIMESTAMP.$next.$$.json" -db "$TEMP_DB" -log "log/$TIMESTAMP.$next.$$.log" \
+ -url "$URL" > modified_urls || {
+ rm -v "$TEMP_DB"
+ die "Crawl failed!"
+}
+mv "$TEMP_DB" "$DB" # Update database if it succeeded
+
# Allocate 300 numbers in case this script terminates during dispatch
echo $((next+300)) > "${WS_WATCHER}/next_dwn" || die "Write failed: ${WS_WATCHER}/next_dwn"
# Append to unique_dispatch in case previous job failed
@@ -58,7 +55,7 @@ if [ -f manual_urls ];
then
sort -u modified_urls manual_urls >> unique_dispatch
else
- cat modified_urls >> unique_dispatch
+ sort -u modified_urls >> unique_dispatch
fi
# Choose image_map