84 lines
2.1 KiB
Bash
Executable file
84 lines
2.1 KiB
Bash
Executable file
#!/bin/bash
|
|
|
|
# Directory scanned for the *.json index files processed by the main loop.
PAGEINDEX_PATH="data/pageindex/Files/"
# Directory the downloaded files are written to.
PAGE_PATH="data/files/"
# Directory that receives one symlink per file title.
PAGE_NAME_PATH="data/pages_by_name/"
# Optional first argument; the literal value "yes" makes fetchFile download
# again even when the target file already exists.
FORCE_FETCH="${1:-}"
|
|
|
|
# Clear out a container left over from an earlier run. Best effort: when no
# such container exists both commands just report an error and we carry on.
docker stop curl-impersonate_files
docker rm curl-impersonate_files

# Start the detached helper container that doFullRequest execs into. Without
# it every request fails, so stop here rather than processing the whole index
# against a container that is not there.
if ! docker run -it -d --rm --name curl-impersonate_files curl-impersonate bash; then
  echo "could not start container curl-impersonate_files" >&2
  exit 1
fi
|
|
|
|
#######################################
# Request a URL with the curl_ff95 binary inside the running
# curl-impersonate_files container.
# Arguments: $1 - URL to request
# Outputs:   whatever curl prints on stdout (it runs with --silent and
#            --compressed, and gives up after --max-time 10 seconds)
# Returns:   exit status of `docker exec`
#######################################
doFullRequest () {
  docker exec curl-impersonate_files /build/out/curl_ff95 \
    --max-time 10 --silent --compressed "${1}"
}
|
|
|
|
#######################################
# URL-encode a string with Python's urllib.parse.quote_plus.
# Arguments: $1 - string to encode
# Outputs:   encoded string on stdout
# Returns:   exit status of python3
#######################################
doUrlEncode () {
  # Hand the value over as an argument instead of pasting it into the Python
  # source: quotes and backslashes in it then cannot break or alter the
  # program text.
  python3 -c 'import sys, urllib.parse; print(urllib.parse.quote_plus(sys.argv[1]))' "${1}"
}
|
|
|
|
#######################################
# Decide whether a downloaded response looks like real file content.
# A response containing the text "<html" is rejected, and so is an empty or
# missing file, which is what a failed request leaves behind.
# Arguments: $1 - path of the file holding the response
# Outputs:   "yes" or "no" on stdout
#######################################
isValidPageResult () {
  local response_file="$1"

  if [[ ! -s "${response_file}" ]]; then
    echo "no"
    return
  fi

  grep -q -- '<html' "${response_file}"
  # grep exits 1 only for "no match"; a match (0) or a read error (2) must
  # not be reported as a usable download.
  case $? in
    1) echo "yes" ;;
    *) echo "no" ;;
  esac
}
|
|
|
|
#######################################
# Download a URL into PAGE_PATH unless the file is already there.
# Globals:   PAGE_PATH (read), FORCE_FETCH (read)
# Arguments: $1 - file name to store the download under, relative to PAGE_PATH
#            $2 - URL to download
# Outputs:   path of the stored file on stdout, nothing on failure
# Returns:   0 when the file exists or was stored, 1 otherwise
#######################################
fetchFile () {
  local target="${PAGE_PATH}${1}"
  local url="${2}"
  # Download next to the target so the final mv stays on one filesystem and
  # a failed attempt never overwrites an existing file.
  local partial="${target}.part"

  # Reuse an existing download unless a re-fetch was requested.
  if [[ -f "${target}" && "${FORCE_FETCH:-}" != "yes" ]]; then
    echo "${target}"
    return 0
  fi

  if doFullRequest "${url}" > "${partial}" \
    && [[ $(isValidPageResult "${partial}") == "yes" ]] \
    && mv -- "${partial}" "${target}"; then
    echo "${target}"
    return 0
  fi

  # Request failed or the response was rejected: leave no partial file behind.
  rm -f -- "${partial}"
  return 1
}
|
|
|
|
# Walk every index file, download each listed entry whose title starts with
# "File:" and point a symlink named after the title at the download.
for f in "${PAGEINDEX_PATH}"*.json; do
  # When nothing matches, the pattern is handed over literally; skip it
  # instead of asking jq to open a file called "*.json".
  [[ -e "${f}" ]] || continue

  echo "working on page ${f}"
  jq -c -r '.query.allimages[]' "$f" | while IFS= read -r item; do
    pageid=$(jq -r '.descriptionshorturl' <<< "$item")
    name=$(jq -r '.name' <<< "$item")
    title=$(jq -r '.title' <<< "$item")
    url=$(jq -r '.url' <<< "$item")

    [[ "${title}" == File:* ]] || continue

    ext="${name##*.}"
    # Keep only what follows the last "=" of the short description URL.
    pageid="${pageid##*=}"

    # An id of "0" cannot name the file, so fall back to the original name;
    # otherwise store it as <id>.<extension>.
    if [[ "${pageid}" == "0" ]]; then
      echo "Fetch file name $name title $title extension $ext url $url"
      file_name="${name}"
    else
      echo "Fetch file ${pageid} title $title extension $ext url $url"
      file_name="${pageid}.${ext}"
    fi

    # stdin is redirected so nothing run by fetchFile can consume the item
    # stream this loop is reading from.
    RESULT_PATH=$(fetchFile "${file_name}" "${url}" < /dev/null)

    if [[ -n "${RESULT_PATH}" ]]; then
      # Spaces and slashes in the title become underscores in the link name.
      linkname="${PAGE_NAME_PATH}${title//[ \/]/_}"
      # -f replaces an existing link in one step; -n keeps ln from descending
      # into an existing link that points at a directory.
      ln -sfn -- "../files/${file_name}" "${linkname}"
    fi
  done
done
|
|
|
|
# Shut the helper container down again. It is started with --rm, so the
# explicit rm only matters if the container is still around after the stop.
docker stop curl-impersonate_files; docker rm curl-impersonate_files