touhouwiki-mirror/files.sh
2022-02-20 03:48:30 +01:00

80 lines
1.8 KiB
Bash
Executable file

#!/bin/bash
# Directory holding the category index JSON pages, one sub-directory per category.
PAGEINDEX_PATH='data/pageindex/'
# Directory where downloaded files are stored as <pageid>.<extension>.
PAGE_PATH='data/files/'
# Directory receiving the by-title symlinks to the stored files.
PAGE_NAME_PATH='data/pages_by_name/'

# First argument: category name. Spaces and slashes are replaced with
# underscores so the value can be used as a single directory name.
CATEGORY="${1}"
CATEGORY="${CATEGORY// /_}"
CATEGORY="${CATEGORY//\//_}"

# Second argument: the literal "yes" forces files to be downloaded again
# even when they are already present.
FORCE_FETCH="${2}"
# Perform a request against the wiki and throttle afterwards.
# Arguments:
#   $1 - path and query string, appended to https://en.touhouwiki.net/
# Outputs:
#   Whatever doFullRequest writes to stdout.
# Returns:
#   The exit status of doFullRequest. It is captured before the sleep so
#   that a failed request is not reported as success.
doRequest () {
  local status=0
  doFullRequest "https://en.touhouwiki.net/${1}" || status=$?
  # Always pause one second between wiki requests, even after a failure.
  sleep 1
  return "${status}"
}
# Fetch an absolute URL with the curl-impersonate container image and
# write the response body to stdout.
# Arguments:
#   $1 - full URL to request
# Returns:
#   The exit status of the docker invocation.
doFullRequest () {
  local target_url="${1}"
  local -a fetch_cmd=(
    docker run --rm curl-impersonate
    /build/out/curl_ff95
    --max-time 10
    --silent
    --compressed
  )
  "${fetch_cmd[@]}" "${target_url}"
}
# URL-encode a string in application/x-www-form-urlencoded style
# (spaces become "+", reserved characters become %XX).
# Arguments:
#   $1 - string to encode
# Outputs:
#   The encoded string on stdout.
# Returns:
#   127 when no Python interpreter is available, otherwise the
#   interpreter's exit status.
doUrlEncode () {
  local interpreter
  interpreter=$(command -v python3 || command -v python) || return 127
  # The value is handed over as an argument instead of being spliced into
  # the Python source, so quotes and backslashes in it cannot break or
  # alter the program. The import fallback keeps Python 2 working.
  "${interpreter}" -c '
import os
import sys
try:
    from urllib.parse import quote_plus
except ImportError:
    from urllib import quote_plus
raw = sys.argv[1]
if hasattr(os, "fsencode"):
    # Recover the exact bytes passed on the command line (Python 3).
    raw = os.fsencode(raw)
print(quote_plus(raw))
' "${1}"
}
# Decide whether a downloaded file looks like real file content rather
# than an HTML page or a failed download.
# Arguments:
#   $1 - path of the downloaded file
# Outputs:
#   "yes" when the file is readable, non-empty and contains no "<html"
#   marker; "no" otherwise.
# Returns:
#   Always 0; the verdict is carried on stdout.
isValidPageResult () {
  local candidate="${1}"
  # A missing, unreadable or zero-byte file means the download failed;
  # without this check it would pass simply because grep finds nothing.
  if [[ ! -r "${candidate}" || ! -s "${candidate}" ]]; then
    echo "no"
    return 0
  fi
  if grep -q -- '<html' "${candidate}"; then
    echo "no"
    return 0
  fi
  echo "yes"
  return 0
}
# Look up the download URL of a file page through the wiki API.
# Arguments:
#   $1 - page id of the file page
# Outputs:
#   The first imageinfo URL of that page on stdout, or nothing when the
#   response carries no such URL.
fetchFileURL() {
  # The page id is passed with --arg rather than spliced into the filter.
  # "// empty" makes a missing URL produce no output instead of the
  # literal string "null" that jq -r would otherwise print.
  doRequest "api.php?action=query&prop=imageinfo&iiprop=url&format=json&pageids=${1}" \
    | jq -r --arg id "${1}" '.query.pages[$id].imageinfo[0].url // empty'
}
# Download the file behind a wiki file page into ${PAGE_PATH}.
# Globals:
#   PAGE_PATH   (read) directory prefix for stored files
#   FORCE_FETCH (read) "yes" forces a download even when the file exists
# Arguments:
#   $1 - page id; used for the URL lookup and as the stored base name
#   $2 - page title (accepted but not used by this function)
#   $3 - file extension appended to the stored name
# Outputs:
#   stdout: path of the stored file on success
#   stderr: the URL being downloaded
# Returns:
#   0 when the file is available at the printed path, 1 otherwise.
fetchFile () {
  local page_id="${1}"
  local extension="${3}"
  local result_path="${PAGE_PATH}${page_id}.${extension}"
  local uri

  # Serve an already stored file before touching the network: the URL
  # lookup is only needed when something is actually downloaded.
  if [[ -f "${result_path}" && "${FORCE_FETCH}" != "yes" ]]; then
    echo "${result_path}"
    return 0
  fi

  uri=$(fetchFileURL "${page_id}")
  # The lookup may yield the literal string "null" (jq's rendering of a
  # missing field); treat it like an empty result.
  if [[ -z "${uri}" || "${uri}" == "null" ]]; then
    return 1
  fi

  echo "${uri}" >&2
  # A failed or timed-out transfer can leave partial data behind; never
  # promote it to the final location.
  if ! doFullRequest "${uri}" > .files.tmp; then
    return 1
  fi
  if [[ $(isValidPageResult .files.tmp) != "yes" ]]; then
    return 1
  fi
  mv -- .files.tmp "${result_path}" || return 1
  echo "${result_path}"
  return 0
}
# Walk every index page of the selected category, download each member
# whose title starts with "File:", and link it under its sanitised title.
for f in "${PAGEINDEX_PATH}${CATEGORY}/"*.json; do
  # When the glob matches nothing it is left as the literal pattern;
  # skip it rather than handing jq a path that does not exist.
  [[ -e "${f}" ]] || continue
  echo "working on page ${f}"
  # One compact JSON object per category member, one per line.
  jq -c -r '.query.categorymembers[]' "$f" | while IFS= read -r item; do
    pageid=$(jq -r '.pageid' <<< "$item")
    title=$(jq -r '.title' <<< "$item")
    if [[ "${title}" != File:* ]]; then
      continue
    fi
    # Everything after the last dot of the title is used as the extension.
    ext="${title##*.}"
    echo "Fetch file ${pageid} title $title extension $ext"
    RESULT_PATH=$(fetchFile "${pageid}" "${title}" "${ext}")
    if [[ -n "${RESULT_PATH}" ]]; then
      # Link name: the title with spaces and slashes turned into "_".
      linkname="${PAGE_NAME_PATH}${title//[ \/]/_}"
      # Drop any previous link so ln does not fail on an existing name.
      rm -f -- "${linkname}"
      ln -s -- "../files/${pageid}.${ext}" "${linkname}"
    fi
  done
done