# touhouwiki-mirror/mirror.sh
# (Bash script, 71 lines, 1.6 KiB in the repository viewer this copy was taken from.)
#!/bin/bash
# Directory layout, relative to the current working directory.
# Page-index JSON files are read from ${PAGEINDEX_PATH}<category>/*.json.
PAGEINDEX_PATH="data/pageindex/"
# Downloaded pages are stored here as <pageid>.wiki.
PAGE_PATH="data/pages/"
# Symlinks named after the page title are created here.
PAGE_NAME_PATH="data/pages_by_name/"
# $2: pass "yes" to re-download pages that are already stored.
FORCE_FETCH="${2:-}"
# $1: category name. Spaces and slashes are replaced by underscores so the
# value can be used as a single directory name.
CATEGORY="${1:-}"
CATEGORY="${CATEGORY//[ \/]/_}"
#######################################
# Fetch one URL below https://en.touhouwiki.net/ and write the response
# body to stdout. The request is made with the curl_ff95 binary inside the
# curl-impersonate docker image.
# Arguments: $1 - path and query string appended to the base URL
# Outputs:   response body on stdout
# Returns:   exit status of the docker/curl invocation
#######################################
doRequest () {
  local status=0
  docker run --rm curl-impersonate /build/out/curl_ff95 \
    --max-time 10 --silent --compressed \
    "https://en.touhouwiki.net/${1}" || status=$?
  # Pause after every request, successful or not, to throttle the request rate.
  sleep 1
  # Report the download status rather than the status of sleep.
  return "${status}"
}
#######################################
# URL-encode a string with Python's urllib.parse.quote_plus.
# Arguments: $1 - string to encode
# Outputs:   encoded string on stdout, followed by a newline
#######################################
doUrlEncode () {
  # The value is passed as an argument, not spliced into the Python source:
  # quotes or backslashes in it can then neither break the program nor be
  # interpreted as Python escape sequences or code.
  python3 -c 'import sys, urllib.parse; print(urllib.parse.quote_plus(sys.argv[1]))' "${1}"
}
#######################################
# Request the raw text of a wiki page identified by its page id.
# Arguments: $1 - page id, used as the curid query parameter
# Outputs:   whatever doRequest writes to stdout
# Returns:   exit status of doRequest
#######################################
doPageRequest () {
  local page_id="${1}"
  local query="index.php?curid=${page_id}&action=raw"
  doRequest "${query}"
}
#######################################
# Decide whether a downloaded file looks like page text rather than an
# empty response or an HTML page.
# Arguments: $1 - path of the file to inspect
# Outputs:   "yes" if the file has content and no line contains '<html',
#            otherwise "no"
# Returns:   always 0; callers compare the printed word
#######################################
isValidPageResult () {
  local file="${1}"
  # -f first, so a missing file yields a plain "no" without error output.
  # $(<file) drops trailing newlines, so a file holding only newlines counts
  # as empty.
  if [[ -f "${file}" && -n "$(<"${file}")" ]] \
      && ! grep -q -- '<html' "${file}"; then
    echo "yes"
  else
    echo "no"
  fi
  return 0
}
#######################################
# Ensure the page with the given id is stored as ${PAGE_PATH}<id>.wiki.
# An already stored page is reused unless FORCE_FETCH is "yes".
# Globals:   PAGE_PATH (read), FORCE_FETCH (read)
# Arguments: $1 - page id (any further arguments are ignored)
# Outputs:   path of the stored page on stdout, on success only
# Returns:   0 if the page is stored, 1 if the download was rejected or the
#            file could not be moved into place
#######################################
fetchPage () {
  local page_id="${1}"
  local result_path="${PAGE_PATH}${page_id}.wiki"
  # $$ makes the scratch file unique per script run, so concurrent runs in
  # the same directory do not overwrite each other's downloads.
  local tmp_path=".mirror.tmp.$$.wiki"

  if [[ -f "${result_path}" && "${FORCE_FETCH}" != "yes" ]]; then
    echo "${result_path}"
    return 0
  fi

  doPageRequest "${page_id}" > "${tmp_path}"
  # Report success only when the validated download really reached its
  # final location; a failed mv must not be reported as a stored page.
  if [[ "$(isValidPageResult "${tmp_path}")" == "yes" ]] \
      && mv -- "${tmp_path}" "${result_path}"; then
    echo "${result_path}"
    return 0
  fi

  # Do not leave a rejected download behind.
  rm -f -- "${tmp_path}"
  return 1
}
# Walk every page-index JSON file of the selected category, download each
# listed page and create a title-named symlink to it.
for f in "${PAGEINDEX_PATH}${CATEGORY}/"*.json; do
  # An unmatched glob stays as the literal pattern; skip it instead of
  # handing a non-existent file to jq.
  [[ -e "${f}" ]] || continue
  echo "working on page ${f}"
  # One compact JSON object per category member, one per line.
  jq -c -r '.query.categorymembers[]' "${f}" | while IFS= read -r item; do
    pageid=$(jq -r '.pageid' <<< "${item}")
    title=$(jq -r '.title' <<< "${item}")
    # Titles with these prefixes are not mirrored.
    case "${title}" in
      Category:*|User:*|File:*) continue ;;
    esac
    echo "Fetch page ${pageid} title $title"
    # fetchPage prints the stored path on success and nothing on failure.
    RESULT_PATH=$(fetchPage "${pageid}" "${title}")
    if [[ -n "${RESULT_PATH}" ]]; then
      # Spaces and slashes in the title would break the file name.
      linkname="${PAGE_NAME_PATH}${title//[ \/]/_}.wiki"
      # -f ignores a missing link but still reports real failures.
      rm -f -- "${linkname}"
      # Relative target: resolved from the directory that holds the link.
      ln -s -- "../pages/${pageid}.wiki" "${linkname}"
    fi
  done
done