touhouwiki-mirror/listing.sh

87 lines
2.3 KiB
Bash
Raw Permalink Normal View History

2022-02-18 13:38:09 +00:00
#!/bin/bash
# Root directory of the cached category listings. The trailing slash matters:
# result paths are built by plain string concatenation onto this value.
PAGE_PATH='data/pageindex/'
#######################################
# Fetch a URL below https://en.touhouwiki.net/ through the curl-impersonate
# container and pause one second afterwards to throttle requests.
# Arguments: $1 - path and query string relative to the site root
# Outputs:   the response body on stdout
# Returns:   the exit status of the docker/curl invocation
#######################################
doRequest () {
  local status=0
  # No -i/-t here: the response is captured by the caller rather than shown
  # interactively, and requesting a pseudo-TTY makes docker fail when stdin
  # is not a terminal (cron, pipelines).
  docker run --rm curl-impersonate /build/out/curl_ff95 --silent --compressed \
    "https://en.touhouwiki.net/${1}" || status=$?
  # Throttle even after a failed request.
  sleep 1
  return "$status"
}
#######################################
# Percent-encode a string for use in a URL query (spaces become '+').
# Arguments: $1 - text to encode (may be empty or omitted)
# Outputs:   the encoded text followed by a newline on stdout
# Returns:   the exit status of python3
#######################################
doUrlEncode () {
  # The value is handed over as an argument instead of being pasted into the
  # Python source, so quotes and backslashes in it are treated as data.
  python3 -c 'import sys, urllib.parse; print(urllib.parse.quote_plus(sys.argv[1]))' "${1-}"
}
#######################################
# Request one batch (up to 500 pages, oldest first) of a category's members
# from the MediaWiki API.
# Arguments: $1 - category name without the "Category:" prefix
#            $2 - cmcontinue token of the previous batch; empty for the first
# Outputs:   whatever doRequest prints (the API response) on stdout
# Returns:   the exit status of doRequest
#######################################
doCategoryRequest () {
  local category continuation="" query
  category=$(doUrlEncode "$1")
  query="api.php?format=json&action=query&list=categorymembers&cmtitle=Category:${category}&cmtype=page&cmprop=ids|title|timestamp&cmsort=timestamp&cmlimit=500&cmdir=ascending"
  # Only spend an encoder call when there is a token to encode.
  if [[ -n "${2-}" ]]; then
    continuation=$(doUrlEncode "$2")
  fi
  if [[ -n "$continuation" ]]; then
    query+="&cmcontinue=${continuation}"
  fi
  doRequest "$query"
}
#######################################
# Report whether a saved API response lists at least one category member.
# Arguments: $1 - path of the JSON response file
# Outputs:   "yes" when .query.categorymembers has a length above zero,
#            otherwise "no"
#######################################
isValidQueryResult () {
  local member_count verdict="no"
  member_count=$(jq -r '.query.categorymembers | length' "$1")
  if [[ "$member_count" -gt 0 ]]; then
    verdict="yes"
  fi
  echo "$verdict"
}
#######################################
# Print the continuation token stored in a saved API response.
# Arguments: $1 - path of the JSON response file
# Outputs:   the raw value of .continue.cmcontinue as printed by jq
#######################################
getQueryResultCmContinue () {
  local json_file=$1
  jq --raw-output '.continue.cmcontinue' "$json_file"
}
#######################################
# Download one batch of a category listing and store it, pretty-printed, as
# "<PAGE_PATH><category with '/' replaced by '_'>/<page>.json".
# The target directory must already exist.
# Globals:   PAGE_PATH (read)
# Arguments: $1 - page number used as the file name
#            $2 - category name
#            $3 - cmcontinue token for this batch (empty for the first one)
# Outputs:   the path of the stored file on stdout, on success only
# Returns:   0 when a non-empty listing was stored, 1 otherwise
#######################################
fetchCategoryRequestPage () {
  local page=$1 category=$2 continuation=${3-}
  local result_path="${PAGE_PATH}${category//[\/]/_}/${page}.json"
  local pretty_file="${result_path}.tmp"
  local raw_file
  # A private temp file avoids clashes between concurrent runs and is
  # removed on every exit path below.
  raw_file=$(mktemp) || return 1

  doCategoryRequest "$category" "$continuation" > "$raw_file"
  if [[ $(isValidQueryResult "$raw_file") != "yes" ]]; then
    rm -f -- "$raw_file"
    return 1
  fi

  # Pretty-print next to the target and move into place only on success, so
  # a failure never truncates a page that was cached by an earlier run.
  if jq . "$raw_file" > "$pretty_file" && mv -f -- "$pretty_file" "$result_path"; then
    rm -f -- "$raw_file"
    echo "$result_path"
    return 0
  fi
  rm -f -- "$raw_file" "$pretty_file"
  return 1
}
2022-02-19 02:17:42 +00:00
# Entry point: mirror the member listing of one category, page by page, into
# "${PAGE_PATH}<category>/<n>.json".
# Usage: listing.sh <category name>
#
# The run is resumable: a page whose successor is already cached is read from
# disk, while the newest cached page (and everything after it) is fetched.
if [[ -z "${1-}" ]]; then
  echo "Usage: ${0##*/} <category name>" >&2
  exit 2
fi

# Spaces become underscores for the wiki title; slashes additionally become
# underscores for the directory name.
CATEGORY="${1//[ ]/_}"
PATHCATEGORY="${CATEGORY//[\/]/_}"
PAGE=1
CMCONTINUE=""
mkdir -p "${PAGE_PATH}${PATHCATEGORY}"

while true; do
  NEXT_PAGE=$((PAGE+1))
  echo "Fetch page $PAGE"
  if [[ ! -f "${PAGE_PATH}${PATHCATEGORY}/${PAGE}.json" \
     || ! -f "${PAGE_PATH}${PATHCATEGORY}/${NEXT_PAGE}.json" ]]; then
    # Page is missing, or it is the last cached one and may have gained
    # members since it was stored: fetch it.
    RESULT_PATH=$(fetchCategoryRequestPage "${PAGE}" "${CATEGORY}" "${CMCONTINUE}")
    if [[ "$RESULT_PATH" == "" ]]; then
      break
    fi
    CMCONTINUE=$(getQueryResultCmContinue "${RESULT_PATH}")
  else
    # Both this page and its successor are cached: only pick up the token.
    CMCONTINUE=$(getQueryResultCmContinue "${PAGE_PATH}${PATHCATEGORY}/${PAGE}.json")
  fi
  # No continuation token means this was the final page.
  if [[ "$CMCONTINUE" == "" || "$CMCONTINUE" == "null" ]]; then
    break
  fi
  PAGE=$NEXT_PAGE
done