#!/bin/bash
#
# Mirror the member listing of a touhouwiki.net category into numbered JSON
# files.
#
# Usage: <this script> "<category name>"
#
# Each result page of the MediaWiki `list=categorymembers` query is stored as
#   data/pageindex/<category>/<n>.json
# (spaces and slashes in the category name become underscores in the path).
# Pages already on disk are reused, except the newest cached page, which is
# always fetched again.
#
# Requires: docker (with an image tagged `curl-impersonate`), jq, python3.

# Abort on unset variables. `-e` is intentionally not enabled: the helpers
# report "nothing usable was fetched" through their exit status and the
# paging loop reacts to that explicitly.
set -u

readonly PAGE_PATH="data/pageindex/"

#######################################
# Perform a GET request against the wiki and print the response body.
# Arguments: $1 - path and query string, relative to https://en.touhouwiki.net/
# Outputs:   response body on stdout
# Returns:   exit status of the docker/curl invocation
#######################################
doRequest() {
  local status=0
  # No -i/-t: stdout is captured by the callers. A pseudo-TTY would require a
  # terminal on stdin (breaking non-interactive runs) and would rewrite line
  # endings and merge stderr into the captured body.
  docker run --rm curl-impersonate /build/out/curl_ff95 \
    --silent --compressed "https://en.touhouwiki.net/${1}" || status=$?
  # Fixed pause after every request (presumably to go easy on the server).
  sleep 1
  return "$status"
}

#######################################
# Percent-encode a string for use in a URL query (spaces become '+').
# Arguments: $1 - raw string
# Outputs:   encoded string on stdout
#######################################
doUrlEncode() {
  # The value is handed over through argv rather than spliced into the Python
  # source, so quotes and backslashes in it are data, never code.
  python3 -c 'import sys, urllib.parse; print(urllib.parse.quote_plus(sys.argv[1]))' "${1-}"
}

#######################################
# Request one page (up to 500 entries) of a category's members, ordered by
# ascending timestamp.
# Arguments: $1 - category name, without the "Category:" prefix
#            $2 - cmcontinue token from the previous page; empty for page one
# Outputs:   raw JSON response on stdout
#######################################
doCategoryRequest() {
  local category continuation query
  category=$(doUrlEncode "${1-}")
  continuation=$(doUrlEncode "${2-}")

  query="api.php?format=json&action=query&list=categorymembers&cmtitle=Category:${category}&cmtype=page&cmprop=ids|title|timestamp&cmsort=timestamp&cmlimit=500&cmdir=ascending"
  if [[ -n "$continuation" ]]; then
    query+="&cmcontinue=${continuation}"
  fi
  doRequest "$query"
}

#######################################
# Tell whether a response file lists at least one category member.
# Arguments: $1 - path to a JSON response file
# Outputs:   "yes" or "no" on stdout
#######################################
isValidQueryResult() {
  local count
  count=$(jq -r '.query.categorymembers | length' "$1") || count=0
  # Anything that is not a single plain number (empty file, parse error,
  # several documents) counts as "no members".
  if [[ "$count" =~ ^[0-9]+$ && "$count" -gt 0 ]]; then
    echo "yes"
  else
    echo "no"
  fi
}

#######################################
# Extract the continuation token of a response file.
# Arguments: $1 - path to a JSON response file
# Outputs:   the cmcontinue token, or nothing when the response has none
#######################################
getQueryResultCmContinue() {
  jq -r '.continue.cmcontinue // empty' "$1"
}

#######################################
# Fetch one listing page and store it, pretty-printed, under PAGE_PATH.
# Globals:   PAGE_PATH (read)
# Arguments: $1 - page number, used as the file name
#            $2 - category name (slashes become underscores in the path)
#            $3 - cmcontinue token; empty for the first page
# Outputs:   path of the written file on stdout
# Returns:   0 when a non-empty page was stored, 1 otherwise
#######################################
fetchCategoryRequestPage() {
  local page=$1 category=$2 continuation=${3-}
  local result_path="${PAGE_PATH}${category//[\/]/_}/${page}.json"
  local tmp_file status=1

  tmp_file=$(mktemp) || return 1
  doCategoryRequest "$category" "$continuation" > "$tmp_file"

  # The destination is only touched once the response is known to be usable,
  # so a failed request never clobbers a previously stored page.
  if [[ $(isValidQueryResult "$tmp_file") == "yes" ]] \
    && jq . "$tmp_file" > "$result_path"; then
    echo "$result_path"
    status=0
  fi

  rm -f -- "$tmp_file"
  return "$status"
}

#######################################
# Walk the category listing page by page until the API reports no further
# continuation token or a fetch yields nothing.
# Globals:   PAGE_PATH (read)
# Arguments: $1 - category name (spaces are replaced by underscores)
# Outputs:   one "Fetch page N" progress line per page on stdout
# Returns:   0 on completion, 2 on usage error, 1 on a missing tool or
#            when the output directory cannot be created
#######################################
main() {
  if (( $# < 1 )) || [[ -z "$1" ]]; then
    printf 'Usage: %s <category name>\n' "${0##*/}" >&2
    return 2
  fi

  local tool
  for tool in docker jq python3; do
    if ! command -v "$tool" > /dev/null 2>&1; then
      printf 'error: required tool not found: %s\n' "$tool" >&2
      return 1
    fi
  done

  local category="${1//[ ]/_}"
  local category_dir="${PAGE_PATH}${category//[\/]/_}"
  local page=1 continuation="" page_file result_path

  mkdir -p -- "$category_dir" || return 1

  while true; do
    echo "Fetch page $page"
    page_file="${category_dir}/${page}.json"

    # Fetch when the page is missing, and also when it is the newest page on
    # disk (no successor file yet): that page is requested again and its file
    # replaced. Older cached pages only supply their continuation token.
    if [[ ! -f "$page_file" || ! -f "${category_dir}/$((page + 1)).json" ]]; then
      result_path=$(fetchCategoryRequestPage "$page" "$category" "$continuation") || break
      [[ -n "$result_path" ]] || break
      continuation=$(getQueryResultCmContinue "$result_path")
    else
      continuation=$(getQueryResultCmContinue "$page_file")
    fi

    # No token means this was the final page.
    [[ -n "$continuation" ]] || break
    page=$((page + 1))
  done
}

main "$@"