Initial version

This commit is contained in:
DataHoarder 2022-02-18 14:38:09 +01:00
commit c3853e43f2
6 changed files with 208 additions and 0 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
/.mirror.tmp.wiki
/.listing.tmp.json

36
README.md Normal file
View file

@ -0,0 +1,36 @@
# touhouwiki-mirror
A small collection of scripts that mirror the raw wikitext of Doujin CD pages and templates from [Touhou Wiki](https://en.touhouwiki.net/) for offline querying.
Also includes data fetched under [data/](data/).
# Usage
* `$ ./all.sh` to do a full run updating where necessary
* `$ ./mirror.sh "<category name>" yes` to force re-fetching every page of a category, including pages that are already mirrored.
# License
```
Copyright (c) 2022 WeebDataHoarder
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```
See the [README under data/](data/README.md) for more information about the copyright and license of the mirrored Touhou Wiki content.

12
all.sh Executable file
View file

@ -0,0 +1,12 @@
#!/bin/bash
# Full run: (re)build the curl-impersonate image, then for every tracked
# category refresh its page index and mirror its pages, in that order.
docker build -t curl-impersonate https://github.com/WeebDataHoarder/curl-impersonate.git

for category in \
  "Title abbreviation templates" \
  "Infobox Templates" \
  "Arrangement CDs"
do
  ./listing.sh "${category}"
  ./mirror.sh "${category}"
done

8
data/README.md Normal file
View file

@ -0,0 +1,8 @@
# Source
Files and content under this directory have been obtained from the [Touhou Wiki](https://en.touhouwiki.net/)
# [Content License](https://en.touhouwiki.net/wiki/Touhou_Wiki:Copyrights#Content_licensing)
Text content of Touhou Wiki (under this directory, pages/templates/index) is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International license, a.k.a. CC-BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
# [Copyright status](https://en.touhouwiki.net/wiki/Touhou_Wiki:Copyrights#Copyright_status.2FTerms_of_Use_of_the_Touhou_Project)
See [Touhou Wiki:Copyrights](https://en.touhouwiki.net/wiki/Touhou_Wiki:Copyrights#Copyright_status.2FTerms_of_Use_of_the_Touhou_Project) for more information.

84
listing.sh Executable file
View file

@ -0,0 +1,84 @@
#!/bin/bash
# Builds or refreshes the paginated member index of one wiki category.
# Usage: ./listing.sh "<category name>"
# Every API result page is stored as ${PAGE_PATH}<category>/<n>.json, where
# <category> is the argument with spaces and slashes replaced by "_".

# Root directory of the page index. Paths are built by plain concatenation,
# so the trailing slash is required.
PAGE_PATH="data/pageindex/"
# Performs a GET request against en.touhouwiki.net through the
# curl-impersonate container.
# Arguments: $1 - path and query string relative to the site root
# Outputs:   the response body on stdout
# Returns:   the exit status of the trailing sleep
doRequest () {
  # Deliberately no -i/-t: the output is redirected into a file by the
  # callers. Allocating a TTY makes docker fail when no terminal is attached
  # and otherwise rewrites line endings to CR/LF, corrupting the payload.
  docker run --rm curl-impersonate /build/out/curl_ff95 --silent --compressed "https://en.touhouwiki.net/${1}"
  # Fixed 2-second pause after every request (presumably to go easy on the
  # wiki; confirm before changing).
  sleep 2
}
# URL-encodes a string for use inside a query string (spaces become "+").
# Arguments: $1 - raw string
# Outputs:   the encoded string followed by a newline
# Returns:   1 if no Python interpreter is available, otherwise the
#            interpreter's exit status
doUrlEncode () {
  local py
  if ! py=$(command -v python3 || command -v python); then
    echo "doUrlEncode: no python interpreter found" >&2
    return 1
  fi
  # The value is handed over as an argument instead of being pasted into the
  # program text, so quotes or backslashes in it can neither break the
  # snippet nor inject code. The snippet runs on Python 2 and Python 3.
  "${py}" -c '
import os, sys
try:
    from urllib.parse import quote_plus
    value = os.fsencode(sys.argv[1])
except ImportError:
    from urllib import quote_plus
    value = sys.argv[1]
print(quote_plus(value))
' "${1}"
}
# Requests one page (up to 500 entries, oldest first) of the members of a
# category from the MediaWiki API.
# Arguments: $1 - category name without the "Category:" prefix
#            $2 - cmcontinue token of the previous page; empty for the first
# Outputs:   the raw API response on stdout (via doRequest)
doCategoryRequest () {
  # Kept local so the encoded values do not leak into the caller's scope.
  local category continuation query
  category=$(doUrlEncode "$1")
  continuation=$(doUrlEncode "$2")
  query="api.php?format=json&action=query&list=categorymembers&cmtitle=Category:${category}&cmtype=page&cmprop=ids|title|timestamp&cmsort=timestamp&cmlimit=500&cmdir=ascending"
  # The continuation parameter is only appended when there is a token.
  if [[ -n "${continuation}" ]]; then
    query+="&cmcontinue=${continuation}"
  fi
  doRequest "${query}"
}
# Reports whether an API response file lists at least one category member.
# Arguments: $1 - path of the JSON response file
# Outputs:   "yes" when .query.categorymembers has entries, otherwise "no"
isValidQueryResult () {
  local member_count
  member_count=$(jq -r '.query.categorymembers | length' "$1")
  if [[ ${member_count} -gt 0 ]]; then
    echo "yes"
  else
    echo "no"
  fi
}
# Prints the continuation token stored in an API response file.
# Arguments: $1 - path of the JSON response file
# Outputs:   the value of .continue.cmcontinue as rendered by jq
getQueryResultCmContinue () {
  local index_file="$1"
  jq --raw-output '.continue.cmcontinue' "${index_file}"
}
# Downloads one index page of a category and stores it when it is usable.
# Globals:   PAGE_PATH (read), RESULT_PATH (written)
# Arguments: $1 - page number, used as the file name
#            $2 - category (directory name and API category)
#            $3 - cmcontinue token, empty for the first page
# Outputs:   the path of the stored file on success, nothing otherwise
# Returns:   0 when the response was stored, 1 when it was rejected
fetchCategoryRequestPage () {
  local page_number="$1"
  local category="$2"
  local continuation="$3"
  RESULT_PATH="${PAGE_PATH}${category}/${page_number}.json"

  # The response is staged in a scratch file and only moved into the index
  # once it has been checked.
  doCategoryRequest "${category}" "${continuation}" > .listing.tmp.json
  if [[ $(isValidQueryResult .listing.tmp.json) != "yes" ]]; then
    return 1
  fi

  mv .listing.tmp.json "${RESULT_PATH}"
  echo "${RESULT_PATH}"
  return 0
}
# Category as used on disk and in the request: spaces and slashes become "_".
CATEGORY="${1//[ \/]/_}"
PAGE=1
CMCONTINUE=""
mkdir -p "${PAGE_PATH}${CATEGORY}"

# Walk the index page by page. A page is downloaded when it is missing or
# when it is the last one stored (no file for the following page); pages
# that already have a successor are only read for their continuation token.
while true; do
  NEXT_PAGE=$((PAGE+1))
  echo "Fetch page $PAGE"
  if [[ ! -f "${PAGE_PATH}${CATEGORY}/${PAGE}.json" || ! -f "${PAGE_PATH}${CATEGORY}/${NEXT_PAGE}.json" ]]; then
    RESULT_PATH=$(fetchCategoryRequestPage "${PAGE}" "${CATEGORY}" "${CMCONTINUE}")
    # Nothing stored means the response was rejected: stop here.
    if [[ -z "$RESULT_PATH" ]]; then
      break
    fi
    CMCONTINUE=$(getQueryResultCmContinue "${RESULT_PATH}")
  else
    CMCONTINUE=$(getQueryResultCmContinue "${PAGE_PATH}${CATEGORY}/${PAGE}.json")
  fi

  # No continuation token (jq prints "null" for a missing key): last page.
  case "$CMCONTINUE" in
    ""|"null") break ;;
  esac
  PAGE=${NEXT_PAGE}
done

66
mirror.sh Executable file
View file

@ -0,0 +1,66 @@
#!/bin/bash
# Mirrors the raw text of every page listed in the stored index of a category.
# Usage: ./mirror.sh "<category name>" [force flag]

# Directory holding the per-category index files (<category>/<n>.json).
PAGEINDEX_PATH="data/pageindex/"
# Directory receiving one <pageid>.wiki file per mirrored page.
PAGE_PATH="data/pages/"
# Directory receiving name-based copies of pages titled "Template:...".
TEMPLATE_PATH="data/templates/"
# Optional second argument; fetchPage re-downloads pages that are already
# mirrored when it is "yes".
FORCE_FETCH="$2"
# Category as used on disk: spaces and slashes become "_".
CATEGORY="${1//[ \/]/_}"
# Performs a GET request against en.touhouwiki.net through the
# curl-impersonate container.
# Arguments: $1 - path and query string relative to the site root
# Outputs:   the response body on stdout
doRequest () {
  local target="https://en.touhouwiki.net/${1}"
  local -a curl_cmd=(/build/out/curl_ff95 --silent --compressed "${target}")
  docker run --rm curl-impersonate "${curl_cmd[@]}"
  # Fixed pause after every request.
  sleep 2
}
# URL-encodes a string for use inside a query string (spaces become "+").
# Arguments: $1 - raw string
# Outputs:   the encoded string followed by a newline
# Returns:   1 if no Python interpreter is available, otherwise the
#            interpreter's exit status
doUrlEncode () {
  local py
  if ! py=$(command -v python3 || command -v python); then
    echo "doUrlEncode: no python interpreter found" >&2
    return 1
  fi
  # The value is handed over as an argument instead of being pasted into the
  # program text, so quotes or backslashes in it can neither break the
  # snippet nor inject code. The snippet runs on Python 2 and Python 3.
  "${py}" -c '
import os, sys
try:
    from urllib.parse import quote_plus
    value = os.fsencode(sys.argv[1])
except ImportError:
    from urllib import quote_plus
    value = sys.argv[1]
print(quote_plus(value))
' "${1}"
}
# Requests the raw text of a single page, addressed by its page id.
# Arguments: $1 - page id
# Outputs:   the response body on stdout (via doRequest)
doPageRequest () {
  local page_id="$1"
  doRequest "index.php?curid=${page_id}&action=raw"
}
# Reports whether a downloaded file looks like raw page text.
# Arguments: $1 - path of the downloaded file
# Outputs:   "yes" when the file is non-empty and contains no "<html",
#            otherwise "no"
isValidPageResult () {
  # -s is true only for an existing, non-empty file, so failed or empty
  # downloads are rejected. A "<html" marker is taken to mean that an HTML
  # page came back instead of raw page text.
  if [[ -s "$1" ]] && ! grep -q -- '<html' "$1"; then
    echo "yes"
    return
  fi
  echo "no"
  return
}
# Makes sure the raw text of a page is mirrored locally.
# Globals:   PAGE_PATH (read), FORCE_FETCH (read)
# Arguments: $1 - page id, also used as the file name
# Outputs:   the path of the mirrored file on success, nothing otherwise
# Returns:   0 when the file is available, 1 when the download was rejected
fetchPage () {
  local result_path="${PAGE_PATH}${1}.wiki"
  # An already mirrored page is reused unless a re-fetch was requested.
  # Both "yes" and "force" are accepted as the force flag.
  if [[ -f "${result_path}" && "$FORCE_FETCH" != "yes" && "$FORCE_FETCH" != "force" ]]; then
    echo "${result_path}"
    return 0
  fi
  # The download is staged in a scratch file and only moved into place once
  # it has been checked, so a bad response never replaces a good copy.
  doPageRequest "${1}" > .mirror.tmp.wiki
  if [[ $(isValidPageResult .mirror.tmp.wiki) == "yes" ]]; then
    mv .mirror.tmp.wiki "${result_path}"
    echo "${result_path}"
    return 0
  fi
  return 1
}
# Make sure both output directories exist before anything is written to them.
mkdir -p "${PAGE_PATH}" "${TEMPLATE_PATH}"

# Walk every stored index page of the category and mirror each listed page.
for f in "${PAGEINDEX_PATH}${CATEGORY}/"*.json; do
  # Without a match the glob stays literal; skip it instead of handing jq a
  # path that does not exist.
  [[ -e "$f" ]] || continue
  echo "working on page ${f}"
  jq -c -r '.query.categorymembers[]' "$f" | while IFS= read -r item; do
    pageid=$(jq -r '.pageid' <<< "$item")
    title=$(jq -r '.title' <<< "$item")
    echo "Fetch page ${pageid} title $title"
    # stdin is detached so nothing run by fetchPage can consume the
    # remaining entries of the member list feeding this loop.
    RESULT_PATH=$(fetchPage "${pageid}" "${title}" < /dev/null)
    # Templates additionally get a copy that is addressable by name.
    if [[ "${RESULT_PATH}" != "" && "${title}" == Template:* ]]; then
      name=${title#Template:}
      cp -v "${RESULT_PATH}" "${TEMPLATE_PATH}${name//[ \/]/_}.wiki"
    fi
  done
done