Initial version
This commit is contained in:
commit
c3853e43f2
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
/.mirror.tmp.wiki
|
||||
/.listing.tmp.json
|
36
README.md
Normal file
36
README.md
Normal file
|
@ -0,0 +1,36 @@
|
|||
# touhouwiki-mirror
|
||||
|
||||
A small collection of scripts that mirror the raw wikitext of Doujin CD pages and templates from [Touhou Wiki](https://en.touhouwiki.net/) for offline querying.
|
||||
|
||||
The fetched data is also included under [data/](data/).
|
||||
|
||||
# Usage
|
||||
* `$ ./all.sh` to do a full run updating where necessary
|
||||
* `$ ./mirror.sh "<category name>" yes` to force re-fetching all pages of a category, including those already mirrored.
|
||||
|
||||
# License
|
||||
|
||||
|
||||
```
|
||||
Copyright (c) 2022 WeebDataHoarder
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
```
|
||||
|
||||
See the [README under data/](data/README.md) for more information about the copyright and license of the Touhou Wiki content.
|
12
all.sh
Executable file
12
all.sh
Executable file
|
@ -0,0 +1,12 @@
|
|||
#!/bin/bash

# Full mirror run: builds the curl-impersonate Docker image, then for every
# category refreshes its listing (listing.sh) and fetches its pages (mirror.sh).

# Work from the script's own directory so that ./listing.sh, ./mirror.sh and
# the relative data/ paths they use resolve regardless of the caller's cwd.
cd -- "$(dirname -- "${BASH_SOURCE[0]}")" || exit 1

docker build -t curl-impersonate https://github.com/WeebDataHoarder/curl-impersonate.git

# Categories to mirror, processed in this order.
CATEGORIES=(
  "Title abbreviation templates"
  "Infobox Templates"
  "Arrangement CDs"
)

for category in "${CATEGORIES[@]}"; do
  ./listing.sh "${category}"
  ./mirror.sh "${category}"
done
|
8
data/README.md
Normal file
8
data/README.md
Normal file
|
@ -0,0 +1,8 @@
|
|||
# Source
|
||||
Files and content under this directory have been obtained from the [Touhou Wiki](https://en.touhouwiki.net/)
|
||||
|
||||
# [Content License](https://en.touhouwiki.net/wiki/Touhou_Wiki:Copyrights#Content_licensing)
|
||||
Text content of Touhou Wiki (under this directory, pages/templates/index) is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International license, a.k.a. CC-BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
|
||||
|
||||
# [Copyright status](https://en.touhouwiki.net/wiki/Touhou_Wiki:Copyrights#Copyright_status.2FTerms_of_Use_of_the_Touhou_Project)
|
||||
See [Touhou Wiki:Copyrights](https://en.touhouwiki.net/wiki/Touhou_Wiki:Copyrights#Copyright_status.2FTerms_of_Use_of_the_Touhou_Project) for more information.
|
84
listing.sh
Executable file
84
listing.sh
Executable file
|
@ -0,0 +1,84 @@
|
|||
#!/bin/bash

# Base directory for category listings; each category gets a sub-directory
# holding one numbered JSON file per API result page.
PAGE_PATH="data/pageindex/"
|
||||
|
||||
# Fetch a URL below https://en.touhouwiki.net/ and write the response body to
# stdout, using the curl_ff95 binary from the curl-impersonate Docker image.
# Arguments: $1 - path and query string relative to the site root
# Returns:   the status of the trailing sleep, not of the request itself
doRequest () {
  # No -i/-t: the output is captured by callers, and allocating a TTY fails
  # when stdin is not a terminal and may add carriage returns to the output.
  docker run --rm curl-impersonate /build/out/curl_ff95 --silent --compressed "https://en.touhouwiki.net/${1}"
  # Fixed pause after every request (presumably to go easy on the wiki).
  sleep 2
}
|
||||
|
||||
# Percent-encode a string for use as a URL query value, following the
# conventions of Python's quote_plus: letters, digits and "-_.~" are kept,
# a space becomes "+", every other byte becomes %XX (uppercase hex).
# Implemented in pure Bash so the value is never interpolated into another
# interpreter's source code and no Python installation is required.
# Arguments: $1 - string to encode
# Outputs:   the encoded string followed by a newline
doUrlEncode () {
  # Byte-wise processing so multi-byte UTF-8 characters are escaped per byte.
  local LC_ALL=C
  local input="${1}" encoded="" char code hex i
  for (( i = 0; i < ${#input}; i++ )); do
    char="${input:i:1}"
    case "${char}" in
      [a-zA-Z0-9._~-])
        encoded+="${char}"
        ;;
      ' ')
        encoded+='+'
        ;;
      *)
        # "'X" makes printf yield the numeric value of X; mask to one byte.
        printf -v code '%d' "'${char}"
        printf -v hex '%%%02X' "$(( code & 0xFF ))"
        encoded+="${hex}"
        ;;
    esac
  done
  printf '%s\n' "${encoded}"
}
|
||||
|
||||
# Query the MediaWiki API for the members of a category (pages only, up to
# 500 per call, sorted by timestamp ascending) and print the JSON response.
# Arguments: $1 - category name (without the "Category:" prefix)
#            $2 - cmcontinue token of the previous result page; empty for
#                 the first page
# Outputs:   whatever doRequest prints for the built URL
doCategoryRequest () {
  local cat_param cont_param query
  cat_param=$(doUrlEncode "$1")
  cont_param=$(doUrlEncode "$2")

  query="api.php?format=json&action=query&list=categorymembers&cmtitle=Category:${cat_param}&cmtype=page&cmprop=ids|title|timestamp&cmsort=timestamp&cmlimit=500&cmdir=ascending"
  # Only continue from a previous page when a token was supplied.
  if [[ -n "${cont_param}" ]]; then
    query+="&cmcontinue=${cont_param}"
  fi

  doRequest "${query}"
}
|
||||
|
||||
# Tell whether a saved API response contains at least one category member.
# Arguments: $1 - path to the JSON response file
# Outputs:   "yes" if .query.categorymembers has a length greater than 0,
#            "no" otherwise (including when jq prints nothing)
isValidQueryResult () {
  local member_count verdict="no"
  member_count=$(jq -r '.query.categorymembers | length' "$1")
  if [[ "${member_count}" -gt 0 ]]; then
    verdict="yes"
  fi
  echo "${verdict}"
}
|
||||
|
||||
# Print the continuation token of a saved API response.
# Arguments: $1 - path to the JSON response file
# Outputs:   the value of .continue.cmcontinue; jq prints "null" when the
#            field is absent, which the main loop treats as "no more pages"
getQueryResultCmContinue () {
  jq -r '.continue.cmcontinue' "$1"
}
|
||||
|
||||
# Download one listing page of a category and store it under PAGE_PATH.
# Globals:   PAGE_PATH (read)
# Arguments: $1 - page number, used as the file name
#            $2 - category name, used for the request and as directory name
#            $3 - cmcontinue token for this page (empty for the first page)
# Outputs:   path of the stored file on success, nothing on failure
# Returns:   0 when the page was stored, 1 when the response held no
#            category members or the file could not be moved into place
fetchCategoryRequestPage () {
  local result_path="${PAGE_PATH}${2}/${1}.json"
  local tmp_file=".listing.tmp.json"

  # Download to a scratch file first so a bad response never replaces a
  # previously stored page.
  doCategoryRequest "${2}" "${3}" > "${tmp_file}"
  if [[ $(isValidQueryResult "${tmp_file}") != "yes" ]]; then
    return 1
  fi

  # Do not report a path that was never written.
  mv "${tmp_file}" "${result_path}" || return 1
  echo "${result_path}"
  return 0
}
|
||||
|
||||
# Category name from the first CLI argument, with spaces and slashes replaced
# by underscores; used both as directory name and in the API request.
CATEGORY="${1//[ \/]/_}"
PAGE=1
CMCONTINUE=""

mkdir -p "${PAGE_PATH}${CATEGORY}"

# Walk the listing pages in order. Pages already on disk are reused for their
# continuation token, except the last stored one (no successor on disk),
# which is fetched again together with everything after it.
while true; do
  NEXT_PAGE=$((PAGE+1))
  CURRENT_FILE="${PAGE_PATH}${CATEGORY}/${PAGE}.json"
  NEXT_FILE="${PAGE_PATH}${CATEGORY}/${NEXT_PAGE}.json"

  echo "Fetch page $PAGE"

  if [[ ! -f "${CURRENT_FILE}" || ! -f "${NEXT_FILE}" ]]; then
    # Page is missing, or it is the last stored page: (re)fetch it.
    RESULT_PATH=$(fetchCategoryRequestPage "${PAGE}" "${CATEGORY}" "${CMCONTINUE}")
    if [[ -z "${RESULT_PATH}" ]]; then
      break
    fi
    CMCONTINUE=$(getQueryResultCmContinue "${RESULT_PATH}")
  else
    # Page and its successor are stored: only read the token to move on.
    CMCONTINUE=$(getQueryResultCmContinue "${CURRENT_FILE}")
  fi

  # No token (jq prints "null" for a missing field) means this was the end.
  if [[ -z "${CMCONTINUE}" || "${CMCONTINUE}" == "null" ]]; then
    break
  fi

  PAGE=${NEXT_PAGE}
done
|
66
mirror.sh
Executable file
66
mirror.sh
Executable file
|
@ -0,0 +1,66 @@
|
|||
#!/bin/bash

# Directory with the category listings to read, one sub-directory per category.
PAGEINDEX_PATH="data/pageindex/"

# Output directories: raw page text by page id, and copies of template pages
# by template name.
PAGE_PATH="data/pages/"
TEMPLATE_PATH="data/templates/"

# Second CLI argument: set to "yes" to re-download pages that already exist.
FORCE_FETCH="$2"
# First CLI argument: category name, with spaces and slashes replaced by
# underscores to match the listing directory name.
CATEGORY="${1//[ \/]/_}"
|
||||
|
||||
# Fetch a URL below https://en.touhouwiki.net/ and write the response body to
# stdout, using the curl_ff95 binary from the curl-impersonate Docker image.
# Arguments: $1 - path and query string relative to the site root
# Returns:   the status of the trailing sleep, not of the request itself
doRequest () {
  docker run --rm curl-impersonate /build/out/curl_ff95 --silent --compressed "https://en.touhouwiki.net/${1}"
  # Fixed pause after every request (presumably to go easy on the wiki).
  sleep 2
}
|
||||
|
||||
# Percent-encode a string for use as a URL query value, following the
# conventions of Python's quote_plus: letters, digits and "-_.~" are kept,
# a space becomes "+", every other byte becomes %XX (uppercase hex).
# Implemented in pure Bash so the value is never interpolated into another
# interpreter's source code and no Python installation is required.
# Arguments: $1 - string to encode
# Outputs:   the encoded string followed by a newline
doUrlEncode () {
  # Byte-wise processing so multi-byte UTF-8 characters are escaped per byte.
  local LC_ALL=C
  local input="${1}" encoded="" char code hex i
  for (( i = 0; i < ${#input}; i++ )); do
    char="${input:i:1}"
    case "${char}" in
      [a-zA-Z0-9._~-])
        encoded+="${char}"
        ;;
      ' ')
        encoded+='+'
        ;;
      *)
        # "'X" makes printf yield the numeric value of X; mask to one byte.
        printf -v code '%d' "'${char}"
        printf -v hex '%%%02X' "$(( code & 0xFF ))"
        encoded+="${hex}"
        ;;
    esac
  done
  printf '%s\n' "${encoded}"
}
|
||||
|
||||
# Request the raw content of a wiki page by its numeric id
# (index.php?curid=<id>&action=raw) and print it.
# Arguments: $1 - page id
doPageRequest () {
  doRequest "index.php?curid=${1}&action=raw"
}
|
||||
|
||||
# Tell whether a downloaded file looks like raw page content rather than a
# failed download.
# Arguments: $1 - path to the downloaded file
# Outputs:   "yes" when the file exists, is not empty and contains no
#            "<html" marker; "no" otherwise
isValidPageResult () {
  # -s: exists and is non-empty. The previous check compared the literal
  # string "cat <file>" with "", which is always true, so empty downloads
  # were accepted as valid pages.
  if [[ -s "$1" ]] && ! grep -q -- '<html' "$1"; then
    echo "yes"
    return
  fi
  echo "no"
  return
}
|
||||
|
||||
# Make sure the raw text of a page is stored under PAGE_PATH, downloading it
# when it is missing or when a forced fetch was requested.
# Globals:   PAGE_PATH (read), FORCE_FETCH (read)
# Arguments: $1 - page id, used for the request and as the file name
#            (further arguments passed by callers are not used)
# Outputs:   path of the stored file on success, nothing on failure
# Returns:   0 when the file is available, 1 when the download was rejected
#            or the file could not be moved into place
fetchPage () {
  local result_path="${PAGE_PATH}${1}.wiki"
  local tmp_file=".mirror.tmp.wiki"

  # Reuse the stored copy unless a forced fetch was requested. "yes" is the
  # original value; "force" is accepted too because the README documents it.
  if [[ -f "${result_path}" && "${FORCE_FETCH}" != "yes" && "${FORCE_FETCH}" != "force" ]]; then
    echo "${result_path}"
    return 0
  fi

  # Download to a scratch file first so a bad response never replaces a
  # previously stored page.
  doPageRequest "${1}" > "${tmp_file}"
  if [[ $(isValidPageResult "${tmp_file}") != "yes" ]]; then
    return 1
  fi

  # Do not report a path that was never written.
  mv "${tmp_file}" "${result_path}" || return 1
  echo "${result_path}"
  return 0
}
|
||||
|
||||
# Make sure the output directories exist before anything is written to them.
mkdir -p "${PAGE_PATH}" "${TEMPLATE_PATH}"

# Go through every stored listing page of the category and fetch each member.
for f in "${PAGEINDEX_PATH}${CATEGORY}/"*.json; do
  # An unmatched glob is left as the literal pattern; skip it instead of
  # handing a non-existent file to jq.
  [[ -f "$f" ]] || continue

  echo "working on page ${f}"
  # One compact JSON object per category member, one per line.
  jq -c -r '.query.categorymembers[]' "$f" | while IFS= read -r item; do
    pageid=$(jq -r '.pageid' <<< "$item")
    title=$(jq -r '.title' <<< "$item")
    echo "Fetch page ${pageid} title $title"
    RESULT_PATH=$(fetchPage "${pageid}" "${title}")

    # Template pages are additionally copied under their own name, with
    # spaces and slashes replaced by underscores.
    if [[ "${RESULT_PATH}" != "" && "${title}" == Template:* ]]; then
      name=${title#Template:}
      cp -v "${RESULT_PATH}" "${TEMPLATE_PATH}${name//[ \/]/_}.wiki"
    fi
  done
done
|
Loading…
Reference in a new issue