Add file grabbing

This commit is contained in:
DataHoarder 2022-02-20 03:48:30 +01:00
parent ac25b88679
commit 730a4e60cb
4 changed files with 90 additions and 3 deletions

3
.gitattributes vendored Normal file
View file

@@ -0,0 +1,3 @@
*.jpg filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text

5
all.sh
View file

@@ -38,6 +38,11 @@ docker build -t curl-impersonate https://github.com/lwthiker/curl-impersonate.gi
./listing.sh "Lyrics"
./mirror.sh "Lyrics"
# Grab covers
./listing.sh "Album covers"
./files.sh "Album covers"
# Grab circles
./listing.sh "Doujin Circle/Arrangement"

79
files.sh Executable file
View file

@@ -0,0 +1,79 @@
#!/bin/bash
# files.sh -- download every "File:" page of a wiki category.
# Usage: ./files.sh <category> [yes]
#   $1: category name; spaces and slashes are replaced with '_' below.
#   $2: pass "yes" to re-download files that already exist locally.
PAGEINDEX_PATH="data/pageindex/"  # per-category JSON listings (presumably written by listing.sh -- confirm)
PAGE_PATH="data/files/"  # downloaded files are stored here as <pageid>.<ext>
PAGE_NAME_PATH="data/pages_by_name/"  # title-based symlinks pointing into data/files/
FORCE_FETCH="$2"  # "yes" forces a re-download even when the target already exists
CATEGORY="${1//[ \/]/_}"  # sanitize: spaces and '/' become '_' so it is a safe directory name
doRequest () {
  # Fetch a path relative to the wiki root, then pause one second as a
  # crude rate limit between consecutive API calls.
  local url="https://en.touhouwiki.net/${1}"
  doFullRequest "${url}"
  sleep 1
}
doFullRequest () {
  # Perform one HTTP GET through the dockerized curl-impersonate binary
  # (Firefox 95 profile), capping the whole transfer at 10 seconds.
  local -a curl_args=(--max-time 10 --silent --compressed "${1}")
  docker run --rm curl-impersonate /build/out/curl_ff95 "${curl_args[@]}"
}
doUrlEncode () {
  # URL-encode a string for use in a query component (spaces become '+').
  # The original used Python 2 syntax (`print urllib.quote_plus`), which is
  # a SyntaxError on python3, and spliced $1 into the python source via
  # triple quotes (code injection if the value contains quotes). Pass the
  # value through argv instead and use the python3 stdlib.
  python3 -c 'import sys, urllib.parse; print(urllib.parse.quote_plus(sys.argv[1]))' "${1}"
}
isValidPageResult () {
  # Decide whether a fetched payload in file $1 is a real result.
  # Valid = non-empty AND not an HTML document (an '<html' marker means an
  # error/block page was served instead of raw content). The non-empty
  # check mirrors the fix this commit applies to mirror.sh's copy of this
  # function; the original files.sh version accepted empty downloads.
  # Prints "yes" or "no" on stdout so callers can capture it with $( ).
  if [[ -s "$1" ]] && ! grep -q '<html' "$1"; then
    echo "yes"
    return
  fi
  echo "no"
  return
}
fetchFileURL() {
  # Resolve a file page id ($1) to its direct download URL via the
  # MediaWiki imageinfo API; prints the URL (or jq's "null") on stdout.
  # Pass the id into jq with --arg and a generic index instead of splicing
  # it into the filter text with fragile nested quoting.
  doRequest "api.php?action=query&prop=imageinfo&iiprop=url&format=json&pageids=${1}" \
    | jq -r --arg id "${1}" '.query.pages[$id].imageinfo[0].url'
}
fetchFile () {
  # Download the file behind wiki page id $1 into PAGE_PATH as <id>.<ext>.
  #   $1 - numeric page id
  #   $2 - page title (currently unused; kept for call-site compatibility)
  #   $3 - file extension
  # Prints the resulting path on stdout; returns 0 on success or cache hit,
  # 1 when the URL cannot be resolved or the download is invalid.
  local result_path="${PAGE_PATH}${1}.${3}"
  # Cache check first: the original resolved the URL (one API round-trip
  # plus a 1s sleep) even when the file was already present.
  if [[ -f "${result_path}" && "$FORCE_FETCH" != "yes" ]]; then
    echo "${result_path}"
    return 0
  fi
  local uri
  uri=$(fetchFileURL "${1}")
  # jq -r prints the literal string "null" when the field is missing, so it
  # must be rejected too -- otherwise we try to download the URL "null".
  if [[ "${uri}" == "" || "${uri}" == "null" ]]; then
    return 1
  fi
  echo "${uri}" >&2
  doFullRequest "${uri}" > .files.tmp
  if [[ $(isValidPageResult .files.tmp) == "yes" ]]; then
    mv .files.tmp "${result_path}"
    echo "${result_path}"
    return 0
  fi
  rm -f .files.tmp  # do not leave a stale bad download behind
  return 1
}
# Walk every page-index JSON for the category, download each "File:" member,
# and create a human-readable symlink alongside the id-named file.
mkdir -p "${PAGE_PATH}" "${PAGE_NAME_PATH}"
for f in "${PAGEINDEX_PATH}${CATEGORY}/"*.json; do
  # An unmatched glob stays literal; skip it instead of handing jq a
  # nonexistent "*.json" path.
  [[ -e "$f" ]] || continue
  echo "working on page ${f}"
  jq -c -r '.query.categorymembers[]' "$f" | while read -r item; do
    pageid=$(jq -r '.pageid' <<< "$item")
    title=$(jq -r '.title' <<< "$item")
    # Only File: pages carry downloadable media; skip everything else.
    if [[ "${title}" != File:* ]]; then
      continue
    fi
    ext="${title##*.}"  # NOTE(review): a title without a '.' yields the whole title here
    echo "Fetch file ${pageid} title $title extension $ext"
    RESULT_PATH=$(fetchFile "${pageid}" "${title}" "${ext}")
    if [[ "${RESULT_PATH}" != "" ]]; then
      # Symlink by sanitized title; drop any stale link first so ln cannot fail.
      linkname="${PAGE_NAME_PATH}${title//[ \/]/_}"
      rm "${linkname}" 2>/dev/null
      ln -s "../files/${pageid}.${ext}" "${linkname}"
    fi
  done
done

View file

@@ -22,7 +22,7 @@ doPageRequest () {
}
isValidPageResult () {
if [[ "cat $1" != "" && $(grep '<html' "$1") == "" ]]; then
if [[ $(cat "$1") != "" && $(grep '<html' "$1") == "" ]]; then
echo "yes"
return
fi
@@ -55,7 +55,7 @@ for f in "${PAGEINDEX_PATH}${CATEGORY}/"*.json; do
jq -c -r '.query.categorymembers[]' "$f" | while read -r item; do
pageid=$(jq -r '.pageid' <<< "$item")
title=$(jq -r '.title' <<< "$item")
if [[ "${title}" == Category:* || "${title}" == User:* ]]; then
if [[ "${title}" == Category:* || "${title}" == User:* || "${title}" == File:* ]]; then
continue
fi
echo "Fetch page ${pageid} title $title"
@@ -64,7 +64,7 @@ for f in "${PAGEINDEX_PATH}${CATEGORY}/"*.json; do
if [[ "${RESULT_PATH}" != "" ]]; then
linkname="${PAGE_NAME_PATH}${title//[ \/]/_}.wiki"
rm "${linkname}" 2>/dev/null
ln -s "../pages/${pageid}.wiki" "${PAGE_NAME_PATH}${title//[ \/]/_}.wiki"
ln -s "../pages/${pageid}.wiki" "${linkname}"
fi
done
done