Add file grabbing
This commit is contained in:
parent
ac25b88679
commit
730a4e60cb
3
.gitattributes
vendored
Normal file
3
.gitattributes
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
*.jpg filter=lfs diff=lfs merge=lfs -text
|
||||
*.png filter=lfs diff=lfs merge=lfs -text
|
||||
*.gif filter=lfs diff=lfs merge=lfs -text
|
5
all.sh
5
all.sh
|
@ -38,6 +38,11 @@ docker build -t curl-impersonate https://github.com/lwthiker/curl-impersonate.gi
|
|||
./listing.sh "Lyrics"
|
||||
./mirror.sh "Lyrics"
|
||||
|
||||
# Grab covers
|
||||
|
||||
./listing.sh "Album covers"
|
||||
./files.sh "Album covers"
|
||||
|
||||
# Grab circles
|
||||
|
||||
./listing.sh "Doujin Circle/Arrangement"
|
||||
|
|
79
files.sh
Executable file
79
files.sh
Executable file
|
@ -0,0 +1,79 @@
|
|||
#!/bin/bash
# Download every "File:" member of a wiki category, using page-index JSON
# produced by listing.sh. Usage: ./files.sh "<Category name>" [yes]
# (second arg "yes" forces re-download of files that already exist).

# Directory holding per-category page-index JSON chunks (one file per API page).
PAGEINDEX_PATH="data/pageindex/"

# Downloaded files are stored here, named "<pageid>.<ext>".
PAGE_PATH="data/files/"
# Human-readable symlinks (named after the wiki title) pointing into ../files/.
PAGE_NAME_PATH="data/pages_by_name/"

# "yes" re-fetches files that are already on disk.
FORCE_FETCH="$2"
# Category name with spaces and slashes flattened to underscores,
# matching the directory layout produced by listing.sh.
CATEGORY="${1//[ \/]/_}"
|
||||
# Fetch a path on the wiki and rate-limit to at most one request per second.
# $1 - path (and query string) relative to the wiki root. Body goes to stdout.
doRequest () {
  local wiki_base="https://en.touhouwiki.net"
  doFullRequest "${wiki_base}/${1}"
  sleep 1
}
|
||||
# Fetch an absolute URL through the curl-impersonate container (Firefox 95
# fingerprint), decompressed, with a 10s cap. Body goes to stdout.
# $1 - absolute URL.
doFullRequest () {
  local url="${1}"
  local -a curl_opts=(--max-time 10 --silent --compressed)
  docker run --rm curl-impersonate /build/out/curl_ff95 "${curl_opts[@]}" "${url}"
}
|
||||
|
||||
# URL-encode a string (form style: spaces become '+').
# $1 - raw string; encoded result is printed on stdout.
# The value is passed via argv so quotes/backslashes in it cannot escape the
# Python snippet — the previous inline-'''…''' interpolation was injectable
# and used Python-2-only syntax (`print urllib.quote_plus`), which fails on
# any modern `python`.
doUrlEncode () {
  python3 -c 'import sys, urllib.parse; print(urllib.parse.quote_plus(sys.argv[1]))' "${1}"
}
|
||||
|
||||
# Decide whether a downloaded file looks like real content rather than a
# failure: the wiki serves an HTML error page on bad requests, and a timed-out
# transfer leaves an empty file. Prints "yes" when the file is non-empty and
# contains no '<html' marker, "no" otherwise.
# (The empty-file check mirrors the fix applied to listing.sh's copy of this
# function in the same change — previously an empty download passed as valid.)
isValidPageResult () {
  if [[ $(cat "$1") != "" && $(grep '<html' "$1") == "" ]]; then
    echo "yes"
    return
  fi
  echo "no"
  return
}
|
||||
|
||||
# Resolve a wiki file-page id to the direct URL of the underlying media file
# via the MediaWiki imageinfo API. Prints the URL (or "null" when the page has
# no imageinfo) on stdout.
# $1 - numeric pageid.
fetchFileURL() {
  doRequest "api.php?action=query&prop=imageinfo&iiprop=url&format=json&pageids=${1}" \
    | jq -r --arg id "${1}" '.query.pages[$id].imageinfo[0].url'
}
|
||||
|
||||
# Download the media file behind a wiki file page.
# $1 - pageid, $2 - title (unused; kept for call-site compatibility),
# $3 - file extension.
# On success prints the stored path (${PAGE_PATH}<pageid>.<ext>) on stdout and
# returns 0; on failure returns 1. The resolved URL is logged to stderr.
# Honors FORCE_FETCH=yes to re-download files that already exist.
fetchFile () {
  RESULT_PATH="${PAGE_PATH}${1}.${3}"

  # Cache hit: answer immediately. Previously the imageinfo API was queried
  # (plus a 1s rate-limit sleep) before this check, wasting a request per
  # already-downloaded file.
  if [[ -f "${RESULT_PATH}" && "$FORCE_FETCH" != "yes" ]]; then
    echo "${RESULT_PATH}"
    return 0
  fi

  uri=$(fetchFileURL "${1}")
  if [[ "${uri}" == "" ]]; then
    return 1
  fi

  echo "${uri}" >&2

  # Download into a unique temp file (not a fixed name) so concurrent runs
  # cannot clobber each other, and a garbage response never overwrites a
  # previously downloaded good copy.
  tmpfile=$(mktemp .files.XXXXXX) || return 1
  doFullRequest "${uri}" > "${tmpfile}"
  if [[ $(isValidPageResult "${tmpfile}") == "yes" ]]; then
    mv "${tmpfile}" "${RESULT_PATH}"
    echo "${RESULT_PATH}"
    return 0
  fi

  rm -f "${tmpfile}"
  return 1
}
|
||||
|
||||
# Walk every page-index chunk for the category and mirror each "File:" member,
# then (re)point a by-name symlink at the stored copy.
for index_file in "${PAGEINDEX_PATH}${CATEGORY}/"*.json; do
  echo "working on page ${index_file}"
  jq -c -r '.query.categorymembers[]' "$index_file" | while read -r member; do
    pageid=$(jq -r '.pageid' <<< "$member")
    title=$(jq -r '.title' <<< "$member")

    # Only file pages carry a downloadable attachment.
    [[ "${title}" == File:* ]] || continue

    ext="${title##*.}"
    echo "Fetch file ${pageid} title $title extension $ext"
    RESULT_PATH=$(fetchFile "${pageid}" "${title}" "${ext}")

    # Empty RESULT_PATH means the fetch failed; skip the symlink.
    if [[ "${RESULT_PATH}" != "" ]]; then
      linkname="${PAGE_NAME_PATH}${title//[ \/]/_}"
      rm "${linkname}" 2>/dev/null
      ln -s "../files/${pageid}.${ext}" "${linkname}"
    fi
  done
done
|
|
@ -22,7 +22,7 @@ doPageRequest () {
|
|||
}
|
||||
|
||||
isValidPageResult () {
|
||||
if [[ "cat $1" != "" && $(grep '<html' "$1") == "" ]]; then
|
||||
if [[ $(cat "$1") != "" && $(grep '<html' "$1") == "" ]]; then
|
||||
echo "yes"
|
||||
return
|
||||
fi
|
||||
|
@ -55,7 +55,7 @@ for f in "${PAGEINDEX_PATH}${CATEGORY}/"*.json; do
|
|||
jq -c -r '.query.categorymembers[]' "$f" | while read -r item; do
|
||||
pageid=$(jq -r '.pageid' <<< "$item")
|
||||
title=$(jq -r '.title' <<< "$item")
|
||||
if [[ "${title}" == Category:* || "${title}" == User:* ]]; then
|
||||
if [[ "${title}" == Category:* || "${title}" == User:* || "${title}" == File:* ]]; then
|
||||
continue
|
||||
fi
|
||||
echo "Fetch page ${pageid} title $title"
|
||||
|
@ -64,7 +64,7 @@ for f in "${PAGEINDEX_PATH}${CATEGORY}/"*.json; do
|
|||
if [[ "${RESULT_PATH}" != "" ]]; then
|
||||
linkname="${PAGE_NAME_PATH}${title//[ \/]/_}.wiki"
|
||||
rm "${linkname}" 2>/dev/null
|
||||
ln -s "../pages/${pageid}.wiki" "${PAGE_NAME_PATH}${title//[ \/]/_}.wiki"
|
||||
ln -s "../pages/${pageid}.wiki" "${linkname}"
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
|
Loading…
Reference in a new issue