forked from MapComplete/MapComplete
220 lines
8.6 KiB
TypeScript
220 lines
8.6 KiB
TypeScript
/**
 * Script to download images from Wikimedia Commons, and save them together with license information.
 */
|
|
|
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs"
|
|
import { unescape } from "querystring"
|
|
import SmallLicense from "../src/Models/smallLicense"
|
|
import Wikimedia, { ImageQueryAPIResponse } from "../src/Logic/Web/Wikimedia"
|
|
|
|
/**
 * A single entry of `query.categorymembers` as read by `main` from a
 * `list=categorymembers` API response.
 */
interface CategoryMember {
    /** Numeric id of the member page. */
    pageid: number
    /** Numeric `ns` value reported for the member page. */
    ns: number
    /** Title of the member page; `main` hands it unchanged to `downloadImage`. */
    title: string
}
|
|
|
|
/**
 * JSON body that `main` expects back from an `action=query&list=categorymembers` request.
 */
interface CategoryQueryAPIResponse {
    /** Declared as a string; not read anywhere in this script. */
    batchcomplete: string
    /** Result container of the query. */
    query: {
        /** The files in the category; each title is downloaded in turn by `main`. */
        categorymembers: CategoryMember[]
    }
}
|
|
|
|
/**
 * JSON body that `main` expects back from an `action=query&prop=images` request.
 */
interface ImagesQueryAPIResponse {
    /**
     * Continuation tokens. Not read anywhere in this script.
     * NOTE(review): declared as required here — confirm whether the API always sends it.
     */
    continue: {
        imcontinue: string
        continue: string
    }
    /** Result container of the query. */
    query: {
        /** Optional list of title rewrites (`from` → `to`); not read in this script. */
        normalized?: Array<{
            from: string
            to: string
        }>
        /** Pages keyed by a string key; `main` only looks at the first entry. */
        pages: Record<
            string,
            {
                pageid: number
                ns: number
                title: string
                /** Images used on the page; when absent `main` reports that the URL has no images. */
                images?: Array<{
                    ns: number
                    title: string
                }>
            }
        >
    }
}
|
|
|
|
/**
 * Downloads images from Wikimedia Commons (or another wiki exposing `/w/api.php`) into a folder.
 *
 * Every URL is dispatched on its text:
 * - it contains `Category:` → every file listed in the category is downloaded;
 * - it contains `File:` → that single file is downloaded;
 * - anything else is treated as a page URL and every image used on that page is downloaded.
 *
 * URLs that yield no usable page title, or whose API response carries no result, are reported
 * on the console and skipped; processing continues with the next URL.
 *
 * When fewer than two arguments are given, usage information is printed and the process exits
 * with code 1.
 *
 * @param args the output folder, followed by one or more URLs
 */
export async function main(args: string[]) {
    if (args.length < 2) {
        console.log("Usage: downloadCommons.ts <output folder> <url> <?url> <?url> .. ")
        console.log(
            "Example: npx vite-node scripts/downloadCommons.ts -- assets/svg https://commons.wikimedia.org/wiki/File:Example.jpg"
        )
        process.exit(1)
    }
    const [outputFolder, ...urls] = args
    const red = "\x1b[31m%s\x1b[0m"

    for (const url of urls) {
        // The page title is the last path segment. `split(...).pop()` never yields `undefined`
        // for a string, so an unusable URL (e.g. one ending in "/") shows up as an empty string.
        const commonsFileNamePath = url.split("/").pop()
        if (!commonsFileNamePath) {
            console.log(red, `URL ${url} doesn't seem to be a valid URL! Skipping...`)
            continue
        }

        // Drop the query string and the fragment; neither is part of the title
        const commonsFileName = commonsFileNamePath.split(/[?#]/).shift()
        if (!commonsFileName) {
            console.log(
                red,
                `URL ${url} doesn't seem to contain a filename or category! Skipping...`
            )
            continue
        }

        console.log(`Processing ${commonsFileName}...`)

        // Scheme and host, e.g. "https://commons.wikimedia.org"
        const baseUrl = url.split("/").slice(0, 3).join("/")
        // Decode first so that titles which are already percent-encoded are not encoded twice
        const apiTitle = encodeURIComponent(unescape(commonsFileName))

        // Check if it is a file or a category
        if (url.includes("Category:")) {
            // Download all files in the category.
            // Only the first batch (at most 250 members) is requested; continuation is not followed.
            const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&list=categorymembers&cmtitle=${apiTitle}&cmlimit=250&cmtype=file`
            const response = await fetch(apiUrl)
            // An error response has no `query`, hence the Partial
            const apiDetails: Partial<CategoryQueryAPIResponse> = await response.json()
            const members = apiDetails.query?.categorymembers
            if (members === undefined) {
                console.log(red, `URL ${url} doesn't seem to be a valid category! Skipping...`)
                continue
            }
            for (const member of members) {
                await downloadImage(member.title, outputFolder, baseUrl)
            }
        } else if (url.includes("File:")) {
            await downloadImage(commonsFileName, outputFolder, baseUrl)
        } else {
            // Probably a page url, try to get all images from the page.
            // Only the first batch (at most 250 images) is requested; continuation is not followed.
            const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&prop=images&titles=${apiTitle}&imlimit=250`
            const response = await fetch(apiUrl)
            const apiDetails: Partial<ImagesQueryAPIResponse> = await response.json()
            // A single title was requested, so only the first page entry is of interest
            const page = Object.values(apiDetails.query?.pages ?? {})[0]
            if (page?.images) {
                for (const image of page.images) {
                    await downloadImage(image.title, outputFolder, baseUrl)
                }
            } else {
                console.log(red, `URL ${url} doesn't seem to contain any images! Skipping...`)
            }
        }
    }
}
|
|
|
|
/**
 * Downloads one file from a wiki into `outputFolder` and appends its license information to
 * `<outputFolder>/license_info.json`.
 *
 * The file is stored under its decoded title without the `File:` prefix. If a file with that
 * name is already present, nothing is requested or written.
 *
 * If the wiki reports the title as missing locally but names another image repository, the
 * function re-runs itself against the site found in the file's description URL.
 *
 * Problems (missing file, missing image info, failed download) are reported on the console and
 * the file is skipped.
 *
 * @param filename title of the file page, e.g. `File:Example.jpg`; may be percent-encoded
 * @param outputFolder folder to save into; created, including parents, when it does not exist
 * @param baseUrl scheme and host of the wiki, e.g. `https://commons.wikimedia.org`
 */
async function downloadImage(filename: string, outputFolder: string, baseUrl: string) {
    const yellow = `\x1b[33m%s\x1b[0m`

    // The name on disk: decoded, without the namespace prefix
    const decodedTitle = unescape(filename)
    const cleanFileName = decodedTitle.replace("File:", "")
    const filePath = `${outputFolder}/${cleanFileName}`

    // Check against the path that is actually written below, and do so before any request is made
    if (existsSync(filePath)) {
        console.log(yellow, `${filename} already exists, skipping...`)
        return
    }

    // Titles coming from the API are not encoded and may contain spaces, "&" or "+";
    // encode the decoded title so both encoded and raw inputs produce a valid query
    const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url|extmetadata|user&iimetadataversion=latest&titles=${encodeURIComponent(
        decodedTitle
    )}`
    const response = await fetch(apiUrl)
    const apiDetails: ImageQueryAPIResponse = await response.json()
    const missingPage = apiDetails.query.pages["-1"]

    if (missingPage !== undefined) {
        // The page does not exist on this wiki; the file may still live in another repository
        const repository = missingPage.imagerepository
        if (repository !== "local" && repository !== "") {
            const externalInfo = missingPage.imageinfo?.[0]
            if (externalInfo === undefined) {
                console.log(yellow, `${filename} does not have image info!, skipping...`)
                return
            }

            const externalUrl = externalInfo.descriptionurl
            const externalBase = externalUrl.split("/").slice(0, 3).join("/")
            const externalFilename = externalUrl.split("/").pop()?.split("?").shift()
            if (!externalFilename) {
                // Edge case
                console.log(
                    yellow,
                    `External URL ${externalUrl} doesn't seem to contain a filename or category! Skipping...`
                )
                return
            }

            console.log(yellow, `${filename} is external, re-running with ${externalUrl}...`)
            await downloadImage(externalFilename, outputFolder, externalBase)
            return
        }
        console.log(yellow, `${filename} does not exist!, skipping...`)
        return
    }

    // A single title was requested, so the first page entry is the one we asked for
    const wikiPage = Object.values(apiDetails.query.pages)[0]
    const imageInfo = wikiPage?.imageinfo?.[0]
    if (imageInfo === undefined) {
        console.log(yellow, `${filename} does not have image info!, skipping...`)
        return
    }

    // `recursive` creates missing parents and also copes with absolute paths
    if (!existsSync(outputFolder)) {
        console.log(`Creating folder ${outputFolder}`)
        mkdirSync(outputFolder, { recursive: true })
    }

    // Download the file and save it
    const fileUrl = imageInfo.url
    console.log(`Downloading ${cleanFileName} from ${fileUrl} and saving it to ${filePath}...`)
    const fileResponse = await fetch(fileUrl)
    if (!fileResponse.ok) {
        // Do not store an error page under the image's name
        console.log(
            "\x1b[31m%s\x1b[0m",
            `Downloading ${fileUrl} failed with HTTP status ${fileResponse.status}, skipping...`
        )
        return
    }
    writeFileSync(filePath, Buffer.from(await fileResponse.arrayBuffer()))

    // Save the license information: append to the existing list, or start a new one
    const licenseInfo: SmallLicense = await Wikimedia.getLicenseFor(baseUrl, filename)
    const licensePath = `${outputFolder}/license_info.json`
    const licenseData: SmallLicense[] = existsSync(licensePath)
        ? JSON.parse(readFileSync(licensePath, "utf8"))
        : []
    licenseData.push(licenseInfo)
    writeFileSync(licensePath, JSON.stringify(licenseData, null, 2))
}
|
|
|
|
// main(process.argv.slice(2))
|