Add support for other wikis, external files and more

- templates
- coloured logging
This commit is contained in:
Robin van der Linde 2023-01-04 15:53:50 +01:00
parent 4b5f01c74f
commit e0447c3428
Signed by untrusted user: Robin-van-der-Linde
GPG key ID: 53956B3252478F0D

View file

@ -28,26 +28,27 @@ interface ImageQueryAPIResponse {
ns: number ns: number
title: string title: string
imagerepository: string imagerepository: string
imageinfo: { imageinfo?: {
user: string
url: string url: string
descriptionurl: string descriptionurl: string
descriptionshorturl: string descriptionshorturl: string
extmetadata: { extmetadata?: {
DateTime: ExtMetadataProp DateTime: ExtMetadataProp
ObjectName: ExtMetadataProp ObjectName: ExtMetadataProp
CommonsMetadataExtension: ExtMetadataProp CommonsMetadataExtension?: ExtMetadataProp
Categories: ExtMetadataProp Categories?: ExtMetadataProp
Assessments: ExtMetadataProp Assessments?: ExtMetadataProp
ImageDescription: ExtMetadataProp ImageDescription?: ExtMetadataProp
DateTimeOriginal: ExtMetadataProp DateTimeOriginal?: ExtMetadataProp
Credit: ExtMetadataProp Credit?: ExtMetadataProp
Artist: ExtMetadataProp Artist?: ExtMetadataProp
LicenseShortName: ExtMetadataProp LicenseShortName?: ExtMetadataProp
UsageTerms: ExtMetadataProp UsageTerms?: ExtMetadataProp
AttributionRequired: ExtMetadataProp AttributionRequired?: ExtMetadataProp
Copyrighted: ExtMetadataProp Copyrighted?: ExtMetadataProp
Restrictions: ExtMetadataProp Restrictions?: ExtMetadataProp
License: ExtMetadataProp License?: ExtMetadataProp
} }
}[] }[]
} }
@ -68,9 +69,35 @@ interface CategoryQueryAPIResponse {
} }
} }
/**
 * Typed shape of the JSON body that downloadImage reads back from its
 * `action=query&prop=templates` request when it falls back to template-based
 * license detection.
 */
interface TemplateQueryAPIResponse {
  batchcomplete: string
  query: {
    // Optional list of title rewrites (from -> to).
    // NOTE(review): presumably only sent when the API normalised a requested title - confirm.
    normalized?: Array<{ from: string; to: string }>
    // Pages keyed by a string; the caller simply takes the first key.
    // NOTE(review): keys look like page ids - confirm against the API docs.
    pages: Record<
      string,
      {
        pageid: number
        ns: number
        title: string
        // Templates used on the page; may be absent, so callers must check before iterating.
        templates?: Array<{ ns: number; title: string }>
      }
    >
  }
}
// Map license names of Wikimedia Commons to different names // Map license names of Wikimedia Commons to different names
const licenseMapping = {} const licenseMapping = {}
/**
 * Maps wiki template titles (including the `Template:` prefix) to the license
 * name they imply.
 *
 * downloadImage indexes this table with `template.title`, an arbitrary string,
 * so it is typed as a string-keyed record: lookups by any title type-check
 * under `strict`/`noImplicitAny`, and unknown titles yield `undefined` at runtime.
 */
const templateMapping: Record<string, string> = {
  "Template:PD": "Public Domain",
}
async function main(args: string[]) { async function main(args: string[]) {
if (args.length < 2) { if (args.length < 2) {
console.log("Usage: downloadCommons.ts <output folder> <url> <?url> <?url> .. ") console.log("Usage: downloadCommons.ts <output folder> <url> <?url> <?url> .. ")
@ -83,74 +110,116 @@ async function main(args: string[]) {
const commonsFileName = url.split("/").pop().split("?").shift() const commonsFileName = url.split("/").pop().split("?").shift()
console.log(`Processing ${commonsFileName}...`) console.log(`Processing ${commonsFileName}...`)
const baseUrl = url.split("/").slice(0, 3).join("/")
// Check if it is a file or a category // Check if it is a file or a category
if (url.includes("Category:")) { if (url.includes("Category:")) {
// Download all files in the category // Download all files in the category
const apiUrl = `https://commons.wikimedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=${commonsFileName}&cmlimit=250&cmtype=file` const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&list=categorymembers&cmtitle=${commonsFileName}&cmlimit=250&cmtype=file`
const response = await fetch(apiUrl) const response = await fetch(apiUrl)
const apiDetails: CategoryQueryAPIResponse = await response.json() const apiDetails: CategoryQueryAPIResponse = await response.json()
for (const member of apiDetails.query.categorymembers) { for (const member of apiDetails.query.categorymembers) {
await downloadImage(member.title, outputFolder) await downloadImage(member.title, outputFolder, baseUrl)
} }
} else { } else {
await downloadImage(commonsFileName, outputFolder) await downloadImage(commonsFileName, outputFolder, baseUrl)
} }
} }
} }
async function downloadImage(filename: string, outputFolder: string) { async function downloadImage(filename: string, outputFolder: string, baseUrl: string) {
const apiUrl = `https://commons.wikimedia.org/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url|extmetadata&titles=${filename}` const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url|extmetadata|user&iimetadataversion=latest&titles=${filename}`
const response = await fetch(apiUrl) const response = await fetch(apiUrl)
const apiDetails: ImageQueryAPIResponse = await response.json() const apiDetails: ImageQueryAPIResponse = await response.json()
// Harvest useful information // Check if the file exists, locally or externally
const wikiPage = apiDetails.query.pages[Object.keys(apiDetails.query.pages)[0]] if (apiDetails.query.pages["-1"]) {
const wikiUrl = wikiPage.imageinfo[0].descriptionurl // Image does not exist locally, check if it exists externally
const fileUrl = wikiPage.imageinfo[0].url if (apiDetails.query.pages["-1"].imagerepository !== "local" && apiDetails.query.pages["-1"].imagerepository !== "") {
const author = wikiPage.imageinfo[0].extmetadata.Artist.value const externalUrl = apiDetails.query.pages["-1"].imageinfo[0].descriptionurl
const license = wikiPage.imageinfo[0].extmetadata.LicenseShortName.value const externalBase = externalUrl.split("/").slice(0, 3).join("/")
const externalFilename = externalUrl.split("/").pop().split("?").shift()
console.log(`\x1b[33m%s\x1b[0m`, `${filename} is external, re-running with ${externalUrl}...`)
await downloadImage(externalFilename, outputFolder, externalBase)
return
}
console.log(`\x1b[33m%s\x1b[0m`, `${filename} does not exist!, skipping...`)
} else {
// Harvest useful information
const wikiPage = apiDetails.query.pages[Object.keys(apiDetails.query.pages)[0]]
const wikiUrl = wikiPage.imageinfo[0].descriptionurl
const fileUrl = wikiPage.imageinfo[0].url
const author = wikiPage.imageinfo[0].extmetadata?.Artist?.value || wikiPage.imageinfo[0].user
let license = wikiPage.imageinfo[0].extmetadata?.LicenseShortName?.value || null
// Check if the output folder exists // Check if the output folder exists
if (!existsSync(outputFolder)) { if (!existsSync(outputFolder)) {
const parts = outputFolder.split("/") const parts = outputFolder.split("/")
for (var i = 0; i < parts.length; i++) { for (var i = 0; i < parts.length; i++) {
const part = parts.slice(0, i + 1).join("/") const part = parts.slice(0, i + 1).join("/")
if (!existsSync(part)) { if (!existsSync(part)) {
console.log(`Creating folder ${part}`) console.log(`Creating folder ${part}`)
mkdirSync(part) mkdirSync(part)
}
} }
} }
}
// Download the file and save it // Check if the license is present
const cleanFileName = unescape(filename).replace("File:", "") if (!license) {
console.log( console.log(`${filename} does not have a license, falling back to checking template...`)
`Downloading ${cleanFileName} from ${fileUrl} and saving it to ${outputFolder}/${cleanFileName}...` const templateUrl = `${baseUrl}/w/api.php?action=query&format=json&prop=templates&titles=${filename}&tllimit=500`
) const templateResponse = await fetch(templateUrl)
const fileResponse = await fetch(fileUrl) const templateDetails: TemplateQueryAPIResponse = await templateResponse.json()
const fileBuffer = await fileResponse.arrayBuffer()
const file = Buffer.from(fileBuffer)
const filePath = `${outputFolder}/${cleanFileName}`
writeFileSync(filePath, file)
// Save the license information // Loop through all templates and check if one of them is a license
const licenseInfo: SmallLicense = { const wikiPage = templateDetails.query.pages[Object.keys(templateDetails.query.pages)[0]]
path: cleanFileName, if (wikiPage.templates) {
license: licenseMapping[license] || license, for (const template of wikiPage.templates) {
authors: [author], if (templateMapping[template.title]) {
sources: [wikiUrl], console.log(`Found license ${templateMapping[template.title]} for ${filename}`)
} license = templateMapping[template.title]
}
}
}
const licensePath = `${outputFolder}/license_info.json` // If no license was found, skip the file
if (!existsSync(licensePath)) { if (!license) {
// Create the file if it doesn't exist // Log in yellow
writeFileSync(licensePath, JSON.stringify([licenseInfo], null, 2)) console.log(`\x1b[33m%s\x1b[0m`, `No license found for ${filename}, skipping...`)
} else { return
// Append to the file if it does exist }
const licenseFile = await readFileSync(licensePath, "utf8") }
const licenseData = JSON.parse(licenseFile)
licenseData.push(licenseInfo) // Download the file and save it
writeFileSync(licensePath, JSON.stringify(licenseData, null, 2)) const cleanFileName = unescape(filename).replace("File:", "")
console.log(
`Downloading ${cleanFileName} from ${fileUrl} and saving it to ${outputFolder}/${cleanFileName}...`
)
const fileResponse = await fetch(fileUrl)
const fileBuffer = await fileResponse.arrayBuffer()
const file = Buffer.from(fileBuffer)
const filePath = `${outputFolder}/${cleanFileName}`
writeFileSync(filePath, file)
// Save the license information
const licenseInfo: SmallLicense = {
path: cleanFileName,
license: licenseMapping[license] || license,
authors: [author],
sources: [wikiUrl],
}
const licensePath = `${outputFolder}/license_info.json`
if (!existsSync(licensePath)) {
// Create the file if it doesn't exist
writeFileSync(licensePath, JSON.stringify([licenseInfo], null, 2))
} else {
// Append to the file if it does exist
const licenseFile = await readFileSync(licensePath, "utf8")
const licenseData = JSON.parse(licenseFile)
licenseData.push(licenseInfo)
writeFileSync(licensePath, JSON.stringify(licenseData, null, 2))
}
} }
} }