Slightly rework commons download script

This commit is contained in:
Robin van der Linde 2023-05-24 12:34:51 +02:00
parent 24013c65e8
commit c6e70da598
Signed by untrusted user: Robin-van-der-Linde
GPG key ID: 53956B3252478F0D

View file

@ -101,28 +101,46 @@ const templateMapping = {
async function main(args: string[]) { async function main(args: string[]) {
if (args.length < 2) { if (args.length < 2) {
console.log("Usage: downloadCommons.ts <output folder> <url> <?url> <?url> .. ") console.log("Usage: downloadCommons.ts <output folder> <url> <?url> <?url> .. ")
return process.exit(1)
} }
const [outputFolder, ...urls] = args const [outputFolder, ...urls] = args
for (const url of urls) { for (const url of urls) {
// Download details from the API // Download details from the API
const commonsFileName = url.split("/").pop().split("?").shift() const commonsFileNamePath = url.split("/").pop()
console.log(`Processing ${commonsFileName}...`) if (commonsFileNamePath !== undefined) {
const commonsFileName = commonsFileNamePath.split("?").shift()
const baseUrl = url.split("/").slice(0, 3).join("/") if (commonsFileName !== undefined) {
console.log(`Processing ${commonsFileName}...`)
// Check if it is a file or a category const baseUrl = url.split("/").slice(0, 3).join("/")
if (url.includes("Category:")) {
// Download all files in the category // Check if it is a file or a category
const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&list=categorymembers&cmtitle=${commonsFileName}&cmlimit=250&cmtype=file` if (url.includes("Category:")) {
const response = await fetch(apiUrl) // Download all files in the category
const apiDetails: CategoryQueryAPIResponse = await response.json() const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&list=categorymembers&cmtitle=${commonsFileName}&cmlimit=250&cmtype=file`
for (const member of apiDetails.query.categorymembers) { const response = await fetch(apiUrl)
await downloadImage(member.title, outputFolder, baseUrl) const apiDetails: CategoryQueryAPIResponse = await response.json()
for (const member of apiDetails.query.categorymembers) {
await downloadImage(member.title, outputFolder, baseUrl)
}
} else {
await downloadImage(commonsFileName, outputFolder, baseUrl)
}
} else {
console.log(
"\x1b[31m%s\x1b[0m",
`URL ${url} doesn't seem to contain a filename or category! Skipping...`
)
continue
} }
} else { } else {
await downloadImage(commonsFileName, outputFolder, baseUrl) console.log(
"\x1b[31m%s\x1b[0m",
`URL ${url} doesn't seem to be a valid URL! Skipping...`
)
continue
} }
} }
} }
@ -131,94 +149,143 @@ async function downloadImage(filename: string, outputFolder: string, baseUrl: st
const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url|extmetadata|user&iimetadataversion=latest&titles=${filename}` const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url|extmetadata|user&iimetadataversion=latest&titles=${filename}`
const response = await fetch(apiUrl) const response = await fetch(apiUrl)
const apiDetails: ImageQueryAPIResponse = await response.json() const apiDetails: ImageQueryAPIResponse = await response.json()
const missingPage = apiDetails.query.pages["-1"]
// Check if the file exists, locally or externally // Check if the file exists, locally or externally
if (apiDetails.query.pages["-1"]) { if (missingPage !== undefined) {
// Image does not exist locally, check if it exists externally // Image does not exist locally, check if it exists externally
if (apiDetails.query.pages["-1"].imagerepository !== "local" && apiDetails.query.pages["-1"].imagerepository !== "") { if (
const externalUrl = apiDetails.query.pages["-1"].imageinfo[0].descriptionurl apiDetails.query.pages["-1"].imagerepository !== "local" &&
const externalBase = externalUrl.split("/").slice(0, 3).join("/") apiDetails.query.pages["-1"].imagerepository !== ""
const externalFilename = externalUrl.split("/").pop().split("?").shift() ) {
console.log(`\x1b[33m%s\x1b[0m`, `${filename} is external, re-running with ${externalUrl}...`) // Check if we actually have image info
await downloadImage(externalFilename, outputFolder, externalBase) if (missingPage.imageinfo?.length !== undefined && missingPage.imageinfo.length > 0) {
return const externalUrl = missingPage.imageinfo[0].descriptionurl
const externalBase = externalUrl.split("/").slice(0, 3).join("/")
const externalFilenamePath = externalUrl.split("/").pop()
if (externalFilenamePath !== undefined) {
const externalFilename = externalFilenamePath.split("?").shift()
console.log(
`\x1b[33m%s\x1b[0m`,
`${filename} is external, re-running with ${externalUrl}...`
)
if (externalFilename !== undefined) {
await downloadImage(externalFilename, outputFolder, externalBase)
return
} else {
// Edge case
console.log(
`\x1b[33m%s\x1b[0m`,
`External URL ${externalUrl} doesn't seem to contain a filename or category! Skipping...`
)
}
} else {
// Edge case
console.log(
`\x1b[33m%s\x1b[0m`,
`External URL ${externalUrl} doesn't seem to be a valid URL! Skipping...`
)
return
}
} else {
console.log(
`\x1b[33m%s\x1b[0m`,
`${filename} does not have image info!, skipping...`
)
}
} }
console.log(`\x1b[33m%s\x1b[0m`, `${filename} does not exist!, skipping...`) console.log(`\x1b[33m%s\x1b[0m`, `${filename} does not exist!, skipping...`)
} else { } else {
// Harvest useful information // Harvest useful information
const wikiPage = apiDetails.query.pages[Object.keys(apiDetails.query.pages)[0]] const wikiPage = apiDetails.query.pages[Object.keys(apiDetails.query.pages)[0]]
const wikiUrl = wikiPage.imageinfo[0].descriptionurl
const fileUrl = wikiPage.imageinfo[0].url
const author = wikiPage.imageinfo[0].extmetadata?.Artist?.value || wikiPage.imageinfo[0].user
let license = wikiPage.imageinfo[0].extmetadata?.LicenseShortName?.value || null
// Check if the output folder exists // Check if we actually have image info
if (!existsSync(outputFolder)) { if (wikiPage.imageinfo?.length !== undefined && wikiPage.imageinfo.length > 0) {
const parts = outputFolder.split("/") const wikiUrl = wikiPage.imageinfo[0].descriptionurl
for (var i = 0; i < parts.length; i++) { const fileUrl = wikiPage.imageinfo[0].url
const part = parts.slice(0, i + 1).join("/") const author =
if (!existsSync(part)) { wikiPage.imageinfo[0].extmetadata?.Artist?.value || wikiPage.imageinfo[0].user
console.log(`Creating folder ${part}`) let license = wikiPage.imageinfo[0].extmetadata?.LicenseShortName?.value || null
mkdirSync(part)
}
}
}
// Check if the license is present // Check if the output folder exists
if (!license) { if (!existsSync(outputFolder)) {
console.log(`${filename} does not have a license, falling back to checking template...`) const parts = outputFolder.split("/")
const templateUrl = `${baseUrl}/w/api.php?action=query&format=json&prop=templates&titles=${filename}&tllimit=500` for (var i = 0; i < parts.length; i++) {
const templateResponse = await fetch(templateUrl) const part = parts.slice(0, i + 1).join("/")
const templateDetails: TemplateQueryAPIResponse = await templateResponse.json() if (!existsSync(part)) {
console.log(`Creating folder ${part}`)
// Loop through all templates and check if one of them is a license mkdirSync(part)
const wikiPage = templateDetails.query.pages[Object.keys(templateDetails.query.pages)[0]]
if (wikiPage.templates) {
for (const template of wikiPage.templates) {
if (templateMapping[template.title]) {
console.log(`Found license ${templateMapping[template.title]} for ${filename}`)
license = templateMapping[template.title]
} }
} }
} }
// If no license was found, skip the file // Check if the license is present
if (!license) { if (!license) {
// Log in yellow console.log(
console.log(`\x1b[33m%s\x1b[0m`, `No license found for ${filename}, skipping...`) `${filename} does not have a license, falling back to checking template...`
return )
const templateUrl = `${baseUrl}/w/api.php?action=query&format=json&prop=templates&titles=${filename}&tllimit=500`
const templateResponse = await fetch(templateUrl)
const templateDetails: TemplateQueryAPIResponse = await templateResponse.json()
// Loop through all templates and check if one of them is a license
const wikiPage =
templateDetails.query.pages[Object.keys(templateDetails.query.pages)[0]]
if (wikiPage.templates) {
for (const template of wikiPage.templates) {
if (templateMapping[template.title]) {
console.log(
`Found license ${templateMapping[template.title]} for ${filename}`
)
license = templateMapping[template.title]
}
}
}
// If no license was found, skip the file
if (!license) {
// Log in yellow
console.log(
`\x1b[33m%s\x1b[0m`,
`No license found for ${filename}, skipping...`
)
return
}
} }
}
// Download the file and save it // Download the file and save it
const cleanFileName = unescape(filename).replace("File:", "") const cleanFileName = unescape(filename).replace("File:", "")
console.log( console.log(
`Downloading ${cleanFileName} from ${fileUrl} and saving it to ${outputFolder}/${cleanFileName}...` `Downloading ${cleanFileName} from ${fileUrl} and saving it to ${outputFolder}/${cleanFileName}...`
) )
const fileResponse = await fetch(fileUrl) const fileResponse = await fetch(fileUrl)
const fileBuffer = await fileResponse.arrayBuffer() const fileBuffer = await fileResponse.arrayBuffer()
const file = Buffer.from(fileBuffer) const file = Buffer.from(fileBuffer)
const filePath = `${outputFolder}/${cleanFileName}` const filePath = `${outputFolder}/${cleanFileName}`
writeFileSync(filePath, file) writeFileSync(filePath, file)
// Save the license information // Save the license information
const licenseInfo: SmallLicense = { const licenseInfo: SmallLicense = {
path: cleanFileName, path: cleanFileName,
license: licenseMapping[license] || license, license: licenseMapping[license] || license,
authors: [author], authors: [author],
sources: [wikiUrl], sources: [wikiUrl],
} }
const licensePath = `${outputFolder}/license_info.json` const licensePath = `${outputFolder}/license_info.json`
if (!existsSync(licensePath)) { if (!existsSync(licensePath)) {
// Create the file if it doesn't exist // Create the file if it doesn't exist
writeFileSync(licensePath, JSON.stringify([licenseInfo], null, 2)) writeFileSync(licensePath, JSON.stringify([licenseInfo], null, 2))
} else {
// Append to the file if it does exist
const licenseFile = await readFileSync(licensePath, "utf8")
const licenseData = JSON.parse(licenseFile)
licenseData.push(licenseInfo)
writeFileSync(licensePath, JSON.stringify(licenseData, null, 2))
}
} else { } else {
// Append to the file if it does exist console.log(`\x1b[33m%s\x1b[0m`, `${filename} does not have image info!, skipping...`)
const licenseFile = await readFileSync(licensePath, "utf8")
const licenseData = JSON.parse(licenseFile)
licenseData.push(licenseInfo)
writeFileSync(licensePath, JSON.stringify(licenseData, null, 2))
} }
} }
} }