forked from MapComplete/MapComplete
		
	Commons download script
This commit is contained in:
		
							parent
							
								
									7d2fa876e3
								
							
						
					
					
						commit
						ff481d73e9
					
				
					 1 changed files with 296 additions and 0 deletions
				
			
		
							
								
								
									
										296
									
								
								scripts/downloadCommons.ts
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										296
									
								
								scripts/downloadCommons.ts
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,296 @@ | |||
/**
 * Script to download images from Wikimedia Commons, and save them together with license information.
 */
| 
 | ||||
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "fs"
import { unescape } from "querystring"
import SmallLicense from "../src/Models/smallLicense"
| 
 | ||||
/**
 * A single entry of the `extmetadata` object returned by the `imageinfo` API.
 */
interface ExtMetadataProp {
    /** The metadata value itself */
    value: string
    /** Where the API took the value from */
    source: string
    /** NOTE(review): assumed to be present only on fields flagged as hidden - confirm against the API */
    hidden?: string
}
| 
 | ||||
/**
 * Shape of the response of `action=query&prop=imageinfo` as used by this script.
 */
interface ImageQueryAPIResponse {
    /** Continuation data; only sent by the API when there is more data to fetch */
    continue?: {
        iistart: string
        continue: string
    }
    query: {
        /** Title normalizations applied by the API, if any */
        normalized?: {
            from: string
            to: string
        }[]
        /** Pages indexed by page id; this script treats the key "-1" as "page missing on this wiki" */
        pages: {
            [key: string]: {
                pageid: number
                ns: number
                title: string
                /** Compared against "local" and "" by the script to detect externally hosted files */
                imagerepository: string
                imageinfo?: {
                    user: string
                    url: string
                    descriptionurl: string
                    descriptionshorturl: string
                    extmetadata?: {
                        DateTime: ExtMetadataProp
                        ObjectName: ExtMetadataProp
                        CommonsMetadataExtension?: ExtMetadataProp
                        Categories?: ExtMetadataProp
                        Assessments?: ExtMetadataProp
                        ImageDescription?: ExtMetadataProp
                        DateTimeOriginal?: ExtMetadataProp
                        Credit?: ExtMetadataProp
                        Artist?: ExtMetadataProp
                        LicenseShortName?: ExtMetadataProp
                        UsageTerms?: ExtMetadataProp
                        AttributionRequired?: ExtMetadataProp
                        Copyrighted?: ExtMetadataProp
                        Restrictions?: ExtMetadataProp
                        License?: ExtMetadataProp
                    }
                }[]
            }
        }
    }
}
| 
 | ||||
/**
 * One entry of the `categorymembers` list returned by the API.
 */
interface CategoryMember {
    pageid: number
    /** Numeric namespace of the page */
    ns: number
    /** Full page title; passed on as the file name to download */
    title: string
}
| 
 | ||||
/**
 * Shape of the response of `action=query&list=categorymembers`.
 */
interface CategoryQueryAPIResponse {
    /** Only present once the API has no further batches for this query */
    batchcomplete?: string
    /** Only present when more members are available; pass `cmcontinue` on to get the next page */
    continue?: {
        cmcontinue: string
        continue: string
    }
    query: {
        categorymembers: CategoryMember[]
    }
}
| 
 | ||||
/**
 * Shape of the response of `action=query&prop=templates`.
 */
interface TemplateQueryAPIResponse {
    /** Only present once the API has no further batches for this query */
    batchcomplete?: string
    query: {
        /** Title normalizations applied by the API, if any */
        normalized?: {
            from: string
            to: string
        }[]
        /** Pages indexed by page id */
        pages: {
            [key: string]: {
                pageid: number
                ns: number
                title: string
                /** Templates used on the page; absent when the page uses none */
                templates?: {
                    ns: number
                    title: string
                }[]
            }
        }
    }
}
| 
 | ||||
// Map license names of Wikimedia Commons to different names.
// Typed as a string dictionary so that lookups by an arbitrary license name type-check.
const licenseMapping: Record<string, string> = {}
| 
 | ||||
// Map template names to license names.
// Typed as a string dictionary so that lookups by an arbitrary template title type-check.
const templateMapping: Record<string, string> = {
    "Template:PD": "Public Domain",
}
| 
 | ||||
/**
 * Lists the titles of all files in a category, following the API continuation until the
 * listing is complete.
 *
 * @param baseUrl Origin of the wiki, e.g. `https://commons.wikimedia.org`
 * @param encodedCategoryTitle Percent-encoded category title, including the `Category:` prefix
 * @returns The (unencoded) titles of all files in the category
 * @throws Error when the API answers with a non-2xx status
 */
async function listCategoryFileTitles(
    baseUrl: string,
    encodedCategoryTitle: string
): Promise<string[]> {
    const titles: string[] = []
    let continuation = ""
    do {
        const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&list=categorymembers&cmtitle=${encodedCategoryTitle}&cmlimit=250&cmtype=file${continuation}`
        const response = await fetch(apiUrl)
        if (!response.ok) {
            throw new Error(
                `Request to ${apiUrl} failed with ${response.status} ${response.statusText}`
            )
        }
        // `query` is missing when the API reports an error, `continue` when this was the last page
        const apiDetails: Partial<CategoryQueryAPIResponse> & {
            continue?: { cmcontinue?: string }
        } = await response.json()
        for (const member of apiDetails.query?.categorymembers ?? []) {
            titles.push(member.title)
        }
        const token = apiDetails.continue?.cmcontinue
        continuation = token ? `&cmcontinue=${encodeURIComponent(token)}` : ""
    } while (continuation !== "")
    return titles
}

/**
 * Entry point of the script: downloads every given file, or every file of every given category,
 * into the output folder.
 *
 * A URL that cannot be processed is reported in red and skipped, so the remaining URLs are
 * still handled; the process exit code is set to 1 in that case.
 *
 * @param args Command line arguments: the output folder, followed by one or more page URLs
 */
async function main(args: string[]): Promise<void> {
    if (args.length < 2) {
        console.log("Usage: downloadCommons.ts <output folder> <url> <?url> <?url> .. ")
        console.log(
            "Example: npx vite-node downloadCommons.ts -- assets/svg https://commons.wikimedia.org/wiki/File:Example.jpg"
        )
        process.exit(1)
    }
    const [outputFolder, ...urls] = args

    for (const url of urls) {
        let parsedUrl: URL
        try {
            parsedUrl = new URL(url)
        } catch {
            console.log(
                "\x1b[31m%s\x1b[0m",
                `URL ${url} doesn't seem to be a valid URL! Skipping...`
            )
            continue
        }

        // Pages are addressed either as /wiki/<title> or as /w/index.php?title=<title>.
        // Query string and fragment are dropped by looking at the pathname only.
        const pageTitle =
            parsedUrl.searchParams.get("title") ??
            unescape(parsedUrl.pathname.split("/").pop() ?? "")
        if (pageTitle === "") {
            console.log(
                "\x1b[31m%s\x1b[0m",
                `URL ${url} doesn't seem to contain a filename or category! Skipping...`
            )
            continue
        }

        console.log(`Processing ${pageTitle}...`)
        const baseUrl = parsedUrl.origin

        try {
            // Check if it is a file or a category
            if (pageTitle.startsWith("Category:")) {
                // Download all files in the category
                const fileTitles = await listCategoryFileTitles(
                    baseUrl,
                    encodeURIComponent(pageTitle)
                )
                for (const fileTitle of fileTitles) {
                    // Titles are handed over percent-encoded so they are safe inside a query string
                    await downloadImage(encodeURIComponent(fileTitle), outputFolder, baseUrl)
                }
            } else {
                await downloadImage(encodeURIComponent(pageTitle), outputFolder, baseUrl)
            }
        } catch (error: unknown) {
            const reason = error instanceof Error ? error.message : String(error)
            console.log("\x1b[31m%s\x1b[0m", `Failed to process ${url}: ${reason}`)
            process.exitCode = 1
        }
    }
}
| 
 | ||||
/**
 * Fetches a URL and fails loudly on a non-2xx answer, so that an error page is never
 * mistaken for API data.
 */
async function fetchOrThrow(url: string): Promise<Response> {
    const response = await fetch(url)
    if (!response.ok) {
        throw new Error(`Request to ${url} failed with ${response.status} ${response.statusText}`)
    }
    return response
}

/**
 * Looks at the templates used on a file page and returns the license of the last template
 * that is listed in `templateMapping`, or undefined if there is none.
 */
async function findTemplateLicense(
    title: string,
    encodedTitle: string,
    baseUrl: string
): Promise<string | undefined> {
    const templateUrl = `${baseUrl}/w/api.php?action=query&format=json&prop=templates&titles=${encodedTitle}&tllimit=500`
    const templateDetails: Partial<TemplateQueryAPIResponse> = await (
        await fetchOrThrow(templateUrl)
    ).json()
    const knownTemplates: Record<string, string | undefined> = templateMapping
    const templates = Object.values(templateDetails.query?.pages ?? {})[0]?.templates ?? []

    let license: string | undefined
    for (const template of templates) {
        const mapped = knownTemplates[template.title]
        if (mapped) {
            console.log(`Found license ${mapped} for ${title}`)
            license = mapped
        }
    }
    return license
}

/**
 * Stores a license entry in the given JSON file, creating the file when needed and replacing
 * an older entry for the same path so that re-running the script does not pile up duplicates.
 *
 * @throws Error when the existing file does not contain a JSON array
 */
function recordLicenseInfo(licensePath: string, licenseInfo: SmallLicense): void {
    let entries: SmallLicense[] = []
    if (existsSync(licensePath)) {
        const parsed: unknown = JSON.parse(readFileSync(licensePath, "utf8"))
        if (!Array.isArray(parsed)) {
            throw new Error(`${licensePath} does not contain a JSON array`)
        }
        entries = parsed.filter((entry) => entry?.path !== licenseInfo.path)
    }
    entries.push(licenseInfo)
    writeFileSync(licensePath, JSON.stringify(entries, null, 2))
}

/**
 * Downloads a single file from a MediaWiki instance into `outputFolder` and records its
 * license, author and source page in `<outputFolder>/license_info.json`.
 *
 * If the file is hosted on another repository, the download is retried against that
 * repository. Files without image info or without a detectable license are skipped with a
 * warning.
 *
 * @param filename Page title including the `File:` prefix; may be percent-encoded
 * @param outputFolder Folder to store the file in; created (including parents) when missing
 * @param baseUrl Origin of the wiki, e.g. `https://commons.wikimedia.org`
 * @throws Error when an API request answers with a non-2xx status, or when the existing
 *     license_info.json is not a JSON array
 */
async function downloadImage(
    filename: string,
    outputFolder: string,
    baseUrl: string
): Promise<void> {
    // Callers pass percent-encoded titles (taken from URLs) as well as plain ones (taken from
    // API responses): normalise to the decoded form and encode exactly once per request.
    const title = unescape(filename)
    const encodedTitle = encodeURIComponent(title)

    const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url|extmetadata|user&iimetadataversion=latest&titles=${encodedTitle}`
    const apiDetails: Partial<ImageQueryAPIResponse> = await (await fetchOrThrow(apiUrl)).json()
    const pages = apiDetails.query?.pages ?? {}

    // Check if the file exists, locally or externally
    const missingPage = pages["-1"]
    if (missingPage !== undefined) {
        const isExternal =
            missingPage.imagerepository !== "local" && missingPage.imagerepository !== ""
        if (!isExternal) {
            console.log(`\x1b[33m%s\x1b[0m`, `${title} does not exist!, skipping...`)
            return
        }

        // Check if we actually have image info
        const externalInfo = missingPage.imageinfo?.[0]
        if (externalInfo === undefined) {
            console.log(`\x1b[33m%s\x1b[0m`, `${title} does not have image info!, skipping...`)
            return
        }

        const externalUrl = externalInfo.descriptionurl
        const externalBase = externalUrl.split("/").slice(0, 3).join("/")
        const externalFilename = externalUrl.split("/").pop()?.split("?").shift()
        // Asking the same wiki again would give the same answer and recurse forever
        if (!externalFilename || externalBase === baseUrl) {
            console.log(
                `\x1b[33m%s\x1b[0m`,
                `External URL ${externalUrl} doesn't seem to contain a filename or category! Skipping...`
            )
            return
        }

        console.log(
            `\x1b[33m%s\x1b[0m`,
            `${title} is external, re-running with ${externalUrl}...`
        )
        await downloadImage(externalFilename, outputFolder, externalBase)
        return
    }

    // Harvest useful information
    const imageInfo = Object.values(pages)[0]?.imageinfo?.[0]
    if (imageInfo === undefined) {
        console.log(`\x1b[33m%s\x1b[0m`, `${title} does not have image info!, skipping...`)
        return
    }
    const wikiUrl = imageInfo.descriptionurl
    const fileUrl = imageInfo.url
    // `||` on purpose: an empty value has to fall back as well.
    // NOTE(review): the Artist value is stored verbatim; it may contain markup - confirm
    const author = imageInfo.extmetadata?.Artist?.value || imageInfo.user
    let license: string | undefined = imageInfo.extmetadata?.LicenseShortName?.value || undefined

    // Check if the license is present
    if (!license) {
        console.log(`${title} does not have a license, falling back to checking template...`)
        license = await findTemplateLicense(title, encodedTitle, baseUrl)

        // If no license was found, skip the file
        if (!license) {
            console.log(`\x1b[33m%s\x1b[0m`, `No license found for ${title}, skipping...`)
            return
        }
    }

    // Strip the namespace prefix only at the start of the title, and never let a title point
    // outside of the output folder
    const cleanFileName = title.replace(/^File:/, "").replace(/[\\/]/g, "_")
    console.log(
        `Downloading ${cleanFileName} from ${fileUrl} and saving it to ${outputFolder}/${cleanFileName}...`
    )
    const fileResponse = await fetch(fileUrl)
    if (!fileResponse.ok) {
        // Do not store an error page as if it were the image
        console.log(
            "\x1b[31m%s\x1b[0m",
            `Downloading ${fileUrl} failed with ${fileResponse.status} ${fileResponse.statusText}, skipping...`
        )
        return
    }
    const file = Buffer.from(await fileResponse.arrayBuffer())

    // Create the output folder, including missing parents; this also works for absolute paths
    if (!existsSync(outputFolder)) {
        console.log(`Creating folder ${outputFolder}`)
        mkdirSync(outputFolder, { recursive: true })
    }
    writeFileSync(`${outputFolder}/${cleanFileName}`, file)

    // Save the license information
    const knownLicenses: Record<string, string | undefined> = licenseMapping
    const licenseInfo: SmallLicense = {
        path: cleanFileName,
        license: knownLicenses[license] || license,
        authors: [author],
        sources: [wikiUrl],
    }
    recordLicenseInfo(`${outputFolder}/license_info.json`, licenseInfo)
}
| 
 | ||||
// Run the script; a rejected promise is reported and turned into a non-zero exit code
// instead of being left unhandled.
main(process.argv.slice(2)).catch((error: unknown) => {
    console.error(error)
    process.exitCode = 1
})
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue