forked from MapComplete/MapComplete
		
	Scripts: Update imageAnalysis script to also back up the images
This commit is contained in:
		
							parent
							
								
									e5cc7eec71
								
							
						
					
					
						commit
						088fbe1d07
					
				
					 3 changed files with 156 additions and 48 deletions
				
			
		|  | @ -1,3 +1,5 @@ | ||||||
|  | (To rerun the analysis, run 'scripts/generateImageAnalysis'. Delete the 'features_with_*.geojson' files first to force an update of the OSM dataset.) | ||||||
|  | 
 | ||||||
| # What licenses are used? | # What licenses are used? | ||||||
| 
 | 
 | ||||||
| Now that MapComplete is three and a half years old, it's a good time to see which licenses people are using to upload their images. | Now that MapComplete is three and a half years old, it's a good time to see which licenses people are using to upload their images. | ||||||
|  |  | ||||||
|  | @ -37,14 +37,16 @@ export default class ScriptUtils { | ||||||
|         return result |         return result | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     public static DownloadFileTo(url, targetFilePath: string): void { |     public static DownloadFileTo(url, targetFilePath: string): Promise<void> { | ||||||
|         console.log("Downloading ", url, "to", targetFilePath) |         ScriptUtils.erasableLog("Downloading", url, "to", targetFilePath) | ||||||
|  |         return new Promise<void>((resolve, err) => { | ||||||
|             https.get(url, (res) => { |             https.get(url, (res) => { | ||||||
|                 const filePath = fs.createWriteStream(targetFilePath) |                 const filePath = fs.createWriteStream(targetFilePath) | ||||||
|                 res.pipe(filePath) |                 res.pipe(filePath) | ||||||
|                 filePath.on("finish", () => { |                 filePath.on("finish", () => { | ||||||
|                     filePath.close() |                     filePath.close() | ||||||
|                 console.log("Download Completed") |                     resolve() | ||||||
|  |                 }) | ||||||
|             }) |             }) | ||||||
|         }) |         }) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  | @ -9,6 +9,7 @@ import ScriptUtils from "./ScriptUtils" | ||||||
| import {Imgur} from "../Logic/ImageProviders/Imgur" | import {Imgur} from "../Logic/ImageProviders/Imgur" | ||||||
| import {LicenseInfo} from "../Logic/ImageProviders/LicenseInfo" | import {LicenseInfo} from "../Logic/ImageProviders/LicenseInfo" | ||||||
| import {Utils} from "../Utils" | import {Utils} from "../Utils" | ||||||
|  | import Constants from "../Models/Constants"; | ||||||
| 
 | 
 | ||||||
| export default class GenerateImageAnalysis extends Script { | export default class GenerateImageAnalysis extends Script { | ||||||
|     constructor() { |     constructor() { | ||||||
|  | @ -17,9 +18,9 @@ export default class GenerateImageAnalysis extends Script { | ||||||
|         ) |         ) | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     async fetchImages(key: string, datapath: string): Promise<void> { |     async fetchImages(key: string, datapath: string, refresh: boolean): Promise<void> { | ||||||
|         const targetPath = `${datapath}/features_with_${key.replace(/[:\/]/, "_")}.geojson` |         const targetPath = `${datapath}/features_with_${key.replace(/[:\/]/, "_")}.geojson` | ||||||
|         if (fs.existsSync(targetPath)) { |         if (fs.existsSync(targetPath) && !refresh) { | ||||||
|             console.log("Skipping", key) |             console.log("Skipping", key) | ||||||
|             return |             return | ||||||
|         } |         } | ||||||
|  | @ -27,27 +28,26 @@ export default class GenerateImageAnalysis extends Script { | ||||||
|         const overpass = new Overpass( |         const overpass = new Overpass( | ||||||
|             tag, |             tag, | ||||||
|             [], |             [], | ||||||
|             "https://overpass.kumi.systems/api/interpreter", |             Constants.defaultOverpassUrls[0], //"https://overpass.kumi.systems/api/interpreter",
 | ||||||
|             new ImmutableStore(500), |             new ImmutableStore(500), | ||||||
|             undefined, |             undefined, | ||||||
|             false |             false | ||||||
|         ) |         ) | ||||||
|         console.log("Starting query...") |         console.log("Starting query...") | ||||||
|         const data = await overpass.queryGeoJson(BBox.global) |         const data = await overpass.queryGeoJson(BBox.global) | ||||||
|         console.log("Got data: ", data[0].features.length) |         console.log("Got data:", data[0].features.length, "items; timestamp:", data[1].toISOString()) | ||||||
|         fs.writeFileSync(targetPath, JSON.stringify(data[0]), "utf8") |         fs.writeFileSync(targetPath, JSON.stringify(data[0]), "utf8") | ||||||
|         console.log("Written", targetPath) |         console.log("Written", targetPath) | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     async downloadData(datapath: string): Promise<void> { |     async downloadData(datapath: string, refresh: boolean): Promise<void> { | ||||||
|         if (!fs.existsSync(datapath)) { |         if (!fs.existsSync(datapath)) { | ||||||
|             fs.mkdirSync(datapath) |             fs.mkdirSync(datapath) | ||||||
|         } |         } | ||||||
| 
 |         await this.fetchImages("image", datapath, refresh) | ||||||
|         await this.fetchImages("image", datapath) |         await this.fetchImages("image:streetsign", datapath, refresh) | ||||||
|         await this.fetchImages("image:streetsign", datapath) |  | ||||||
|         for (let i = 0; i < 5; i++) { |         for (let i = 0; i < 5; i++) { | ||||||
|             await this.fetchImages("image:" + i, datapath) |             await this.fetchImages("image:" + i, datapath, refresh) | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  | @ -73,25 +73,55 @@ export default class GenerateImageAnalysis extends Script { | ||||||
|         if (!image.match(/https:\/\/i\.imgur\.com\/[a-zA-Z0-9]+\.jpg/)) { |         if (!image.match(/https:\/\/i\.imgur\.com\/[a-zA-Z0-9]+\.jpg/)) { | ||||||
|             return false |             return false | ||||||
|         } |         } | ||||||
|         const targetPath = datapath + "/" + image.replace(/[\/:.\-%]/g, "_") + ".json" |         const filename = image.replace(/[\/:.\-%]/g, "_") + ".json" | ||||||
|  |         const targetPath = datapath + "/" + filename | ||||||
|         if (fs.existsSync(targetPath)) { |         if (fs.existsSync(targetPath)) { | ||||||
|             return false |             return false | ||||||
|         } |         } | ||||||
|         const attribution = await Imgur.singleton.DownloadAttribution(image) |         const attribution = await Imgur.singleton.DownloadAttribution(image) | ||||||
|  | 
 | ||||||
|  |         if ((attribution.artist ?? "") === "") { | ||||||
|  |             // This is an invalid attribution. We save the raw response as well
 | ||||||
|  |             const hash = image.substr("https://i.imgur.com/".length).split(".jpg")[0] | ||||||
|  | 
 | ||||||
|  |             const apiUrl = "https://api.imgur.com/3/image/" + hash | ||||||
|  |             const response = await Utils.downloadJsonCached(apiUrl, 365 * 24 * 60 * 60, { | ||||||
|  |                 Authorization: "Client-ID " + Constants.ImgurApiKey, | ||||||
|  |             }) | ||||||
|  |             const rawTarget = datapath + "/raw/" + filename | ||||||
|  |             console.log("Also storing the raw response to", rawTarget) | ||||||
|  |             await fs.writeFileSync(rawTarget, JSON.stringify(response, null, "    ")) | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|         await fs.writeFileSync(targetPath, JSON.stringify(attribution, null, "    ")) |         await fs.writeFileSync(targetPath, JSON.stringify(attribution, null, "    ")) | ||||||
|         return true |         return true | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     async downloadMetadata(datapath: string): Promise<void> { |     loadImageUrls(datapath: string): { allImages: Set<string>, imageSource: Map<string, string> } { | ||||||
|         const features = this.loadData(datapath) |  | ||||||
|         let allImages = new Set<string>() |         let allImages = new Set<string>() | ||||||
|  |         const features = this.loadData(datapath) | ||||||
|  |         let imageSource: Map<string, string> = new Map<string, string>() | ||||||
| 
 | 
 | ||||||
|         for (const feature of features) { |         for (const feature of features) { | ||||||
|             allImages.add(feature.properties["image"]) |             allImages.add(feature.properties["image"]) | ||||||
|  |             imageSource[feature.properties["image"]] = feature.properties.id | ||||||
|  |             allImages.add(feature.properties["image:streetsign"]) | ||||||
|  |             imageSource[feature.properties["image:streetsign"]] = feature.properties.id + " (streetsign)" | ||||||
|  | 
 | ||||||
|             for (let i = 0; i < 10; i++) { |             for (let i = 0; i < 10; i++) { | ||||||
|                 allImages.add(feature.properties["image:" + i]) |                 allImages.add(feature.properties["image:" + i]) | ||||||
|  |                 imageSource[feature.properties["image:" + i]] = `${feature.properties.id} (image:${i})` | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |         allImages.delete(undefined) | ||||||
|  |         allImages.delete(null) | ||||||
|  |         imageSource.delete(undefined) | ||||||
|  |         imageSource.delete(null) | ||||||
|  |         return {allImages, imageSource} | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     async downloadMetadata(datapath: string): Promise<void> { | ||||||
|  |         const {allImages, imageSource} = this.loadImageUrls(datapath) | ||||||
|         console.log("Detected", allImages.size, "images") |         console.log("Detected", allImages.size, "images") | ||||||
|         let i = 0 |         let i = 0 | ||||||
|         let d = 0 |         let d = 0 | ||||||
|  | @ -113,10 +143,9 @@ export default class GenerateImageAnalysis extends Script { | ||||||
|                 } downloaded: ${d},skipped: ${s}, failed: ${f}, running: ${Math.floor( |                 } downloaded: ${d},skipped: ${s}, failed: ${f}, running: ${Math.floor( | ||||||
|                     runningSecs |                     runningSecs | ||||||
|                 )}sec, ETA: ${estimatedActualMinutes}:${estimatedActualSeconds % 60}` |                 )}sec, ETA: ${estimatedActualMinutes}:${estimatedActualSeconds % 60}` | ||||||
|                 ScriptUtils.erasableLog( |                 if (d + f % 1000 === 1 || downloaded) { | ||||||
|                     "                                                                                                              ", |                     ScriptUtils.erasableLog(msg) | ||||||
|                     msg |                 } | ||||||
|                 ) |  | ||||||
|                 if (downloaded) { |                 if (downloaded) { | ||||||
|                     d++ |                     d++ | ||||||
|                 } else { |                 } else { | ||||||
|  | @ -124,10 +153,80 @@ export default class GenerateImageAnalysis extends Script { | ||||||
|                 } |                 } | ||||||
|                 if (d + f == 75000) { |                 if (d + f == 75000) { | ||||||
|                     console.log("Used 75000 API calls, leaving 5000 for the rest of the day...") |                     console.log("Used 75000 API calls, leaving 5000 for the rest of the day...") | ||||||
|  |                     break | ||||||
|  |                 } | ||||||
|  |             } catch (e) { | ||||||
|  |                 // console.log(e)
 | ||||||
|  |                 console.log("Offending image hash is", image, "from https://openstreetmap.org/" + imageSource[image]) | ||||||
|  |                 f++ | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     async downloadImage(url: string, imagePath: string): Promise<boolean> { | ||||||
|  |         const filenameLong = url.replace(/[\/:.\-%]/g, "_") + ".jpg" | ||||||
|  |         const targetPathLong = imagePath + "/" + filenameLong | ||||||
|  | 
 | ||||||
|  |         const filename = url.substring("https://i.imgur.com/".length) | ||||||
|  |         const targetPath = imagePath + "/" + filename | ||||||
|  |         if (fs.existsSync(targetPathLong)) { | ||||||
|  |             if (fs.existsSync(targetPath)) { | ||||||
|  |                 fs.unlinkSync(targetPathLong) | ||||||
|  |                 console.log("Unlinking duplicate") | ||||||
|  |                 return false | ||||||
|  |             } | ||||||
|  |             console.log("Renaming...") | ||||||
|  |             fs.renameSync(targetPathLong, targetPath) | ||||||
|  |             return false | ||||||
|  |         } | ||||||
|  |         if (fs.existsSync(targetPath)) { | ||||||
|  |             return false | ||||||
|  |         } | ||||||
|  |         await ScriptUtils.DownloadFileTo(url, targetPath) | ||||||
|  |         return true | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     async downloadAllImages(datapath: string, imagePath: string): Promise<void> { | ||||||
|  |         const {allImages} = this.loadImageUrls(datapath) | ||||||
|  |         let skipped = 0 | ||||||
|  |         let failed = 0 | ||||||
|  |         let downloaded = 0 | ||||||
|  |         let invalid = 0 | ||||||
|  |         const startTime = Date.now() | ||||||
|  |         const urls = Array.from(allImages).filter(url => url.startsWith("https://i.imgur.com")) | ||||||
|  |         for (const url of urls) { | ||||||
|  |             const runningTime = ((Date.now()) - startTime) / 1000 | ||||||
|  |             const handled = skipped + downloaded + failed | ||||||
|  |             const itemsLeft = allImages.size - handled | ||||||
|  |             const speed = handled / runningTime | ||||||
|  |             const timeLeft = Math.round(itemsLeft * speed) | ||||||
|  |             try { | ||||||
|  |                 const downloadedStatus = await Promise.all(url.split(";").map(url => | ||||||
|  |                     this.downloadImage(url.trim(), imagePath), | ||||||
|  |                 )) | ||||||
|  | 
 | ||||||
|  |                 for (const b of downloadedStatus) { | ||||||
|  |                     if (b) { | ||||||
|  |                         downloaded += 1 | ||||||
|  |                     } else { | ||||||
|  |                         skipped += 1 | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  | 
 | ||||||
|  |                 if (downloadedStatus.some(i => i) || skipped % 10000 === 0) { | ||||||
|  | 
 | ||||||
|  |                     console.log("Handled", url, JSON.stringify({ | ||||||
|  |                         skipped, | ||||||
|  |                         failed, | ||||||
|  |                         downloaded, | ||||||
|  |                         invalid, | ||||||
|  |                         total: allImages.size, | ||||||
|  |                         eta: timeLeft + "s" | ||||||
|  |                     })) | ||||||
|                 } |                 } | ||||||
|             } catch (e) { |             } catch (e) { | ||||||
|                 console.log(e) |                 console.log(e) | ||||||
|                 f++ |                 failed++ | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  | @ -220,7 +319,7 @@ export default class GenerateImageAnalysis extends Script { | ||||||
|             ...Array.from(licenseByAuthor.get("CC-BY-SA 4.0").values()), |             ...Array.from(licenseByAuthor.get("CC-BY-SA 4.0").values()), | ||||||
|         ] |         ] | ||||||
| 
 | 
 | ||||||
|         console.log("Total number of correctly licenses pictures: ", totalLicensedImages) |         console.log("Total number of correctly licenses pictures: ", totalLicensedImages, "(out of ", files.length, " images)") | ||||||
|         console.log("Total number of authors:", byAuthor.size) |         console.log("Total number of authors:", byAuthor.size) | ||||||
|         console.log( |         console.log( | ||||||
|             "Total number of authors which used a valid, non CC0 license at one point in time", |             "Total number of authors which used a valid, non CC0 license at one point in time", | ||||||
|  | @ -230,10 +329,15 @@ export default class GenerateImageAnalysis extends Script { | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     async main(args: string[]): Promise<void> { |     async main(args: string[]): Promise<void> { | ||||||
|  |         console.log("Usage: [--cached] to use the cached osm data") | ||||||
|  |         console.log("Args are", args) | ||||||
|  |         const cached = args.indexOf("--cached") < 0 | ||||||
|  |         args = args.filter(a => a !== "--cached") | ||||||
|         const datapath = args[0] ?? "../../git/MapComplete-data/ImageLicenseInfo" |         const datapath = args[0] ?? "../../git/MapComplete-data/ImageLicenseInfo" | ||||||
|         await this.downloadData(datapath) |         await this.downloadData(datapath, cached) | ||||||
| 
 | 
 | ||||||
|         await this.downloadMetadata(datapath) |         await this.downloadMetadata(datapath) | ||||||
|  |         await this.downloadAllImages(datapath, "/home/pietervdvn/data/imgur-image-backup") | ||||||
|         this.analyze(datapath) |         this.analyze(datapath) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue