forked from MapComplete/MapComplete
		
	Scripts: Update imageAnalysis script to also back up the images
This commit is contained in:
		
							parent
							
								
									e5cc7eec71
								
							
						
					
					
						commit
						088fbe1d07
					
				
					 3 changed files with 156 additions and 48 deletions
				
			
		|  | @ -1,14 +1,15 @@ | |||
| import Script from "./Script" | ||||
| import { Overpass } from "../Logic/Osm/Overpass" | ||||
| import { RegexTag } from "../Logic/Tags/RegexTag" | ||||
| import { ImmutableStore } from "../Logic/UIEventSource" | ||||
| import { BBox } from "../Logic/BBox" | ||||
| import {Overpass} from "../Logic/Osm/Overpass" | ||||
| import {RegexTag} from "../Logic/Tags/RegexTag" | ||||
| import {ImmutableStore} from "../Logic/UIEventSource" | ||||
| import {BBox} from "../Logic/BBox" | ||||
| import * as fs from "fs" | ||||
| import { Feature } from "geojson" | ||||
| import {Feature} from "geojson" | ||||
| import ScriptUtils from "./ScriptUtils" | ||||
| import { Imgur } from "../Logic/ImageProviders/Imgur" | ||||
| import { LicenseInfo } from "../Logic/ImageProviders/LicenseInfo" | ||||
| import { Utils } from "../Utils" | ||||
| import {Imgur} from "../Logic/ImageProviders/Imgur" | ||||
| import {LicenseInfo} from "../Logic/ImageProviders/LicenseInfo" | ||||
| import {Utils} from "../Utils" | ||||
| import Constants from "../Models/Constants"; | ||||
| 
 | ||||
| export default class GenerateImageAnalysis extends Script { | ||||
|     constructor() { | ||||
|  | @ -17,9 +18,9 @@ export default class GenerateImageAnalysis extends Script { | |||
|         ) | ||||
|     } | ||||
| 
 | ||||
|     async fetchImages(key: string, datapath: string): Promise<void> { | ||||
|     async fetchImages(key: string, datapath: string, refresh: boolean): Promise<void> { | ||||
|         const targetPath = `${datapath}/features_with_${key.replace(/[:\/]/, "_")}.geojson` | ||||
|         if (fs.existsSync(targetPath)) { | ||||
|         if (fs.existsSync(targetPath) && !refresh) { | ||||
|             console.log("Skipping", key) | ||||
|             return | ||||
|         } | ||||
|  | @ -27,27 +28,26 @@ export default class GenerateImageAnalysis extends Script { | |||
|         const overpass = new Overpass( | ||||
|             tag, | ||||
|             [], | ||||
|             "https://overpass.kumi.systems/api/interpreter", | ||||
|             Constants.defaultOverpassUrls[0], //"https://overpass.kumi.systems/api/interpreter",
 | ||||
|             new ImmutableStore(500), | ||||
|             undefined, | ||||
|             false | ||||
|         ) | ||||
|         console.log("Starting query...") | ||||
|         const data = await overpass.queryGeoJson(BBox.global) | ||||
|         console.log("Got data: ", data[0].features.length) | ||||
|         console.log("Got data:", data[0].features.length, "items; timestamp:", data[1].toISOString()) | ||||
|         fs.writeFileSync(targetPath, JSON.stringify(data[0]), "utf8") | ||||
|         console.log("Written", targetPath) | ||||
|     } | ||||
| 
 | ||||
/**
 * Downloads the OSM features for every image key this script tracks
 * (`image`, `image:streetsign` and `image:0` … `image:4`) into `datapath`.
 *
 * @param datapath directory in which the per-key geojson files are stored; created if missing
 * @param refresh forwarded to `fetchImages`: when true, existing files are re-downloaded
 */
async downloadData(datapath: string, refresh: boolean): Promise<void> {
    if (!fs.existsSync(datapath)) {
        // recursive: a missing parent directory must not abort the whole run
        fs.mkdirSync(datapath, {recursive: true})
    }

    const keys = ["image", "image:streetsign"]
    for (let i = 0; i < 5; i++) {
        keys.push("image:" + i)
    }
    // Sequential on purpose: every key triggers its own Overpass query
    for (const key of keys) {
        await this.fetchImages(key, datapath, refresh)
    }
}
| 
 | ||||
|  | @ -73,25 +73,55 @@ export default class GenerateImageAnalysis extends Script { | |||
|         if (!image.match(/https:\/\/i\.imgur\.com\/[a-zA-Z0-9]+\.jpg/)) { | ||||
|             return false | ||||
|         } | ||||
|         const targetPath = datapath + "/" + image.replace(/[\/:.\-%]/g, "_") + ".json" | ||||
|         const filename = image.replace(/[\/:.\-%]/g, "_") + ".json" | ||||
|         const targetPath = datapath + "/" + filename | ||||
|         if (fs.existsSync(targetPath)) { | ||||
|             return false | ||||
|         } | ||||
|         const attribution = await Imgur.singleton.DownloadAttribution(image) | ||||
| 
 | ||||
|         if ((attribution.artist ?? "") === "") { | ||||
|             // This is an invalid attribution. We save the raw response as well
 | ||||
|             const hash = image.substr("https://i.imgur.com/".length).split(".jpg")[0] | ||||
| 
 | ||||
|             const apiUrl = "https://api.imgur.com/3/image/" + hash | ||||
|             const response = await Utils.downloadJsonCached(apiUrl, 365 * 24 * 60 * 60, { | ||||
|                 Authorization: "Client-ID " + Constants.ImgurApiKey, | ||||
|             }) | ||||
|             const rawTarget = datapath + "/raw/" + filename | ||||
|             console.log("Also storing the raw response to", rawTarget) | ||||
|             await fs.writeFileSync(rawTarget, JSON.stringify(response, null, "    ")) | ||||
|         } | ||||
| 
 | ||||
|         await fs.writeFileSync(targetPath, JSON.stringify(attribution, null, "    ")) | ||||
|         return true | ||||
|     } | ||||
| 
 | ||||
|     async downloadMetadata(datapath: string): Promise<void> { | ||||
|         const features = this.loadData(datapath) | ||||
/**
 * Collects every image value referenced by the features loaded from `datapath`.
 *
 * Looks at the `image`, `image:streetsign` and `image:0` … `image:9` properties
 * of every feature returned by `loadData`.
 *
 * @param datapath directory handed to `loadData`
 * @returns `allImages`: the distinct property values found;
 *          `imageSource`: for every value, the id of the (last) feature using it,
 *          suffixed with the key it was found under when that key is not plain `image`
 */
loadImageUrls(datapath: string): { allImages: Set<string>, imageSource: Map<string, string> } {
    const allImages = new Set<string>()
    const imageSource = new Map<string, string>()

    const register = (value: string | undefined | null, origin: string): void => {
        if (value === undefined || value === null) {
            // Feature does not carry this key; nothing to record
            return
        }
        allImages.add(value)
        imageSource.set(value, origin)
        // Compatibility: this map used to be filled via bracket assignment and is still
        // read as `imageSource[image]` elsewhere, so keep the plain property as well.
        imageSource[value] = origin
    }

    for (const feature of this.loadData(datapath)) {
        const properties = feature.properties
        register(properties["image"], properties.id)
        register(properties["image:streetsign"], properties.id + " (streetsign)")
        for (let i = 0; i < 10; i++) {
            register(properties["image:" + i], `${properties.id} (image:${i})`)
        }
    }
    return {allImages, imageSource}
}
| 
 | ||||
|     async downloadMetadata(datapath: string): Promise<void> { | ||||
|         const {allImages, imageSource} = this.loadImageUrls(datapath) | ||||
|         console.log("Detected", allImages.size, "images") | ||||
|         let i = 0 | ||||
|         let d = 0 | ||||
|  | @ -113,10 +143,9 @@ export default class GenerateImageAnalysis extends Script { | |||
|                 } downloaded: ${d},skipped: ${s}, failed: ${f}, running: ${Math.floor( | ||||
|                     runningSecs | ||||
|                 )}sec, ETA: ${estimatedActualMinutes}:${estimatedActualSeconds % 60}` | ||||
|                 ScriptUtils.erasableLog( | ||||
|                     "                                                                                                              ", | ||||
|                     msg | ||||
|                 ) | ||||
|                 if (d + f % 1000 === 1 || downloaded) { | ||||
|                     ScriptUtils.erasableLog(msg) | ||||
|                 } | ||||
|                 if (downloaded) { | ||||
|                     d++ | ||||
|                 } else { | ||||
|  | @ -124,10 +153,80 @@ export default class GenerateImageAnalysis extends Script { | |||
|                 } | ||||
|                 if (d + f == 75000) { | ||||
|                     console.log("Used 75000 API calls, leaving 5000 for the rest of the day...") | ||||
|                     break | ||||
|                 } | ||||
|             } catch (e) { | ||||
|                 // console.log(e)
 | ||||
|                 console.log("Offending image hash is", image, "from https://openstreetmap.org/" + imageSource[image]) | ||||
|                 f++ | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
/**
 * Stores a single imgur image in `imagePath`, named after the part of the url
 * that follows `https://i.imgur.com/`.
 *
 * Files that were saved earlier under the long, escaped-url name are migrated:
 * renamed to the short name, or removed if the short name already exists.
 *
 * @param url the image url; anything not starting with `https://i.imgur.com/` is skipped
 * @param imagePath directory in which the images are stored
 * @returns true if the image was freshly downloaded, false if it was skipped
 *          (already present, migrated from the old name, or not an imgur url)
 */
async downloadImage(url: string, imagePath: string): Promise<boolean> {
    const imgurPrefix = "https://i.imgur.com/"
    if (!url.startsWith(imgurPrefix)) {
        // Without this guard the fixed-length substring below yields a meaningless file name
        console.log("Not an imgur image, skipping", url)
        return false
    }

    const legacyPath = imagePath + "/" + url.replace(/[\/:.\-%]/g, "_") + ".jpg"
    const targetPath = imagePath + "/" + url.substring(imgurPrefix.length)
    const alreadyStored = fs.existsSync(targetPath)

    if (fs.existsSync(legacyPath)) {
        if (alreadyStored) {
            fs.unlinkSync(legacyPath)
            console.log("Unlinking duplicate")
        } else {
            console.log("Renaming...")
            fs.renameSync(legacyPath, targetPath)
        }
        return false
    }
    if (alreadyStored) {
        return false
    }

    await ScriptUtils.DownloadFileTo(url, targetPath)
    return true
}
| 
 | ||||
/**
 * Backs up all images whose value starts with `https://i.imgur.com` into `imagePath`.
 *
 * A value may hold several urls separated by `;`; those are downloaded in parallel.
 * Progress is logged whenever something was actually downloaded and whenever the
 * skip counter is a multiple of 10000.
 *
 * @param datapath directory handed to `loadImageUrls`
 * @param imagePath directory in which the images are stored
 */
async downloadAllImages(datapath: string, imagePath: string): Promise<void> {
    const {allImages} = this.loadImageUrls(datapath)
    let skipped = 0
    let failed = 0
    let downloaded = 0
    // NOTE(review): nothing visible ever increments this; kept so the logged JSON keeps its shape
    const invalid = 0
    const startTime = Date.now()
    const urls = Array.from(allImages).filter(url => url.startsWith("https://i.imgur.com"))

    for (const [index, url] of urls.entries()) {
        try {
            const downloadedStatus = await Promise.all(
                url.split(";").map(part => this.downloadImage(part.trim(), imagePath))
            )

            for (const wasDownloaded of downloadedStatus) {
                if (wasDownloaded) {
                    downloaded += 1
                } else {
                    skipped += 1
                }
            }

            if (downloadedStatus.some(i => i) || skipped % 10000 === 0) {
                // ETA = remaining entries / (entries per second). Measured over the entries
                // of `urls`, as those are the ones this loop actually walks.
                const elapsedSeconds = (Date.now() - startTime) / 1000
                const entriesDone = index + 1
                const entriesLeft = urls.length - entriesDone
                const timeLeft = Math.round((entriesLeft * elapsedSeconds) / entriesDone)
                console.log("Handled", url, JSON.stringify({
                    skipped,
                    failed,
                    downloaded,
                    invalid,
                    total: allImages.size,
                    eta: timeLeft + "s"
                }))
            }
        } catch (e) {
            console.log(e)
            failed++
        }
    }
}
|  | @ -141,7 +240,7 @@ export default class GenerateImageAnalysis extends Script { | |||
|             if (!file.endsWith(".json")) { | ||||
|                 continue | ||||
|             } | ||||
|             const attr = <LicenseInfo>JSON.parse(fs.readFileSync(file, { encoding: "utf8" })) | ||||
|             const attr = <LicenseInfo>JSON.parse(fs.readFileSync(file, {encoding: "utf8"})) | ||||
|             const license = attr.licenseShortName | ||||
| 
 | ||||
|             if (license === undefined || attr.artist === undefined) { | ||||
|  | @ -220,7 +319,7 @@ export default class GenerateImageAnalysis extends Script { | |||
|             ...Array.from(licenseByAuthor.get("CC-BY-SA 4.0").values()), | ||||
|         ] | ||||
| 
 | ||||
|         console.log("Total number of correctly licenses pictures: ", totalLicensedImages) | ||||
|         console.log("Total number of correctly licenses pictures: ", totalLicensedImages, "(out of ", files.length, " images)") | ||||
|         console.log("Total number of authors:", byAuthor.size) | ||||
|         console.log( | ||||
|             "Total number of authors which used a valid, non CC0 license at one point in time", | ||||
|  | @ -230,10 +329,15 @@ export default class GenerateImageAnalysis extends Script { | |||
|     } | ||||
| 
 | ||||
/**
 * Entry point: downloads the OSM data, the image metadata and the images themselves,
 * then runs the license analysis.
 *
 * @param args command line arguments:
 *   `--cached` reuses already downloaded OSM data instead of refreshing it;
 *   first positional argument: data directory (defaults to the MapComplete-data checkout);
 *   second positional argument: image backup directory
 */
async main(args: string[]): Promise<void> {
    console.log("Usage: [--cached] to use the cached osm data")
    console.log("Optional positional arguments: [datapath] [image backup path]")
    console.log("Args are", args)
    // The OSM data is refreshed unless the caller explicitly asks for the cached copy
    const refresh = !args.includes("--cached")
    const positional = args.filter(a => a !== "--cached")
    const datapath = positional[0] ?? "../../git/MapComplete-data/ImageLicenseInfo"
    const imagePath = positional[1] ?? "/home/pietervdvn/data/imgur-image-backup"

    await this.downloadData(datapath, refresh)
    await this.downloadMetadata(datapath)
    await this.downloadAllImages(datapath, imagePath)
    this.analyze(datapath)
}
| } | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue