MapComplete/scripts/generateImageAnalysis.ts

467 lines
17 KiB
TypeScript
Raw Normal View History

2023-01-09 20:30:13 +01:00
import Script from "./Script"
2023-07-27 01:48:54 +02:00
import { Overpass } from "../src/Logic/Osm/Overpass"
import { RegexTag } from "../src/Logic/Tags/RegexTag"
import { ImmutableStore } from "../src/Logic/UIEventSource"
import { BBox } from "../src/Logic/BBox"
2023-01-09 20:30:13 +01:00
import * as fs from "fs"
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs"
2023-06-01 14:32:45 +02:00
import { Feature } from "geojson"
2023-01-09 20:30:13 +01:00
import ScriptUtils from "./ScriptUtils"
2023-07-27 01:48:54 +02:00
import { Imgur } from "../src/Logic/ImageProviders/Imgur"
import { LicenseInfo } from "../src/Logic/ImageProviders/LicenseInfo"
import { Utils } from "../src/Utils"
import Constants from "../src/Models/Constants"
2023-01-09 20:30:13 +01:00
export default class GenerateImageAnalysis extends Script {
2024-01-15 15:22:57 +01:00
/**
 * Upper bound for N in `image:N`-keys and `imageN` keys.
 *
 * The loops in this class start at `i = 0` and run while `i < maxImageIndex`,
 * so this bound is exclusive: the highest index that is actually handled is `maxImageIndex - 1`.
 * @private
 */
private static readonly maxImageIndex = 31
2023-01-09 20:30:13 +01:00
/**
 * Hands the help text of this script (a short description followed by the
 * expected positional arguments) to the `Script` base class.
 */
constructor() {
    super(
        "Downloads (from overpass) all tags which have an imgur-image; then analyses the licenses and downloads all the images\n" +
            "\n" +
            "Arguments:\n" +
            "Path to download the images to\n" +
            "Path to save the overview to"
    )
}
/**
 * Queries overpass (with `BBox.global` as bounding box) for all features where the value of `key`
 * matches an `i.imgur.com`-url and writes the resulting geojson to
 * `<datapath>/features_with_<key>.geojson`, where every `:` and `/` of the key is replaced by `_`.
 *
 * @param key the OSM-key to search for, e.g. `image` or `image:0`
 * @param datapath the directory in which the geojson-file is written
 * @param refresh if `false` and the target file already exists, nothing is queried nor written
 */
async fetchImages(key: string, datapath: string, refresh: boolean): Promise<void> {
    // The `g`-flag is needed to replace _every_ separator; without it only the first one is replaced
    const sanitizedKey = key.replace(/[:\/]/g, "_")
    const targetPath = `${datapath}/features_with_${sanitizedKey}.geojson`
    if (fs.existsSync(targetPath) && !refresh) {
        console.log("Skipping", key)
        return
    }

    const tag = new RegexTag(key, /^https:\/\/i.imgur.com\/.*$/i)
    const overpass = new Overpass(
        tag,
        [],
        Constants.defaultOverpassUrls[0], //"https://overpass.kumi.systems/api/interpreter",
        new ImmutableStore(500), // NOTE(review): presumably the timeout of the query - confirm against `Overpass`
        false
    )
    console.log("Starting query...")
    const data = await overpass.queryGeoJson(BBox.global)
    // The result is indexed as a pair: the geojson first, the timestamp of the data second
    const geojson = data[0]
    const timestamp = data[1]
    console.log(
        "Got data:",
        geojson.features.length,
        "items; timestamp:",
        timestamp.toISOString()
    )
    fs.writeFileSync(targetPath, JSON.stringify(geojson), "utf8")
    console.log("Written", targetPath)
}
/**
 * Downloads the features for all the image keys that are analysed:
 * `image`, `image:streetsign` and, for every index `i` below `maxImageIndex`, `image:<i>` and `image<i>`.
 * The keys are handled one after another.
 *
 * @param datapath the directory where the geojson-files are stored; created if it does not exist yet
 * @param refresh passed on to `fetchImages`: if `false`, already existing files are kept
 */
async downloadData(datapath: string, refresh: boolean): Promise<void> {
    if (!fs.existsSync(datapath)) {
        // `recursive` also creates missing parent directories instead of failing with ENOENT
        fs.mkdirSync(datapath, { recursive: true })
    }

    await this.fetchImages("image", datapath, refresh)
    await this.fetchImages("image:streetsign", datapath, refresh)
    for (let i = 0; i < GenerateImageAnalysis.maxImageIndex; i++) {
        await this.fetchImages("image:" + i, datapath, refresh)
        await this.fetchImages("image" + i, datapath, refresh)
    }
}
/**
 * Reads every `.geojson`-file listed by `ScriptUtils.readDirRecSync(datapath)`
 * and collects the entries of their `features`-lists into a single list.
 *
 * @param datapath the directory to read
 * @returns all the features of all the geojson-files; files without a `features`-list contribute nothing
 */
loadData(datapath: string): Feature[] {
    const allFeatures: Feature[] = []
    for (const file of ScriptUtils.readDirRecSync(datapath)) {
        if (!file.endsWith(".geojson")) {
            continue
        }
        const contents = JSON.parse(fs.readFileSync(file, "utf8"))
        const features = contents?.features
        if (!Array.isArray(features)) {
            // Not a feature collection (or an empty file): nothing to add
            continue
        }
        // Deliberately no `push(...features)`: spreading a very big list as arguments
        // can exceed the maximum call stack size
        for (const feature of features) {
            allFeatures.push(feature)
        }
    }
    return allFeatures
}
/**
 * Downloads the attribution of a single imgur-image and stores it as JSON in
 * `<datapath>/<sanitized url>.json`.
 * If the attribution has no artist, the raw response of the imgur-API is additionally saved
 * in `<datapath>/raw/`.
 *
 * @param datapath the directory where the metadata is saved
 * @param image the url of the image; must contain `https://i.imgur.com/<hash>.jpg`
 * @returns `true` if the attribution was downloaded and written;
 *          `false` if the image was skipped (no url, not an imgur-jpg or already on disk)
 */
async fetchImageMetadata(datapath: string, image: string): Promise<boolean> {
    if (image === undefined || image === null) {
        return false
    }
    const match = image.match(/https:\/\/i\.imgur\.com\/([a-zA-Z0-9]+)\.jpg/)
    if (!match) {
        return false
    }

    const filename = image.replace(/[\/:.\-%]/g, "_") + ".json"
    const targetPath = datapath + "/" + filename
    if (fs.existsSync(targetPath)) {
        return false
    }

    const attribution = await Imgur.singleton.DownloadAttribution(image)
    if ((attribution.artist ?? "") === "") {
        // This is an invalid attribution. We save the raw response as well
        // The hash is taken from the regex match, as the (unanchored) regex does not
        // guarantee that the url starts at the very first character
        const hash = match[1]
        const apiUrl = "https://api.imgur.com/3/image/" + hash
        const response = await Utils.downloadJsonCached(apiUrl, 365 * 24 * 60 * 60, {
            Authorization: "Client-ID " + Constants.ImgurApiKey,
        })
        const rawDirectory = datapath + "/raw"
        // Writing into a non-existing directory throws, so make sure it is there
        fs.mkdirSync(rawDirectory, { recursive: true })
        const rawTarget = rawDirectory + "/" + filename
        console.log("Also storing the raw response to", rawTarget)
        fs.writeFileSync(rawTarget, JSON.stringify(response, null, " "))
    }

    fs.writeFileSync(targetPath, JSON.stringify(attribution, null, " "))
    return true
}
2023-06-01 14:32:45 +02:00
/**
 * Collects the values of all image-tags (`image`, `image:streetsign`, `image:<i>` and `image<i>`
 * for every index `i` below `maxImageIndex`) of all the features returned by `loadData`.
 *
 * @param datapath the directory containing the geojson-files
 * @returns `allImages`: the distinct, non-empty tag values;
 *          `imageSource`: for every such value, the `id`-property of the (last seen) feature
 *          that uses it, suffixed with the key if the key is not `image`
 */
loadImageUrls(datapath: string): { allImages: Set<string>; imageSource: Map<string, string> } {
    const allImages = new Set<string>()
    const imageSource = new Map<string, string>()

    const register = (url: unknown, source: string): void => {
        if (typeof url !== "string" || url === "") {
            // The feature does not have this tag
            return
        }
        allImages.add(url)
        imageSource.set(url, source)
        // Historically, the sources were (only) stored as plain properties on the map-object
        // and are read back with `imageSource[url]`. This is kept so that such lookups keep working.
        // Names of Map-members (e.g. `get`) are skipped to avoid clobbering the map itself
        if (!(url in Map.prototype)) {
            imageSource[url] = source
        }
    }

    for (const feature of this.loadData(datapath)) {
        const properties = feature.properties
        if (properties === undefined || properties === null) {
            continue
        }
        const id = properties.id
        register(properties["image"], id)
        register(properties["image:streetsign"], id + " (streetsign)")
        for (let i = 0; i < GenerateImageAnalysis.maxImageIndex; i++) {
            register(properties["image:" + i], `${id} (image:${i})`)
            register(properties["image" + i], `${id} (image${i})`)
        }
    }

    return { allImages, imageSource }
}
/**
 * Downloads the attribution of every image found by `loadImageUrls`, one image at a time,
 * while printing progress information.
 * Stops as soon as 75000 downloads (successful or failed) were attempted,
 * in order to leave some API calls for the rest of the day.
 *
 * @param datapath the directory containing the geojson-files; the metadata is written there too
 */
async downloadMetadata(datapath: string): Promise<void> {
    const { allImages, imageSource } = this.loadImageUrls(datapath)
    const total = allImages.size
    console.log("Detected", total, "images")
    let i = 0 // number of images handled so far
    let d = 0 // downloaded
    let s = 0 // skipped
    let f = 0 // failed
    const start = Date.now()
    for (const image of Array.from(allImages)) {
        i++
        let downloaded = false
        try {
            downloaded = await this.fetchImageMetadata(datapath, image)
            if (downloaded) {
                d++
            } else {
                s++
            }
        } catch (e) {
            f++
            console.log("Could not download the metadata:", e instanceof Error ? e.message : e)
            console.log(
                "Offending image hash is",
                image,
                // The source might be stored as map-entry or as plain property on the map-object
                "from https://openstreetmap.org/" + (imageSource.get(image) ?? imageSource[image])
            )
        }

        // Skipped images cost (nearly) no time, so the ETA is based on the time per attempted download
        const attempts = d + f
        const runningSecs = (Date.now() - start) / 1000
        let eta = "unknown"
        if (attempts > 0) {
            const etaSeconds = Math.floor(((total - i) * runningSecs) / attempts)
            const etaMinutes = Math.floor(etaSeconds / 60)
            eta = `${etaMinutes}:${String(etaSeconds % 60).padStart(2, "0")}`
        }
        if (downloaded || i % 1000 === 1) {
            ScriptUtils.erasableLog(
                `${i}/${total} downloaded: ${d},skipped: ${s}, failed: ${f}, running: ${Math.floor(
                    runningSecs
                )}sec, ETA: ${eta}`
            )
        }

        // `>=` instead of an exact match: the budget must also hold if the counter jumps over the limit
        if (attempts >= 75000) {
            console.log("Used 75000 API calls, leaving 5000 for the rest of the day...")
            break
        }
    }
}
/**
 * Determines the `views`-count of every image found by `loadImageUrls` and writes them as
 * `<image>,<views>`-lines to `<datapath>/views.csv`.
 * The downloaded attribution of every image is cached in `<datapath>/views_<YYYY-MM-DD>/`,
 * so that a rerun on the same day does not download it again.
 * The csv is written every 100 handled images and once more at the end.
 *
 * @param datapath the directory containing the geojson-files; the views are written there too
 */
async downloadViews(datapath: string): Promise<void> {
    const { allImages } = this.loadImageUrls(datapath)
    const total = allImages.size
    console.log("Detected", total, "images")
    const results: [string, number][] = []
    const today = new Date().toISOString().substring(0, "YYYY-MM-DD".length)
    const viewDir = datapath + "/views_" + today
    if (!existsSync(viewDir)) {
        mkdirSync(viewDir, { recursive: true })
    }
    const targetpath = datapath + "/views.csv"
    const writeResults = (): void => {
        console.log("Writing views to", targetpath)
        fs.writeFileSync(targetpath, results.map((r) => r.join(",")).join("\n"))
    }

    let dloaded = 0
    let skipped = 0
    let err = 0
    for (const image of Array.from(allImages)) {
        const cachedView = viewDir + "/" + image.replace(/\//g, "_")
        let attribution: LicenseInfo
        if (existsSync(cachedView)) {
            attribution = JSON.parse(readFileSync(cachedView, "utf8"))
            skipped++
        } else {
            try {
                attribution = await Imgur.singleton.DownloadAttribution(image)
                // Wait a bit between two downloads
                await ScriptUtils.sleep(500)
                writeFileSync(cachedView, JSON.stringify(attribution))
                dloaded++
            } catch (e) {
                err++
                console.log(
                    "Could not download the attribution of",
                    image,
                    ":",
                    e instanceof Error ? e.message : e
                )
                continue
            }
        }
        results.push([image, attribution.views])

        const handled = dloaded + skipped + err
        // Based on the total number of handled images, to avoid logging on every cached image
        if (handled % 50 === 0) {
            console.log({
                dloaded,
                skipped,
                total,
                err,
                progress: Math.round((100 * handled) / total) + "%",
            })
        }
        if (handled % 100 === 0) {
            writeResults()
        }
    }
    writeResults()
}
/**
 * Makes sure that the image behind `url` is saved in `imagePath`, named after the part of the url
 * that follows `https://i.imgur.com/`.
 * A file with the long legacy name (the full url with special characters replaced by `_`)
 * is renamed to the short name, or removed if the short one exists as well.
 *
 * @param url the url of the image
 * @param imagePath the directory where the images are saved
 * @returns `true` if the image was downloaded, `false` if it was already on disk
 */
async downloadImage(url: string, imagePath: string): Promise<boolean> {
    const legacyPath = `${imagePath}/${url.replace(/[\/:.\-%]/g, "_")}.jpg`
    const shortPath = `${imagePath}/${url.substring("https://i.imgur.com/".length)}`
    const hasLegacyFile = fs.existsSync(legacyPath)
    const hasShortFile = fs.existsSync(shortPath)

    if (hasLegacyFile && hasShortFile) {
        // Both exist: the legacy file is superfluous
        fs.unlinkSync(legacyPath)
        console.log("Unlinking duplicate")
        return false
    }

    if (hasLegacyFile) {
        console.log("Renaming...")
        fs.renameSync(legacyPath, shortPath)
        return false
    }

    if (hasShortFile) {
        return false
    }

    await ScriptUtils.DownloadFileTo(url, shortPath)
    return true
}
/**
 * Downloads all the images found by `loadImageUrls` of which the tag value starts with
 * `https://i.imgur.com` into `imagePath`.
 * A tag value can contain multiple urls, separated by `;` or `,`; those are downloaded in parallel.
 *
 * @param datapath the directory containing the geojson-files
 * @param imagePath the directory where the images are saved
 */
async downloadAllImages(datapath: string, imagePath: string): Promise<void> {
    const { allImages } = this.loadImageUrls(datapath)
    const entries = Array.from(allImages).filter((url) => url.startsWith("https://i.imgur.com"))
    let skipped = 0
    let failed = 0
    let downloaded = 0
    // Tag values which are not imgur-urls and are thus not handled at all
    const invalid = allImages.size - entries.length
    const startTime = Date.now()
    let entriesHandled = 0

    for (const entry of entries) {
        let downloadedStatus: boolean[] = []
        try {
            const urls = entry
                .split(/[;,]/)
                .map((part) => part.trim())
                .filter((part) => part !== "")
            downloadedStatus = await Promise.all(
                urls.map((url) => this.downloadImage(url, imagePath))
            )
        } catch (e) {
            console.log(e)
            failed++
            entriesHandled++
            continue
        }
        entriesHandled++

        for (const wasDownloaded of downloadedStatus) {
            if (wasDownloaded) {
                downloaded += 1
            } else {
                skipped += 1
            }
        }

        if (downloadedStatus.some((b) => b) || skipped % 10000 === 0) {
            // Remaining time = remaining entries * average time per entry
            // (`entriesHandled` is at least one here, so this never divides by zero)
            const runningTime = (Date.now() - startTime) / 1000
            const secondsPerEntry = runningTime / entriesHandled
            const timeLeft = Math.round((entries.length - entriesHandled) * secondsPerEntry)
            console.log(
                "Handled",
                entry,
                JSON.stringify({
                    skipped,
                    failed,
                    downloaded,
                    invalid,
                    total: allImages.size,
                    eta: timeLeft + "s",
                })
            )
        }
    }
}
2023-01-09 20:30:13 +01:00
/**
 * Reads all the `.json`-files listed by `ScriptUtils.readDirRecSync(datapath)` as `LicenseInfo`
 * and prints statistics about them: a leaderboard of the (at most) 100 authors with the most
 * images, the number of images and authors per license and the median number of images per author.
 * Files without a license or without an artist are ignored.
 * The statistics are also written to `../../git/MapComplete-data/picture-leaderboard.json`.
 *
 * @param datapath the directory containing the metadata-files
 */
analyze(datapath: string) {
    const files = ScriptUtils.readDirRecSync(datapath)
    const byAuthor = new Map<string, string[]>()
    const byLicense = new Map<string, string[]>()
    // Maps a license onto the authors that used it
    const licenseByAuthor = new Map<string, Set<string>>()
    for (const file of files) {
        if (!file.endsWith(".json")) {
            continue
        }
        const attr = <LicenseInfo>JSON.parse(fs.readFileSync(file, { encoding: "utf8" }))
        const license = attr?.licenseShortName
        const artist = attr?.artist
        // JSON cannot contain `undefined`, but it can contain `null`
        if (license === undefined || license === null || artist === undefined || artist === null) {
            continue
        }

        if (!byAuthor.has(artist)) {
            byAuthor.set(artist, [])
        }
        byAuthor.get(artist).push(file)

        if (!byLicense.has(license)) {
            byLicense.set(license, [])
        }
        byLicense.get(license).push(file)

        if (!licenseByAuthor.has(license)) {
            licenseByAuthor.set(license, new Set<string>())
        }
        licenseByAuthor.get(license).add(artist)
    }

    const byLicenseCount = Utils.MapToObj(byLicense, (a) => a.length)
    const byAuthorCount = Utils.MapToObj(byAuthor, (a) => a.length)
    const licenseByAuthorCount = Utils.MapToObj(licenseByAuthor, (a) => a.size)

    const countsPerAuthor: number[] = Object.keys(byAuthorCount).map((k) => byAuthorCount[k])
    console.log(countsPerAuthor)
    // An explicit comparator is needed: the default sort compares numbers as strings (10 < 9)
    countsPerAuthor.sort((a, b) => a - b)
    const median = countsPerAuthor[Math.floor(countsPerAuthor.length / 2)]

    // The sort is stable, thus authors with the same count stay in their original order.
    // `slice` copes with less than 100 authors
    const topAuthors = Object.keys(byAuthorCount)
        .map((author) => ({ author, count: <number>byAuthorCount[author] }))
        .sort((a, b) => b.count - a.count)
        .slice(0, 100)
    const leaderboard: { rank: number; account: string; name: string; nrOfImages: number }[] = []
    topAuthors.forEach(({ author, count }, index) => {
        const rank = index + 1
        // Encodes spaces as `%20`, but also takes care of other special characters such as `?` and `#`
        const account = "https://openstreetmap.org/user/" + encodeURIComponent(author)
        leaderboard.push({
            rank,
            name: author,
            account,
            nrOfImages: count,
        })
        console.log("|", rank, "|", `[${author}](${account})`, "|", count, "|")
    })

    const totalAuthors = byAuthor.size
    let totalLicensedImages = 0
    for (const license in byLicenseCount) {
        totalLicensedImages += byLicenseCount[license]
    }

    // One entry per license; assigning to a single object would only keep the last license
    const licenseStats: Record<string, { license: string; total: number; authors: number }> = {}
    for (const license in byLicenseCount) {
        const total: number = byLicenseCount[license]
        const authors: number = licenseByAuthorCount[license]
        const picturePercentage = Math.floor((1000 * total) / totalLicensedImages) / 10
        const authorPercentage = Math.floor((1000 * authors) / totalAuthors) / 10
        console.log(
            `License ${license}: ${total} total pictures (${picturePercentage}%), ${authors} authors (${authorPercentage}%), ${Math.floor(
                total / authors
            )} images/author`
        )
        licenseStats[license] = {
            license,
            total,
            authors,
        }
    }

    // A set, as an author who used both licenses should only be counted once.
    // A license that was never used simply contributes no authors
    const nonDefaultAuthors = new Set<string>()
    for (const license of ["CC-BY 4.0", "CC-BY-SA 4.0"]) {
        for (const author of Array.from(licenseByAuthor.get(license) ?? [])) {
            nonDefaultAuthors.add(author)
        }
    }

    console.log(
        "Total number of correctly licenses pictures: ",
        totalLicensedImages,
        "(out of ",
        files.length,
        " images)"
    )
    console.log("Total number of authors:", byAuthor.size)
    console.log(
        "Total number of authors which used a valid, non CC0 license at one point in time",
        nonDefaultAuthors.size
    )
    console.log("Median contributions per author:", median)

    const json = {
        leaderboard,
        totalAuthors,
        byLicense: licenseStats,
        median,
        date: new Date().toISOString(),
    }
    writeFileSync(
        "../../git/MapComplete-data/picture-leaderboard.json",
        JSON.stringify(json),
        "utf8"
    )
}
/**
 * Entry point: downloads the features with images, the metadata of the images and the images
 * themselves; then prints and saves the analysis.
 *
 * @param args the command line arguments:
 *  - first positional argument: the directory where the images are downloaded to (required)
 *  - second positional argument: the directory for the geojson-files and metadata
 *    (default: `../../git/MapComplete-data/ImageLicenseInfo`)
 *  - `--cached` (anywhere): reuse already downloaded geojson-files instead of querying them again
 * @throws Error if no directory for the images is given
 */
async main(args: string[]): Promise<void> {
    console.log("Usage: [--cached] to use the cached osm data")
    console.log("Args are", args)
    // Without `--cached`, the osm data is downloaded again even if it already exists
    const refresh = !args.includes("--cached")
    const positionalArgs = args.filter((a) => a !== "--cached")
    const imageBackupPath = positionalArgs[0]
    const datapath = positionalArgs[1] ?? "../../git/MapComplete-data/ImageLicenseInfo"
    if (imageBackupPath === "" || imageBackupPath === undefined) {
        throw new Error("No imageBackup path specified")
    }

    await this.downloadData(datapath, refresh)
    // await this.downloadViews(datapath)
    await this.downloadMetadata(datapath)
    await this.downloadAllImages(datapath, imageBackupPath)
    this.analyze(datapath)
}
}
// Creates the script and starts it when this file is executed.
// NOTE(review): `run()` is not defined in this file; presumably it is inherited from `Script`
// and passes the command line arguments to `main` - confirm
new GenerateImageAnalysis().run()