Merge develop

This commit is contained in:
Pieter Vander Vennet 2024-02-22 14:59:32 +01:00
commit 59356545ed
68 changed files with 1670 additions and 67 deletions

View file

@ -69,6 +69,30 @@ interface CategoryQueryAPIResponse {
}
}
/**
 * Shape of the JSON response to the `action=query&prop=images` API request,
 * which `main` issues for URLs that are neither a category nor a `File:` page.
 */
interface ImagesQueryAPIResponse {
// NOTE(review): presumably the API's continuation tokens. MediaWiki normally only
// includes this object when more results are available, so it may need to be optional — confirm.
continue: {
imcontinue: string
continue: string
}
query: {
// Optional list of from/to title pairs (presumably title normalisations applied by the API — confirm)
normalized?: {
from: string
to: string
}[]
// Pages of the result, keyed by a string (presumably the page id — confirm); `main` only reads the first key
pages: {
[key: string]: {
pageid: number
ns: number
title: string
// May be absent; `main` treats a missing list as "this page contains no images"
images?: {
ns: number
title: string
}[]
}
}
}
}
interface TemplateQueryAPIResponse {
batchcomplete: string
query: {
@ -96,13 +120,14 @@ const licenseMapping = {}
// Maps Wikimedia Commons template names (including the "Template:" prefix)
// to the human-readable name of the license that template stands for
const templateMapping = {
"Template:PD": "Public Domain",
"Template:CC0": "CC0 1.0",
}
async function main(args: string[]) {
if (args.length < 2) {
console.log("Usage: downloadCommons.ts <output folder> <url> <?url> <?url> .. ")
console.log(
"Example: npx vite-node downloadCommons.ts -- assets/svg https://commons.wikimedia.org/wiki/File:Example.jpg"
"Example: npx vite-node scripts/downloadCommons.ts -- assets/svg https://commons.wikimedia.org/wiki/File:Example.jpg"
)
process.exit(1)
}
@ -128,8 +153,24 @@ async function main(args: string[]) {
for (const member of apiDetails.query.categorymembers) {
await downloadImage(member.title, outputFolder, baseUrl)
}
} else {
} else if (url.includes("File:")) {
await downloadImage(commonsFileName, outputFolder, baseUrl)
} else {
// Probably a page url, try to get all images from the page
const apiUrl = `${baseUrl}/w/api.php?action=query&format=json&prop=images&titles=${commonsFileName}&imlimit=250`
const response = await fetch(apiUrl)
const apiDetails: ImagesQueryAPIResponse = await response.json()
const page = apiDetails.query.pages[Object.keys(apiDetails.query.pages)[0]]
if (page.images) {
for (const image of page.images) {
await downloadImage(image.title, outputFolder, baseUrl)
}
} else {
console.log(
"\x1b[31m%s\x1b[0m",
`URL ${url} doesn't seem to contain any images! Skipping...`
)
}
}
} else {
console.log(
@ -154,6 +195,12 @@ async function downloadImage(filename: string, outputFolder: string, baseUrl: st
const apiDetails: ImageQueryAPIResponse = await response.json()
const missingPage = apiDetails.query.pages["-1"]
// Check if the local file already exists, if it does, skip it
if (existsSync(`${outputFolder}/${filename}`)) {
console.log(`\x1b[33m%s\x1b[0m`, `${filename} already exists, skipping...`)
return
}
// Check if the file exists, locally or externally
if (missingPage !== undefined) {
// Image does not exist locally, check if it exists externally
@ -271,8 +318,8 @@ async function downloadImage(filename: string, outputFolder: string, baseUrl: st
// Save the license information
const licenseInfo: SmallLicense = {
path: cleanFileName,
license: licenseMapping[license] || license,
authors: [author],
license: licenseMapping[license] || license.replace("CC BY", "CC-BY"),
authors: [removeLinks(author)],
sources: [wikiUrl],
}
@ -293,4 +340,9 @@ async function downloadImage(filename: string, outputFolder: string, baseUrl: st
}
}
/**
 * Strips HTML anchor elements from a text, keeping only what was between the tags.
 *
 * Only real `<a>` elements are unwrapped: other tags whose name merely starts with "a"
 * (such as `<abbr>` or `<address>`) are left untouched. Matching is case-insensitive
 * and works when the tag or the link text spans multiple lines.
 *
 * @param text text which might contain `<a ...>...</a>` elements
 * @returns the text with every anchor replaced by its inner content
 */
function removeLinks(text: string): string {
    // `\b` makes sure the tag name is exactly "a"; `[^>]*` and `[\s\S]*?` also match newlines, unlike `.`
    return text.replace(/<a\b[^>]*>([\s\S]*?)<\/a>/gi, "$1")
}
// Script entry point: hand `main` the command line arguments, dropping the first two `process.argv` entries
main(process.argv.slice(2))

View file

@ -5,7 +5,7 @@ import ScriptUtils from "./ScriptUtils"
import Script from "./Script"
// NOTE(review): presumably the language codes this script recognises — usage is not visible here, confirm
const knownLanguages = ["en", "nl", "de", "fr", "es", "gl", "ca"]
// Object keys listed here are skipped by the key loop in TranslationPart that checks `ignoreTerms.indexOf(key) >= 0`
const ignoreTerms = ["searchTerms"]
class TranslationPart {
contents: Map<string, TranslationPart | string> = new Map<string, TranslationPart | string>()
@ -49,6 +49,7 @@ class TranslationPart {
if (!translations.hasOwnProperty(translationsKey)) {
continue
}
const v = translations[translationsKey]
if (typeof v != "string") {
console.error(
@ -106,6 +107,9 @@ class TranslationPart {
if (!object.hasOwnProperty(key)) {
continue
}
if (ignoreTerms.indexOf(key) >= 0) {
continue
}
if (dontTranslateKeys?.indexOf(key) >= 0) {
continue

View file

@ -9,7 +9,7 @@ export class Server {
handle: {
mustMatch: string | RegExp
mimetype: string
handle: (path: string) => Promise<string>
handle: (path: string, queryParams: URLSearchParams) => Promise<string>
}[]
) {
handle.push({
@ -89,7 +89,7 @@ export class Server {
}
try {
const result = await handler.handle(path)
const result = await handler.handle(path, url.searchParams)
res.writeHead(200, { "Content-Type": handler.mimetype })
res.write(result)
res.end()

43
scripts/serverLdScrape.ts Normal file
View file

@ -0,0 +1,43 @@
import Script from "../scripts/Script"
import { Server } from "../scripts/server"
import { Utils } from "../src/Utils"
import parse from "node-html-parser"
/**
 * Script which starts a small HTTP server. The single handler (registered with
 * `mustMatch: "extractgraph"`) downloads the page given in the `url` query parameter and
 * answers with a JSON array containing every embedded `application/ld+json` snippet.
 */
class ServerLdScrape extends Script {
    constructor() {
        super("Starts a server which fetches a webpage and returns embedded LD+JSON")
    }

    /**
     * Starts the server.
     *
     * @param args command line arguments; `args[0]` is the optional port (defaults to 2346)
     * @throws Error if the given port is not an integer between 0 and 65535
     */
    async main(args: string[]): Promise<void> {
        const port = Number(args[0] ?? 2346)
        // `Number("abc")` is NaN: fail early with a readable message instead of handing NaN to the server
        if (!Number.isInteger(port) || port < 0 || port > 65535) {
            throw new Error(`Invalid port: ${args[0]}`)
        }
        new Server(port, {}, [
            {
                mustMatch: "extractgraph",
                mimetype: "application/ld+json",
                /**
                 * @param _path the requested path (unused)
                 * @param searchParams query parameters of the request; `url` is required
                 * @returns the stringified array of parsed LD+JSON snippets found in the page
                 * @throws Error if the `url` query parameter is missing or empty
                 */
                async handle(_path: string, searchParams: URLSearchParams): Promise<string> {
                    // `get` yields null when the parameter is absent; reject that here
                    // rather than passing null on to `Utils.download`
                    const url = searchParams.get("url")
                    if (!url) {
                        throw new Error("Missing required query parameter 'url'")
                    }
                    // NOTE(review): this downloads an arbitrary, caller-supplied URL;
                    // make sure the server is not reachable by untrusted clients
                    const dloaded = await Utils.download(url, {
                        "User-Agent":
                            "MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete",
                    })
                    const parsed = parse(dloaded)
                    const scripts = Array.from(parsed.getElementsByTagName("script"))
                    const snippets: unknown[] = []
                    for (const script of scripts) {
                        if (script.attributes["type"] !== "application/ld+json") {
                            continue
                        }
                        try {
                            snippets.push(JSON.parse(script.textContent))
                        } catch (e) {
                            // A single malformed snippet should not prevent returning the others
                            console.error("Could not parse an LD+JSON snippet from", url, e)
                        }
                    }
                    return JSON.stringify(snippets)
                },
            },
        ])
    }
}
new ServerLdScrape().run()