MapComplete/scripts/serverLdScrape.ts

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

77 lines
3.3 KiB
TypeScript
Raw Normal View History

2024-02-22 14:59:05 +01:00
import Script from "../scripts/Script"
2024-02-26 16:11:41 +01:00
import { Server } from "./server"
2024-02-22 14:59:05 +01:00
import parse from "node-html-parser"
import ScriptUtils from "./ScriptUtils"
2024-02-22 14:59:05 +01:00
/**
 * Script which starts an HTTP server that downloads a webpage on behalf of the caller
 * and answers with the LD+JSON embedded in that page.
 *
 * The single registered route is matched on "extractgraph" and reads the page to scrape
 * from the `url` query parameter. Results are kept in an in-memory cache (per process,
 * lost on restart) for `cacheMaxAgeSeconds`.
 *
 * Every failure is reported as a JSON object with a single "#" key holding a message,
 * which is the shape the timeout case has always used.
 *
 * NOTE(review): the route downloads whatever URL the client supplies (server-side request
 * forgery risk). No allow-list or scheme check is applied here; confirm whether the
 * deployment restricts who can reach this server or what it can reach.
 */
class ServerLdScrape extends Script {
    /** How long, in seconds, a scraped result is answered from the in-memory cache. */
    private static readonly cacheMaxAgeSeconds = 24 * 60 * 60

    /** Maximum number of redirects followed for one request before giving up. */
    private static readonly maxRedirects = 10

    constructor() {
        super("Starts a server which fetches a webpage and returns embedded LD+JSON")
    }

    /**
     * Builds the JSON error body used by this server: an object with a single "#" key.
     */
    private static errorResponse(message: string): string {
        return JSON.stringify({ "#": message })
    }

    /**
     * Downloads `url` with `ScriptUtils.Download`, following the redirects it reports.
     *
     * @param url the address to download
     * @returns `{ content }` with the body of the final response, or `{ error }` with a
     * message when the download timed out, yielded no body, or redirected more than
     * `maxRedirects` times
     */
    private static async fetchFollowingRedirects(
        url: string
    ): Promise<{ content: string } | { error: string }> {
        let target = url
        // One initial request plus at most `maxRedirects` follow-ups; the bound prevents
        // a redirect cycle from keeping the request alive forever.
        for (let hop = 0; hop <= ServerLdScrape.maxRedirects; hop++) {
            const result: { content: string } | { redirect: string } | "timeout" =
                await ScriptUtils.Download(
                    target,
                    {
                        // Browser-like agent string. The original source carried a trailing note
                        // which looks like an earlier, self-identifying agent:
                        // MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete
                        "User-Agent":
                            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36",
                    },
                    // NOTE(review): meaning of this argument is defined by ScriptUtils.Download
                    // (not visible here), presumably a timeout or retry budget; confirm.
                    10
                )
            if (result === "timeout") {
                // Spelling kept as-is (sic) so clients matching on this message keep working
                return { error: "timout reached" }
            }
            if ("redirect" in result && result.redirect) {
                target = result.redirect
                continue
            }
            if ("content" in result && typeof result.content === "string") {
                return { content: result.content }
            }
            return { error: "no content received" }
        }
        return { error: "too many redirects" }
    }

    /**
     * Extracts the linked-data graph from a downloaded document.
     *
     * A body that starts with "{" (ignoring leading whitespace) is first tried as plain JSON
     * and returned untouched. Otherwise, or if that parse fails, the body is parsed as HTML
     * and the first `<script type="application/ld+json">` whose text parses to a non-null
     * object is returned, with "@base" set to `url`.
     *
     * @param content the downloaded document
     * @param url the address the document was requested from; written into "@base"
     * @returns the parsed graph, or `undefined` if nothing usable was found
     */
    private static extractGraph(content: string, url: string): Record<string, unknown> | undefined {
        if (content.trimStart().startsWith("{")) {
            // This is probably a json
            try {
                return JSON.parse(content)
            } catch (e) {
                // Not valid JSON after all: fall through and try it as HTML
                console.error("Response of", url, "looked like JSON but could not be parsed:", e)
            }
        }

        const scripts = Array.from(parse(content).getElementsByTagName("script"))
        for (const script of scripts) {
            if (script.attributes["type"] !== "application/ld+json") {
                continue
            }
            try {
                const snippet = JSON.parse(script.textContent)
                if (typeof snippet !== "object" || snippet === null) {
                    // "@base" cannot be attached to a primitive; try the next script
                    continue
                }
                snippet["@base"] = url
                return snippet
            } catch (e) {
                // A broken snippet must not prevent later ones from being tried
                console.error(e)
            }
        }
        return undefined
    }

    /**
     * Starts the server.
     *
     * @param args command line arguments; `args[0]` is the port to listen on (default 2346)
     */
    async main(args: string[]): Promise<void> {
        const port = Number(args[0] ?? 2346)
        // A Map instead of a plain object: the keys are client-supplied strings, and keys such
        // as "__proto__" or "constructor" would collide with inherited object properties.
        const cache = new Map<string, { date: Date; contents: unknown }>()

        new Server(port, {}, [
            {
                mustMatch: "extractgraph",
                mimetype: "application/ld+json",
                addHeaders: {
                    "Cache-control": "max-age=3600, public",
                },

                async handle(content, searchParams: URLSearchParams) {
                    const url = searchParams.get("url")
                    console.log("URL", url)
                    if (!url) {
                        // `get` yields null when the parameter is absent
                        return ServerLdScrape.errorResponse("no url given")
                    }

                    const cached = cache.get(url)
                    if (cached !== undefined) {
                        // In seconds
                        const ageSeconds = (Date.now() - cached.date.getTime()) / 1000
                        if (ageSeconds < ServerLdScrape.cacheMaxAgeSeconds) {
                            return JSON.stringify(cached.contents)
                        }
                        // Stale: drop it so it does not linger if the refresh below fails
                        cache.delete(url)
                    }

                    const downloaded = await ServerLdScrape.fetchFollowingRedirects(url)
                    if ("error" in downloaded) {
                        return ServerLdScrape.errorResponse(downloaded.error)
                    }

                    const graph = ServerLdScrape.extractGraph(downloaded.content, url)
                    if (graph === undefined) {
                        // Always answer with valid JSON instead of resolving to undefined
                        return ServerLdScrape.errorResponse("no LD+JSON found")
                    }
                    cache.set(url, { contents: graph, date: new Date() })
                    return JSON.stringify(graph)
                },
            },
        ])
    }
}
// Script entry point: instantiate the scraper server script and invoke the `run()` method
// it inherits from `Script` (presumably this forwards the command line arguments to `main`;
// verify against Script).
const ldScrapeScript = new ServerLdScrape()
ldScrapeScript.run()