2024-02-22 14:59:05 +01:00
|
|
|
import Script from "../scripts/Script"
|
2024-02-26 16:11:41 +01:00
|
|
|
import { Server } from "./server"
|
2024-02-22 14:59:05 +01:00
|
|
|
import { Utils } from "../src/Utils"
|
|
|
|
import parse from "node-html-parser"
|
|
|
|
/**
 * Script which starts a small HTTP server that downloads a webpage and returns
 * the first parseable `application/ld+json` snippet embedded in it.
 *
 * The server exposes a single handler matching `extractgraph`; the page to
 * scrape is passed via the `url` search parameter. Successful results are kept
 * in an in-memory cache for 24 hours.
 */
class ServerLdScrape extends Script {
    constructor() {
        super("Starts a server which fetches a webpage and returns embedded LD+JSON")
    }

    /**
     * Starts the server.
     *
     * @param args command line arguments; `args[0]` is the port to listen on (defaults to 2346)
     */
    async main(args: string[]): Promise<void> {
        const port = Number(args[0] ?? 2346)
        // Maximum age of a cache entry, in seconds
        const maxCacheAgeSeconds = 24 * 60 * 60
        // A Map (instead of a plain object) so that user-supplied URLs such as
        // "constructor" or "__proto__" cannot collide with Object.prototype members.
        // `contents` holds the already serialized response, so that a cache hit
        // returns exactly what the original request returned.
        const cache = new Map<string, { date: Date; contents: string }>()
        new Server(port, {}, [
            {
                mustMatch: "extractgraph",
                mimetype: "application/ld+json",
                /**
                 * Downloads the page given by the `url` search parameter and returns the
                 * first embedded LD+JSON snippet (serialized, with `@base` set to the url).
                 * Resolves to `undefined` if the page contains no parseable LD+JSON script.
                 */
                async handle(content, searchParams: URLSearchParams) {
                    const url = searchParams.get("url")
                    if (url === null) {
                        // Without an url there is nothing to download; fail with a clear message
                        // instead of passing `null` on to the downloader.
                        throw new Error("Missing required search parameter 'url'")
                    }

                    const cached = cache.get(url)
                    if (cached !== undefined) {
                        // In seconds
                        const tdiff = (new Date().getTime() - cached.date.getTime()) / 1000
                        if (tdiff < maxCacheAgeSeconds) {
                            return cached.contents
                        }
                    }

                    const dloaded = await Utils.download(url, {
                        "User-Agent":
                            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36", // MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete",
                    })
                    const parsed = parse(dloaded)
                    const scripts = Array.from(parsed.getElementsByTagName("script"))
                    for (const script of scripts) {
                        const tp = script.attributes["type"]
                        if (tp !== "application/ld+json") {
                            continue
                        }
                        try {
                            const snippet = JSON.parse(script.textContent)
                            snippet["@base"] = url
                            const contents = JSON.stringify(snippet)
                            // Store the timestamp together with the contents: the lookup above
                            // needs the date to decide whether the entry is still fresh.
                            cache.set(url, { date: new Date(), contents })
                            return contents
                        } catch (e) {
                            // A malformed snippet is logged and skipped; a later script
                            // element on the same page might still be parseable.
                            console.error(e)
                        }
                    }
                    // No usable LD+JSON script was found: falls through and resolves to undefined
                },
            },
        ])
    }
}
|
|
|
|
|
|
|
|
// Entry point: instantiate the script and invoke `run()`, which is not defined in
// ServerLdScrape itself and therefore comes from the `Script` base class.
// NOTE(review): presumably `run()` forwards the command line arguments to `main` — confirm in Script.
new ServerLdScrape().run()
|