import Script from "../scripts/Script"
import { Server } from "./server"
import parse from "node-html-parser"
import ScriptUtils from "./ScriptUtils"

/**
 * Script which starts an HTTP server exposing an `extractgraph` endpoint.
 *
 * The endpoint reads the `url` query parameter, downloads that webpage (following redirects),
 * and returns the first embedded `<script type="application/ld+json">` block that parses as JSON,
 * with `@base` set to the requested URL.
 * Successful results are cached in memory for 24 hours, keyed by the requested URL.
 */
class ServerLdScrape extends Script {
    /** How long a cached result may be served, in seconds. */
    private static readonly cacheLifetimeSeconds = 24 * 60 * 60
    /** Upper bound on redirects followed for one request; guards against redirect cycles. */
    private static readonly maxRedirects = 10
    /** Timeout handed to `ScriptUtils.Download` for every individual download attempt. */
    private static readonly downloadTimeout = 10

    constructor() {
        super("Starts a server which fetches a webpage and returns embedded LD+JSON")
    }

    /**
     * Starts the server.
     *
     * @param args command line arguments; `args[0]` is the port to listen on (defaults to 2346)
     */
    async main(args: string[]): Promise<void> {
        const port = Number(args[0] ?? 2346)
        // A Map (instead of a plain object) so that user-supplied URLs such as "__proto__"
        // cannot collide with properties inherited from Object.prototype.
        const cache = new Map<string, { date: Date; contents: unknown }>()
        new Server(port, {}, [
            {
                mustMatch: "extractgraph",
                mimetype: "application/ld+json",
                async handle(content, searchParams: URLSearchParams) {
                    const url = searchParams.get("url")
                    if (url === null || url === "") {
                        // Without a URL there is nothing to download; report this in the same
                        // style as the timeout response below instead of downloading `null`.
                        return JSON.stringify({ "#": "no url parameter given" })
                    }

                    const cached = cache.get(url)
                    if (cached !== undefined) {
                        console.log(">>>", cached.date, cached.contents)
                        // In seconds
                        const tdiff = (new Date().getTime() - cached.date.getTime()) / 1000
                        if (tdiff < ServerLdScrape.cacheLifetimeSeconds) {
                            return JSON.stringify(cached.contents)
                        }
                    }

                    // Follow redirects until actual content is received, a timeout occurs
                    // or the redirect budget is exhausted.
                    let target: string = url
                    let redirectsFollowed = 0
                    let html: string
                    while (true) {
                        const dloaded: { content: string } | { redirect: string } | "timeout" =
                            await ScriptUtils.Download(
                                target,
                                {
                                    "User-Agent":
                                        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36",
                                    // MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete",
                                },
                                ServerLdScrape.downloadTimeout
                            )
                        if (dloaded === "timeout") {
                            return "{\"#\":\"timout reached\"}"
                        }
                        const redirect = "redirect" in dloaded ? dloaded.redirect : undefined
                        if (!redirect) {
                            html = "content" in dloaded ? dloaded.content : ""
                            break
                        }
                        redirectsFollowed++
                        if (redirectsFollowed > ServerLdScrape.maxRedirects) {
                            return JSON.stringify({ "#": "too many redirects" })
                        }
                        target = redirect
                    }

                    const parsed = parse(html)
                    const scripts = Array.from(parsed.getElementsByTagName("script"))
                    for (const script of scripts) {
                        const tp = script.attributes["type"]
                        if (tp !== "application/ld+json") {
                            continue
                        }
                        try {
                            const snippet = JSON.parse(script.textContent)
                            snippet["@base"] = url
                            cache.set(url, { contents: snippet, date: new Date() })
                            return JSON.stringify(snippet)
                        } catch (e) {
                            // A malformed snippet is logged and skipped, so that a later,
                            // well-formed LD+JSON block on the same page can still be used.
                            console.error(e)
                        }
                    }
                    // No usable LD+JSON block was found on the page.
                    return undefined
                },
            },
        ])
    }
}

new ServerLdScrape().run()