2024-02-22 14:59:05 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import Script from "../scripts/Script"
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 16:11:41 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								import { Server } from "./server"
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import parse from "node-html-parser"
							 | 
						
					
						
							
								
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								import ScriptUtils from "./ScriptUtils"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								class ServerLdScrape extends Script {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    constructor() {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        super("Starts a server which fetches a webpage and returns embedded LD+JSON")
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							
								
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    private static async attemptDownload(url: string) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        const host = new URL(url).host
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        const random = Math.floor(Math.random() * 100)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        const random1 = Math.floor(Math.random() * 100)
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-30 15:59:19 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        const headers = [
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            {
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                "User-Agent": `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.${random}.${random1} Safari/537.36`,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                accept: "application/html",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            },
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            /* {
							 | 
						
					
						
							
								
									
										
										
										
											2025-04-07 19:10:11 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                "User-Agent": "MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://source.mapcomplete.org/MapComplete",
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                "accept": "application/html"
							 | 
						
					
						
							
								
									
										
										
										
											2025-04-17 02:25:51 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            },*/
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        ]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for (let i = 0; i < headers.length; i++) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            try {
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                return await ScriptUtils.Download(url, headers[i], 10)
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            } catch (e) {
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-30 15:59:19 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                console.error("Could not download", url, "with headers", headers[i], "due to", e)
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    async main(args: string[]): Promise<void> {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        const port = Number(args[0] ?? 2346)
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-14 03:34:07 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        const start = new Date()
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 15:21:04 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        const cache: Record<string, { date: Date; contents: any }> = {}
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        new Server(port, {}, [
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-21 10:52:51 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                mustMatch: "status",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                mimetype: "application/json",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                handle: async () => {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    return JSON.stringify({
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        online: true,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        cached_entries: Object.keys(cache).length,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        booted: start,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        uptime: Math.floor((new Date().getTime() - start.getTime()) / 1000),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    })
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                },
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            },
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                mustMatch: "extractgraph",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                mimetype: "application/ld+json",
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-05 17:49:31 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                addHeaders: {
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    "Cache-control": "max-age=3600, public",
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-05 17:49:31 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                },
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                async handle(content, searchParams: URLSearchParams) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    const url = searchParams.get("url")
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-05 17:49:31 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    console.log("URL", url)
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 15:21:04 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    if (cache[url] !== undefined) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        const { date, contents } = cache[url]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        // In seconds
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-27 03:07:48 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        const tdiff = (new Date().getTime() - (date?.getTime() ?? 0)) / 1000
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-30 15:59:19 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        if (tdiff < 31 * 24 * 60 * 60) {
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-27 03:07:48 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            return JSON.stringify(contents)
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 15:21:04 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        }
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 18:58:34 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    }
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    let dloaded: { content: string } | { redirect: string } | "timeout" = {
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        redirect: url,
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    }
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    do {
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        dloaded = await ServerLdScrape.attemptDownload(dloaded["redirect"])
							 | 
						
					
						
							
								
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        if (dloaded === "timeout") {
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            return '{"#":"timout reached"}'
							 | 
						
					
						
							
								
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        }
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        if (dloaded === undefined) {
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-30 15:59:19 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            return undefined
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        }
							 | 
						
					
						
							
								
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    } while (dloaded["redirect"])
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-05 17:49:31 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    if (dloaded["content"].startsWith("{")) {
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-05 17:49:31 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                        // This is probably a json
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        const snippet = JSON.parse(dloaded["content"])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        console.log("Snippet is", snippet)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        cache[url] = { contents: snippet, date: new Date() }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        return JSON.stringify(snippet)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                    const parsed = parse(dloaded["content"])
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    const scripts = Array.from(parsed.getElementsByTagName("script"))
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    for (const script of scripts) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        const tp = script.attributes["type"]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        if (tp !== "application/ld+json") {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        try {
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 18:58:34 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            const snippet = JSON.parse(script.textContent)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            snippet["@base"] = url
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-27 03:07:48 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                            cache[url] = { contents: snippet, date: new Date() }
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 18:58:34 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            return JSON.stringify(snippet)
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        } catch (e) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                            console.error(e)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                        }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    }
							 | 
						
					
						
							
								
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                },
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            },
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								new ServerLdScrape().run()
							 |