| 
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 |  |  | import Script from "../scripts/Script" | 
					
						
							| 
									
										
										
										
											2024-02-26 16:11:41 +01:00
										 |  |  | import { Server } from "./server" | 
					
						
							| 
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 |  |  | import parse from "node-html-parser" | 
					
						
							| 
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 |  |  | import ScriptUtils from "./ScriptUtils" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 |  |  | class ServerLdScrape extends Script { | 
					
						
							|  |  |  |     constructor() { | 
					
						
							|  |  |  |         super("Starts a server which fetches a webpage and returns embedded LD+JSON") | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 |  |  |     private static async attemptDownload(url: string) { | 
					
						
							|  |  |  |         const host = new URL(url).host | 
					
						
							| 
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 |  |  |         const random = Math.floor(Math.random() * 100) | 
					
						
							|  |  |  |         const random1 = Math.floor(Math.random() * 100) | 
					
						
							| 
									
										
										
										
											2024-04-30 15:59:19 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 |  |  |         const headers = [ | 
					
						
							|  |  |  |             { | 
					
						
							| 
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 |  |  |                 "User-Agent": `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.${random}.${random1} Safari/537.36`, | 
					
						
							|  |  |  |                 accept: "application/html", | 
					
						
							|  |  |  |             }, | 
					
						
							|  |  |  |             /* { | 
					
						
							| 
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 |  |  |                 "User-Agent": "MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete", | 
					
						
							|  |  |  |                 "accept": "application/html" | 
					
						
							|  |  |  |             }, | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 Host: host, | 
					
						
							|  |  |  |                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:122.0) Gecko/20100101 Firefox/122.0", | 
					
						
							| 
									
										
										
										
											2024-04-30 15:59:19 +02:00
										 |  |  |                 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,* /*;q=0.8", TODO remove space in * /* | 
					
						
							| 
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 |  |  |                 "Accept-Language": "en-US,en;q=0.5", | 
					
						
							|  |  |  |                 "Accept-Encoding": "gzip, deflate, br", | 
					
						
							|  |  |  |                 "Alt-Used": host, | 
					
						
							|  |  |  |                 DNT: 1, | 
					
						
							|  |  |  |                 "Sec-GPC": 1, | 
					
						
							|  |  |  |                 "Upgrade-Insecure-Requests": 1, | 
					
						
							|  |  |  |                 "Sec-Fetch-Dest": "document", | 
					
						
							|  |  |  |                 "Sec-Fetch-Mode": "navigate", | 
					
						
							|  |  |  |                 "Sec-Fetch-Site": "cross-site", | 
					
						
							|  |  |  |                 "Sec-Fetch-User":"?1", | 
					
						
							|  |  |  |                 "TE": "trailers", | 
					
						
							|  |  |  |                 Connection: "keep-alive" | 
					
						
							| 
									
										
										
										
											2024-04-30 15:59:19 +02:00
										 |  |  |             }*/ | 
					
						
							| 
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 |  |  |         ] | 
					
						
							|  |  |  |         for (let i = 0; i < headers.length; i++) { | 
					
						
							|  |  |  |             try { | 
					
						
							| 
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 |  |  |                 return await ScriptUtils.Download(url, headers[i], 10) | 
					
						
							| 
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 |  |  |             } catch (e) { | 
					
						
							| 
									
										
										
										
											2024-04-30 15:59:19 +02:00
										 |  |  |                 console.error("Could not download", url, "with headers", headers[i], "due to", e) | 
					
						
							| 
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 |  |  |     async main(args: string[]): Promise<void> { | 
					
						
							|  |  |  |         const port = Number(args[0] ?? 2346) | 
					
						
							| 
									
										
										
										
											2024-02-22 15:21:04 +01:00
										 |  |  |         const cache: Record<string, { date: Date; contents: any }> = {} | 
					
						
							| 
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 |  |  |         new Server(port, {}, [ | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 mustMatch: "extractgraph", | 
					
						
							|  |  |  |                 mimetype: "application/ld+json", | 
					
						
							| 
									
										
										
										
											2024-04-05 17:49:31 +02:00
										 |  |  |                 addHeaders: { | 
					
						
							| 
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 |  |  |                     "Cache-control": "max-age=3600, public", | 
					
						
							| 
									
										
										
										
											2024-04-05 17:49:31 +02:00
										 |  |  |                 }, | 
					
						
							| 
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 |  |  |                 async handle(content, searchParams: URLSearchParams) { | 
					
						
							|  |  |  |                     const url = searchParams.get("url") | 
					
						
							| 
									
										
										
										
											2024-04-05 17:49:31 +02:00
										 |  |  |                     console.log("URL", url) | 
					
						
							| 
									
										
										
										
											2024-02-22 15:21:04 +01:00
										 |  |  |                     if (cache[url] !== undefined) { | 
					
						
							|  |  |  |                         const { date, contents } = cache[url] | 
					
						
							|  |  |  |                         // In seconds
 | 
					
						
							| 
									
										
										
										
											2024-02-27 03:07:48 +01:00
										 |  |  |                         const tdiff = (new Date().getTime() - (date?.getTime() ?? 0)) / 1000 | 
					
						
							| 
									
										
										
										
											2024-04-30 15:59:19 +02:00
										 |  |  |                         if (tdiff < 31 * 24 * 60 * 60) { | 
					
						
							| 
									
										
										
										
											2024-02-27 03:07:48 +01:00
										 |  |  |                             return JSON.stringify(contents) | 
					
						
							| 
									
										
										
										
											2024-02-22 15:21:04 +01:00
										 |  |  |                         } | 
					
						
							| 
									
										
										
										
											2024-02-22 18:58:34 +01:00
										 |  |  |                     } | 
					
						
							| 
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 |  |  |                     let dloaded: { content: string } | { redirect: string } | "timeout" = { | 
					
						
							| 
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 |  |  |                         redirect: url, | 
					
						
							| 
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 |  |  |                     } | 
					
						
							| 
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 |  |  |                     do { | 
					
						
							| 
									
										
										
										
											2024-04-24 00:58:22 +02:00
										 |  |  |                         dloaded = await ServerLdScrape.attemptDownload(dloaded["redirect"]) | 
					
						
							| 
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 |  |  |                         if (dloaded === "timeout") { | 
					
						
							| 
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 |  |  |                             return '{"#":"timout reached"}' | 
					
						
							| 
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 |  |  |                         } | 
					
						
							| 
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 |  |  |                         if (dloaded === undefined) { | 
					
						
							| 
									
										
										
										
											2024-04-30 15:59:19 +02:00
										 |  |  |                             return undefined | 
					
						
							|  |  |  |                         } | 
					
						
							| 
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 |  |  |                     } while (dloaded["redirect"]) | 
					
						
							| 
									
										
										
										
											2024-04-05 17:49:31 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 |  |  |                     if (dloaded["content"].startsWith("{")) { | 
					
						
							| 
									
										
										
										
											2024-04-05 17:49:31 +02:00
										 |  |  |                         // This is probably a json
 | 
					
						
							|  |  |  |                         const snippet = JSON.parse(dloaded["content"]) | 
					
						
							|  |  |  |                         console.log("Snippet is", snippet) | 
					
						
							|  |  |  |                         cache[url] = { contents: snippet, date: new Date() } | 
					
						
							|  |  |  |                         return JSON.stringify(snippet) | 
					
						
							|  |  |  |                     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-03-01 00:50:00 +01:00
										 |  |  |                     const parsed = parse(dloaded["content"]) | 
					
						
							| 
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 |  |  |                     const scripts = Array.from(parsed.getElementsByTagName("script")) | 
					
						
							|  |  |  |                     for (const script of scripts) { | 
					
						
							|  |  |  |                         const tp = script.attributes["type"] | 
					
						
							|  |  |  |                         if (tp !== "application/ld+json") { | 
					
						
							|  |  |  |                             continue | 
					
						
							|  |  |  |                         } | 
					
						
							|  |  |  |                         try { | 
					
						
							| 
									
										
										
										
											2024-02-22 18:58:34 +01:00
										 |  |  |                             const snippet = JSON.parse(script.textContent) | 
					
						
							|  |  |  |                             snippet["@base"] = url | 
					
						
							| 
									
										
										
										
											2024-02-27 03:07:48 +01:00
										 |  |  |                             cache[url] = { contents: snippet, date: new Date() } | 
					
						
							| 
									
										
										
										
											2024-02-22 18:58:34 +01:00
										 |  |  | 
 | 
					
						
							|  |  |  |                             return JSON.stringify(snippet) | 
					
						
							| 
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 |  |  |                         } catch (e) { | 
					
						
							|  |  |  |                             console.error(e) | 
					
						
							|  |  |  |                         } | 
					
						
							|  |  |  |                     } | 
					
						
							| 
									
										
										
										
											2024-06-16 16:06:26 +02:00
										 |  |  |                 }, | 
					
						
							|  |  |  |             }, | 
					
						
							| 
									
										
										
										
											2024-02-22 14:59:05 +01:00
										 |  |  |         ]) | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | new ServerLdScrape().run() |