From 5759d4f8a7a8348928f6208fc0a37bcab1b656db Mon Sep 17 00:00:00 2001 From: Pieter Vander Vennet Date: Tue, 30 Apr 2024 15:59:19 +0200 Subject: [PATCH] Improve scraping server --- scripts/server.ts | 6 ++++++ scripts/serverLdScrape.ts | 21 ++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/scripts/server.ts b/scripts/server.ts index 6f85d762eb..0b7d40af76 100644 --- a/scripts/server.ts +++ b/scripts/server.ts @@ -91,6 +91,12 @@ export class Server { try { const result = await handler.handle(path, url.searchParams) + if(result === undefined){ + res.writeHead(500) + res.write("Could not fetch this website, probably blocked by them") + res.end() + return + } if (typeof result !== "string") { console.error( "Internal server error: handling", diff --git a/scripts/serverLdScrape.ts b/scripts/serverLdScrape.ts index fd5123d096..094b3d2716 100644 --- a/scripts/serverLdScrape.ts +++ b/scripts/serverLdScrape.ts @@ -4,26 +4,30 @@ import parse from "node-html-parser" import ScriptUtils from "./ScriptUtils" class ServerLdScrape extends Script { + constructor() { super("Starts a server which fetches a webpage and returns embedded LD+JSON") } private static async attemptDownload(url: string) { const host = new URL(url).host + const random = Math.floor(Math.random()*100) + const random1 = Math.floor(Math.random()*100) + const headers = [ { "User-Agent": - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36", + `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.${random}.${random1} Safari/537.36`, "accept": "application/html" - }, - { + } + /* { "User-Agent": "MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete", "accept": "application/html" }, { Host: host, "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:122.0) Gecko/20100101 Firefox/122.0", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,* /*;q=0.8", TODO remove space in * /* "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate, br", "Alt-Used": host, @@ -36,7 +40,7 @@ class ServerLdScrape extends Script { "Sec-Fetch-User":"?1", "TE": "trailers", Connection: "keep-alive" - } + }*/ ] for (let i = 0; i < headers.length; i++) { try { @@ -47,7 +51,7 @@ class ServerLdScrape extends Script { 10 ) } catch (e) { - console.error("Could not download", url, "with headers", headers[i]) + console.error("Could not download", url, "with headers", headers[i], "due to", e) } } } @@ -70,7 +74,7 @@ class ServerLdScrape extends Script { console.log(">>>", date, contents) // In seconds const tdiff = (new Date().getTime() - (date?.getTime() ?? 0)) / 1000 - if (tdiff < 24 * 60 * 60) { + if (tdiff < 31 * 24 * 60 * 60) { return JSON.stringify(contents) } } @@ -83,6 +87,9 @@ class ServerLdScrape extends Script { if (dloaded === "timeout") { return "{\"#\":\"timout reached\"}" } + if(dloaded === undefined){ + return undefined + } } while (dloaded["redirect"]) if (dloaded["content"].startsWith("{")) {