From 73db05f5454f0d8021254c35041799e1d4b34ba7 Mon Sep 17 00:00:00 2001 From: Pieter Vander Vennet Date: Wed, 24 Apr 2024 00:58:22 +0200 Subject: [PATCH] Attempts to scrape more data --- scripts/ScriptUtils.ts | 4 ++- scripts/serverLdScrape.ts | 64 +++++++++++++++++++++++++++++++-------- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/scripts/ScriptUtils.ts b/scripts/ScriptUtils.ts index f7a5e0d59f..6393f75d43 100644 --- a/scripts/ScriptUtils.ts +++ b/scripts/ScriptUtils.ts @@ -176,7 +176,9 @@ export default class ScriptUtils { const requestPromise = new Promise((resolve, reject) => { try { headers = headers ?? {} - headers.accept ??= "application/json" + /* Only default the lowercase accept header when the caller has not already supplied a capitalised Accept header. */ if(!headers.Accept){ + headers.accept ??= "application/json" + } console.log(" > ScriptUtils.Download(", url, ")") const urlObj = new URL(url) const request = https.get( diff --git a/scripts/serverLdScrape.ts b/scripts/serverLdScrape.ts index a5bd6c8c32..fd5123d096 100644 --- a/scripts/serverLdScrape.ts +++ b/scripts/serverLdScrape.ts @@ -8,6 +8,50 @@ class ServerLdScrape extends Script { super("Starts a server which fetches a webpage and returns embedded LD+JSON") } + /** Downloads url via ScriptUtils.Download, trying each entry of the headers list below in order and returning the first result that does not throw; a throwing attempt is logged with console.error and the next entry is tried. NOTE(review): when every attempt throws, the loop falls through without a return, so the promise resolves to undefined, and main then reads dloaded.redirect on that value - confirm whether an explicit fallback is wanted. */ private static async attemptDownload(url: string) { + const host = new URL(url).host + const headers = [ + /* Attempt 1: Chrome-on-Linux user agent string. NOTE(review): application/html is presumably meant to be text/html - confirm. */ { + "User-Agent": + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36", + "accept": "application/html" + }, + /* Attempt 2: user agent naming the MapComplete scraper and a contact address. */ { + "User-Agent": "MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete", + "accept": "application/html" + }, + /* Attempt 3: Firefox-on-Windows user agent plus a fuller set of browser-style request headers; Host and Alt-Used carry the host parsed from url. NOTE(review): Accept-Encoding advertises gzip, deflate and br - verify that ScriptUtils.Download can decode compressed bodies. */ { + Host: host, + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:122.0) Gecko/20100101 Firefox/122.0", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br", + "Alt-Used": host, + DNT: 1, + "Sec-GPC": 1, + "Upgrade-Insecure-Requests": 1, + "Sec-Fetch-Dest": "document", 
"Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "cross-site", + "Sec-Fetch-User":"?1", + "TE": "trailers", + Connection: "keep-alive" + } + ] + for (let i = 0; i < headers.length; i++) { + try { + + return await ScriptUtils.Download( + url, + headers[i], + 10 + ) + } catch (e) { + console.error("Could not download", url, "with headers", headers[i]) + } + } + } + async main(args: string[]): Promise { const port = Number(args[0] ?? 2346) const cache: Record = {} @@ -16,7 +60,7 @@ class ServerLdScrape extends Script { mustMatch: "extractgraph", mimetype: "application/ld+json", addHeaders: { - "Cache-control": "max-age=3600, public", + "Cache-control": "max-age=3600, public" }, async handle(content, searchParams: URLSearchParams) { const url = searchParams.get("url") @@ -31,19 +75,13 @@ class ServerLdScrape extends Script { } } let dloaded: { content: string } | { redirect: string } | "timeout" = { - redirect: url, + redirect: url } + do { - dloaded = await ScriptUtils.Download( - dloaded["redirect"], - { - "User-Agent": - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36", // MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete", - }, - 10 - ) + /* Each pass of the do-while downloads the current redirect target via attemptDownload, which cycles through the header sets. */ dloaded = await ServerLdScrape.attemptDownload(dloaded["redirect"]) if (dloaded === "timeout") { - return '{"#":"timout reached"}' + return "{\"#\":\"timout reached\"}" } } while (dloaded["redirect"]) @@ -72,8 +110,8 @@ class ServerLdScrape extends Script { console.error(e) } } - }, - }, + } + } ]) } }