Improve scraping server

Pieter Vander Vennet 2024-04-30 15:59:19 +02:00
parent bf523848fb
commit 5759d4f8a7
2 changed files with 20 additions and 7 deletions

@@ -91,6 +91,12 @@ export class Server {
         try {
             const result = await handler.handle(path, url.searchParams)
+            if (result === undefined) {
+                res.writeHead(500)
+                res.write("Could not fetch this website, probably blocked by them")
+                res.end()
+                return
+            }
             if (typeof result !== "string") {
                 console.error(
                     "Internal server error: handling",

@@ -4,26 +4,30 @@ import parse from "node-html-parser"
 import ScriptUtils from "./ScriptUtils"

 class ServerLdScrape extends Script {
     constructor() {
         super("Starts a server which fetches a webpage and returns embedded LD+JSON")
     }

     private static async attemptDownload(url: string) {
         const host = new URL(url).host
+        const random = Math.floor(Math.random()*100)
+        const random1 = Math.floor(Math.random()*100)
         const headers = [
             {
                 "User-Agent":
-                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36",
+                    `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.${random}.${random1} Safari/537.36`,
                 "accept": "application/html"
-            },
-            {
+            }
+            /* {
                 "User-Agent": "MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete",
                 "accept": "application/html"
             },
             {
                 Host: host,
                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:122.0) Gecko/20100101 Firefox/122.0",
-                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,* /*;q=0.8", TODO remove space in * /*
                 "Accept-Language": "en-US,en;q=0.5",
                 "Accept-Encoding": "gzip, deflate, br",
                 "Alt-Used": host,
@@ -36,7 +40,7 @@ class ServerLdScrape extends Script {
                 "Sec-Fetch-User":"?1",
                 "TE": "trailers",
                 Connection: "keep-alive"
-            }
+            }*/
         ]
         for (let i = 0; i < headers.length; i++) {
             try {
@@ -47,7 +51,7 @@ class ServerLdScrape extends Script {
                     10
                 )
             } catch (e) {
-                console.error("Could not download", url, "with headers", headers[i])
+                console.error("Could not download", url, "with headers", headers[i], "due to", e)
             }
         }
     }
@@ -70,7 +74,7 @@ class ServerLdScrape extends Script {
             console.log(">>>", date, contents)
             // In seconds
             const tdiff = (new Date().getTime() - (date?.getTime() ?? 0)) / 1000
-            if (tdiff < 24 * 60 * 60) {
+            if (tdiff < 31 * 24 * 60 * 60) {
                 return JSON.stringify(contents)
             }
         }
@@ -83,6 +87,9 @@ class ServerLdScrape extends Script {
             if (dloaded === "timeout") {
                 return "{\"#\":\"timout reached\"}"
             }
+            if (dloaded === undefined) {
+                return undefined
+            }
         } while (dloaded["redirect"])
         if (dloaded["content"].startsWith("{")) {