Improve scraping server

This commit is contained in:
Pieter Vander Vennet 2024-04-30 15:59:19 +02:00
parent bf523848fb
commit 5759d4f8a7
2 changed files with 20 additions and 7 deletions

View file

@ -91,6 +91,12 @@ export class Server {
try {
const result = await handler.handle(path, url.searchParams)
if(result === undefined){
res.writeHead(500)
res.write("Could not fetch this website, probably blocked by them")
res.end()
return
}
if (typeof result !== "string") {
console.error(
"Internal server error: handling",

View file

@ -4,26 +4,30 @@ import parse from "node-html-parser"
import ScriptUtils from "./ScriptUtils"
class ServerLdScrape extends Script {
/**
 * Creates the script instance, forwarding a one-line description of what this
 * script does to the `Script` base-class constructor.
 * NOTE(review): presumably the base class uses this string as help/usage text — confirm in `Script`.
 */
constructor() {
super("Starts a server which fetches a webpage and returns embedded LD+JSON")
}
private static async attemptDownload(url: string) {
const host = new URL(url).host
const random = Math.floor(Math.random()*100)
const random1 = Math.floor(Math.random()*100)
const headers = [
{
"User-Agent":
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36",
`Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.${random}.${random1} Safari/537.36`,
"accept": "application/html"
},
{
}
/* {
"User-Agent": "MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete",
"accept": "application/html"
},
{
Host: host,
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:122.0) Gecko/20100101 Firefox/122.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
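"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,* /*;q=0.8", TODO: when re-enabling this block, remove the space in "* /*" — it is only there so the star-slash does not close the surrounding block comment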
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Alt-Used": host,
@ -36,7 +40,7 @@ class ServerLdScrape extends Script {
"Sec-Fetch-User":"?1",
"TE": "trailers",
Connection: "keep-alive"
}
}*/
]
for (let i = 0; i < headers.length; i++) {
try {
@ -47,7 +51,7 @@ class ServerLdScrape extends Script {
10
)
} catch (e) {
console.error("Could not download", url, "with headers", headers[i])
console.error("Could not download", url, "with headers", headers[i], "due to", e)
}
}
}
@ -70,7 +74,7 @@ class ServerLdScrape extends Script {
console.log(">>>", date, contents)
// In seconds
const tdiff = (new Date().getTime() - (date?.getTime() ?? 0)) / 1000
if (tdiff < 24 * 60 * 60) {
if (tdiff < 31 * 24 * 60 * 60) {
return JSON.stringify(contents)
}
}
@ -83,6 +87,9 @@ class ServerLdScrape extends Script {
if (dloaded === "timeout") {
return "{\"#\":\"timout reached\"}"
}
if(dloaded === undefined){
return undefined
}
} while (dloaded["redirect"])
if (dloaded["content"].startsWith("{")) {