forked from MapComplete/MapComplete
Improve scraping server
This commit is contained in:
parent
bf523848fb
commit
5759d4f8a7
2 changed files with 20 additions and 7 deletions
scripts
|
@ -91,6 +91,12 @@ export class Server {
|
|||
|
||||
try {
|
||||
const result = await handler.handle(path, url.searchParams)
|
||||
if(result === undefined){
|
||||
res.writeHead(500)
|
||||
res.write("Could not fetch this website, probably blocked by them")
|
||||
res.end()
|
||||
return
|
||||
}
|
||||
if (typeof result !== "string") {
|
||||
console.error(
|
||||
"Internal server error: handling",
|
||||
|
|
|
@ -4,26 +4,30 @@ import parse from "node-html-parser"
|
|||
import ScriptUtils from "./ScriptUtils"
|
||||
|
||||
class ServerLdScrape extends Script {
|
||||
|
||||
constructor() {
|
||||
super("Starts a server which fetches a webpage and returns embedded LD+JSON")
|
||||
}
|
||||
|
||||
private static async attemptDownload(url: string) {
|
||||
const host = new URL(url).host
|
||||
const random = Math.floor(Math.random()*100)
|
||||
const random1 = Math.floor(Math.random()*100)
|
||||
|
||||
const headers = [
|
||||
{
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36",
|
||||
`Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.${random}.${random1} Safari/537.36`,
|
||||
"accept": "application/html"
|
||||
},
|
||||
{
|
||||
}
|
||||
/* {
|
||||
"User-Agent": "MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete",
|
||||
"accept": "application/html"
|
||||
},
|
||||
{
|
||||
Host: host,
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:122.0) Gecko/20100101 Firefox/122.0",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,* /*;q=0.8", TODO remove space in * /*
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"Alt-Used": host,
|
||||
|
@ -36,7 +40,7 @@ class ServerLdScrape extends Script {
|
|||
"Sec-Fetch-User":"?1",
|
||||
"TE": "trailers",
|
||||
Connection: "keep-alive"
|
||||
}
|
||||
}*/
|
||||
]
|
||||
for (let i = 0; i < headers.length; i++) {
|
||||
try {
|
||||
|
@ -47,7 +51,7 @@ class ServerLdScrape extends Script {
|
|||
10
|
||||
)
|
||||
} catch (e) {
|
||||
console.error("Could not download", url, "with headers", headers[i])
|
||||
console.error("Could not download", url, "with headers", headers[i], "due to", e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -70,7 +74,7 @@ class ServerLdScrape extends Script {
|
|||
console.log(">>>", date, contents)
|
||||
// In seconds
|
||||
const tdiff = (new Date().getTime() - (date?.getTime() ?? 0)) / 1000
|
||||
if (tdiff < 24 * 60 * 60) {
|
||||
if (tdiff < 31 * 24 * 60 * 60) {
|
||||
return JSON.stringify(contents)
|
||||
}
|
||||
}
|
||||
|
@ -83,6 +87,9 @@ class ServerLdScrape extends Script {
|
|||
if (dloaded === "timeout") {
|
||||
return "{\"#\":\"timout reached\"}"
|
||||
}
|
||||
if(dloaded === undefined){
|
||||
return undefined
|
||||
}
|
||||
} while (dloaded["redirect"])
|
||||
|
||||
if (dloaded["content"].startsWith("{")) {
|
||||
|
|
Loading…
Add table
Reference in a new issue