forked from MapComplete/MapComplete
Attempts to scrape more data
This commit is contained in:
parent
5bebcf3825
commit
73db05f545
2 changed files with 54 additions and 14 deletions
|
@ -176,7 +176,9 @@ export default class ScriptUtils {
|
||||||
const requestPromise = new Promise((resolve, reject) => {
|
const requestPromise = new Promise((resolve, reject) => {
|
||||||
try {
|
try {
|
||||||
headers = headers ?? {}
|
headers = headers ?? {}
|
||||||
headers.accept ??= "application/json"
|
if(!headers.Accept){
|
||||||
|
headers.accept ??= "application/json"
|
||||||
|
}
|
||||||
console.log(" > ScriptUtils.Download(", url, ")")
|
console.log(" > ScriptUtils.Download(", url, ")")
|
||||||
const urlObj = new URL(url)
|
const urlObj = new URL(url)
|
||||||
const request = https.get(
|
const request = https.get(
|
||||||
|
|
|
@ -8,6 +8,50 @@ class ServerLdScrape extends Script {
|
||||||
super("Starts a server which fetches a webpage and returns embedded LD+JSON")
|
super("Starts a server which fetches a webpage and returns embedded LD+JSON")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Downloads `url` by trying several sets of request headers in order and
 * returning the result of the first `ScriptUtils.Download` call that does
 * not throw.
 *
 * The header sets are tried in this order:
 *  1. a Chrome-like User-Agent,
 *  2. the honest MapComplete scraper User-Agent,
 *  3. a full Firefox-like header set (including `Host`, `Sec-Fetch-*`, ...).
 *
 * @param url the absolute URL to fetch; it must be parseable by `new URL(url)`
 * @returns whatever `ScriptUtils.Download` resolves to for the first successful attempt
 * @throws Error if `ScriptUtils.Download` throws for every header set. Previously this
 *         method silently returned `undefined` in that case, which made the caller fail
 *         with an unrelated TypeError when reading `["redirect"]` of the result.
 */
private static async attemptDownload(url: string) {
    const host = new URL(url).host
    const headerSets = [
        {
            "User-Agent":
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36",
            // 'application/html' is not a registered media type; HTML is 'text/html'
            "accept": "text/html"
        },
        {
            "User-Agent": "MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete",
            "accept": "text/html"
        },
        {
            Host: host,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:122.0) Gecko/20100101 Firefox/122.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            // NOTE(review): this advertises compressed encodings; confirm that
            // ScriptUtils.Download decompresses the response body
            "Accept-Encoding": "gzip, deflate, br",
            "Alt-Used": host,
            DNT: 1,
            "Sec-GPC": 1,
            "Upgrade-Insecure-Requests": 1,
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "cross-site",
            "Sec-Fetch-User": "?1",
            "TE": "trailers",
            Connection: "keep-alive"
        }
    ]

    let lastError: unknown = undefined
    for (const headers of headerSets) {
        try {
            // The third argument is passed through unchanged from the original call site
            // (presumably a timeout, as the caller checks for a "timeout" result - TODO confirm)
            return await ScriptUtils.Download(url, headers, 10)
        } catch (e) {
            lastError = e
            console.error("Could not download", url, "with headers", headers, "due to", e)
        }
    }

    // Every attempt failed: report this explicitly instead of returning undefined
    const reason = lastError instanceof Error ? lastError.message : String(lastError)
    throw new Error("Could not download " + url + " with any of the header sets: " + reason)
}
|
||||||
|
|
||||||
async main(args: string[]): Promise<void> {
|
async main(args: string[]): Promise<void> {
|
||||||
const port = Number(args[0] ?? 2346)
|
const port = Number(args[0] ?? 2346)
|
||||||
const cache: Record<string, { date: Date; contents: any }> = {}
|
const cache: Record<string, { date: Date; contents: any }> = {}
|
||||||
|
@ -16,7 +60,7 @@ class ServerLdScrape extends Script {
|
||||||
mustMatch: "extractgraph",
|
mustMatch: "extractgraph",
|
||||||
mimetype: "application/ld+json",
|
mimetype: "application/ld+json",
|
||||||
addHeaders: {
|
addHeaders: {
|
||||||
"Cache-control": "max-age=3600, public",
|
"Cache-control": "max-age=3600, public"
|
||||||
},
|
},
|
||||||
async handle(content, searchParams: URLSearchParams) {
|
async handle(content, searchParams: URLSearchParams) {
|
||||||
const url = searchParams.get("url")
|
const url = searchParams.get("url")
|
||||||
|
@ -31,19 +75,13 @@ class ServerLdScrape extends Script {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let dloaded: { content: string } | { redirect: string } | "timeout" = {
|
let dloaded: { content: string } | { redirect: string } | "timeout" = {
|
||||||
redirect: url,
|
redirect: url
|
||||||
}
|
}
|
||||||
|
|
||||||
do {
|
do {
|
||||||
dloaded = await ScriptUtils.Download(
|
dloaded = await ServerLdScrape.attemptDownload(dloaded["redirect"])
|
||||||
dloaded["redirect"],
|
|
||||||
{
|
|
||||||
"User-Agent":
|
|
||||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36", // MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete",
|
|
||||||
},
|
|
||||||
10
|
|
||||||
)
|
|
||||||
if (dloaded === "timeout") {
|
if (dloaded === "timeout") {
|
||||||
return '{"#":"timout reached"}'
|
return "{\"#\":\"timout reached\"}"
|
||||||
}
|
}
|
||||||
} while (dloaded["redirect"])
|
} while (dloaded["redirect"])
|
||||||
|
|
||||||
|
@ -72,8 +110,8 @@ class ServerLdScrape extends Script {
|
||||||
console.error(e)
|
console.error(e)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
},
|
}
|
||||||
])
|
])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue