forked from MapComplete/MapComplete
		
	Attempts to scrape more data
This commit is contained in:
		
							parent
							
								
									5bebcf3825
								
							
						
					
					
						commit
						73db05f545
					
				
					 2 changed files with 54 additions and 14 deletions
				
			
		|  | @ -176,7 +176,9 @@ export default class ScriptUtils { | ||||||
|         const requestPromise = new Promise((resolve, reject) => { |         const requestPromise = new Promise((resolve, reject) => { | ||||||
|             try { |             try { | ||||||
|                 headers = headers ?? {} |                 headers = headers ?? {} | ||||||
|  |                 if(!headers.Accept){ | ||||||
|                     headers.accept ??= "application/json" |                     headers.accept ??= "application/json" | ||||||
|  |                 } | ||||||
|                 console.log(" > ScriptUtils.Download(", url, ")") |                 console.log(" > ScriptUtils.Download(", url, ")") | ||||||
|                 const urlObj = new URL(url) |                 const urlObj = new URL(url) | ||||||
|                 const request = https.get( |                 const request = https.get( | ||||||
|  |  | ||||||
|  | @ -8,6 +8,50 @@ class ServerLdScrape extends Script { | ||||||
|         super("Starts a server which fetches a webpage and returns embedded LD+JSON") |         super("Starts a server which fetches a webpage and returns embedded LD+JSON") | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     private static async attemptDownload(url: string) { | ||||||
|  |         const host = new URL(url).host | ||||||
|  |         const headers = [ | ||||||
|  |             { | ||||||
|  |                 "User-Agent": | ||||||
|  |                     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36", | ||||||
|  |                 "accept": "application/html" | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "User-Agent": "MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete", | ||||||
|  |                 "accept": "application/html" | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 Host: host, | ||||||
|  |                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:122.0) Gecko/20100101 Firefox/122.0", | ||||||
|  |                 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", | ||||||
|  |                 "Accept-Language": "en-US,en;q=0.5", | ||||||
|  |                 "Accept-Encoding": "gzip, deflate, br", | ||||||
|  |                 "Alt-Used": host, | ||||||
|  |                 DNT: 1, | ||||||
|  |                 "Sec-GPC": 1, | ||||||
|  |                 "Upgrade-Insecure-Requests": 1, | ||||||
|  |                 "Sec-Fetch-Dest": "document", | ||||||
|  |                 "Sec-Fetch-Mode": "navigate", | ||||||
|  |                 "Sec-Fetch-Site": "cross-site", | ||||||
|  |                 "Sec-Fetch-User":"?1", | ||||||
|  |                 "TE": "trailers", | ||||||
|  |                 Connection: "keep-alive" | ||||||
|  |             } | ||||||
|  |         ] | ||||||
|  |         for (let i = 0; i < headers.length; i++) { | ||||||
|  |             try { | ||||||
|  | 
 | ||||||
|  |                 return await ScriptUtils.Download( | ||||||
|  |                     url, | ||||||
|  |                     headers[i], | ||||||
|  |                     10 | ||||||
|  |                 ) | ||||||
|  |             } catch (e) { | ||||||
|  |                 console.error("Could not download", url, "with headers", headers[i]) | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     async main(args: string[]): Promise<void> { |     async main(args: string[]): Promise<void> { | ||||||
|         const port = Number(args[0] ?? 2346) |         const port = Number(args[0] ?? 2346) | ||||||
|         const cache: Record<string, { date: Date; contents: any }> = {} |         const cache: Record<string, { date: Date; contents: any }> = {} | ||||||
|  | @ -16,7 +60,7 @@ class ServerLdScrape extends Script { | ||||||
|                 mustMatch: "extractgraph", |                 mustMatch: "extractgraph", | ||||||
|                 mimetype: "application/ld+json", |                 mimetype: "application/ld+json", | ||||||
|                 addHeaders: { |                 addHeaders: { | ||||||
|                     "Cache-control": "max-age=3600, public", |                     "Cache-control": "max-age=3600, public" | ||||||
|                 }, |                 }, | ||||||
|                 async handle(content, searchParams: URLSearchParams) { |                 async handle(content, searchParams: URLSearchParams) { | ||||||
|                     const url = searchParams.get("url") |                     const url = searchParams.get("url") | ||||||
|  | @ -31,19 +75,13 @@ class ServerLdScrape extends Script { | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                     let dloaded: { content: string } | { redirect: string } | "timeout" = { |                     let dloaded: { content: string } | { redirect: string } | "timeout" = { | ||||||
|                         redirect: url, |                         redirect: url | ||||||
|                     } |                     } | ||||||
|  | 
 | ||||||
|                     do { |                     do { | ||||||
|                         dloaded = await ScriptUtils.Download( |                         dloaded = await ServerLdScrape.attemptDownload(dloaded["redirect"]) | ||||||
|                             dloaded["redirect"], |  | ||||||
|                             { |  | ||||||
|                                 "User-Agent": |  | ||||||
|                                     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36", // MapComplete/openstreetmap scraper; pietervdvn@posteo.net; https://github.com/pietervdvn/MapComplete",
 |  | ||||||
|                             }, |  | ||||||
|                             10 |  | ||||||
|                         ) |  | ||||||
|                         if (dloaded === "timeout") { |                         if (dloaded === "timeout") { | ||||||
|                             return '{"#":"timout reached"}' |                             return "{\"#\":\"timout reached\"}" | ||||||
|                         } |                         } | ||||||
|                     } while (dloaded["redirect"]) |                     } while (dloaded["redirect"]) | ||||||
| 
 | 
 | ||||||
|  | @ -72,8 +110,8 @@ class ServerLdScrape extends Script { | ||||||
|                             console.error(e) |                             console.error(e) | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 }, |                 } | ||||||
|             }, |             } | ||||||
|         ]) |         ]) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue