forked from MapComplete/MapComplete
		
	Add module to fetch data (via a proxy) from the website with jsonld
This commit is contained in:
		
							parent
							
								
									1b06eee15b
								
							
						
					
					
						commit
						352414b29d
					
				
					 17 changed files with 388 additions and 351 deletions
				
			
		
							
								
								
									
										80
									
								
								scripts/importscripts/compareWebsiteData.ts
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								scripts/importscripts/compareWebsiteData.ts
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,80 @@ | |||
| import fs from "fs" | ||||
| // import readline from "readline"
 | ||||
| import Script from "../Script" | ||||
| import LinkedDataLoader from "../../src/Logic/Web/LinkedDataLoader" | ||||
| import UrlValidator from "../../src/UI/InputElement/Validators/UrlValidator" | ||||
| // vite-node scripts/importscripts/compareWebsiteData.ts -- ~/Downloads/ShopsWithWebsiteNodes.csv ~/data/scraped_websites/
 | ||||
| /* | ||||
// Script which reads a CSV export of features (columns 'id', 'tags', 'website'),
// fetches the JSON-LD published on the website of every matching feature (through a
// proxy, with an on-disk cache) and appends the differences with the known tags to 'diff.csv'.
//
// NOTE: only line comments are used in this class, so that it can safely live inside
// a surrounding block comment.
class CompareWebsiteData extends Script {
    // Only entries whose website starts with this prefix are compared; all others are skipped.
    private static readonly websitePrefix = "https://stores.delhaize.be"

    private readonly urlFormatter = new UrlValidator()

    constructor() {
        super("Given a csv file with 'id', 'tags' and 'website', attempts to fetch jsonld and compares the attributes. Usage: csv-file datadir")
    }

    // Returns the JSON-LD for `url`.
    // If `cachedir` already contains a file named after the URI-encoded url, that file is
    // parsed and returned. Otherwise the data is fetched via
    // LinkedDataLoader.fetchJsonLdWithProxy, written to the cache and returned.
    async getWithCache(cachedir: string, url: string): Promise<any> {
        const filename = cachedir + "/" + encodeURIComponent(url)
        if (fs.existsSync(filename)) {
            return JSON.parse(fs.readFileSync(filename, "utf-8"))
        }
        const jsonLd = await LinkedDataLoader.fetchJsonLdWithProxy(url)
        console.log("Got:", jsonLd)
        fs.writeFileSync(filename, JSON.stringify(jsonLd))
        return jsonLd
    }

    // Handles a single CSV line.
    // The first column is the (JSON-quoted) id. The tags column is a CSV-quoted JSON object
    // (inner quotes doubled) which runs until the end of the line.
    //
    // Returns true if a diff was appended to `targetfile`; false if the entry was skipped
    // (website missing or not matching the prefix, or no JSON-LD found).
    // Throws if the line cannot be parsed.
    async handleEntry(line: string, cachedir: string, targetfile: string): Promise<boolean> {
        const id = JSON.parse(line.split(",")[0])

        // The tags object must be preceded by its opening CSV quote, hence index >= 1
        const tagsStart = line.indexOf("{")
        if (tagsStart < 1) {
            throw new Error("No quoted tags object found in line: " + line)
        }
        // Take the quoted column, strip the surrounding quotes and unescape doubled quotes
        const quotedTags = line.substring(tagsStart - 1)
        const tags = quotedTags.substring(1, quotedTags.length - 1).replace(/""/g, "\"")
        const data = JSON.parse(tags)

        const website = data.website //this.urlFormatter.reformat(data.website)
        if (typeof website !== "string" || !website.startsWith(CompareWebsiteData.websitePrefix)) {
            return false
        }
        console.log(website)
        const jsonld = await this.getWithCache(cachedir, website)
        console.log(jsonld)
        // A cached "null" or an empty object both mean: nothing to compare
        if (!jsonld || Object.keys(jsonld).length === 0) {
            return false
        }
        const diff = LinkedDataLoader.removeDuplicateData(jsonld, data)
        fs.appendFileSync(targetfile, id + ", " + JSON.stringify(diff) + "\n")
        return true
    }

    // Entry point: args[0] is the CSV file, args[1] is the cache directory.
    // (Re)creates 'diff.csv' in the working directory and handles the CSV line by line.
    // A line which fails is reported and skipped; it does not abort the run.
    // Throws an Error if fewer than two arguments are given.
    async main(args: string[]): Promise<void> {
        if (args.length < 2) {
            throw new Error("Not enough arguments")
        }
        const [csvPath, cachedir] = args

        // Imported locally so that this class does not depend on a file-level readline import
        const { createInterface } = await import("readline")
        const readInterface = createInterface({
            input: fs.createReadStream(csvPath),
        })

        let handled = 0
        let diffed = 0
        let failed = 0
        const targetfile = "diff.csv"
        fs.writeFileSync(targetfile, "id, diff-json\n")
        for await (const line of readInterface) {
            // Skip blank lines and the header row
            if (line.trim() === "" || line.startsWith("\"id\"")) {
                continue
            }
            try {
                const madeComparison = await this.handleEntry(line, cachedir, targetfile)
                handled++
                if (madeComparison) {
                    diffed++
                }
            } catch (e) {
                failed++
                console.error("Skipping line which could not be handled:", e instanceof Error ? e.message : e)
            }
        }
        console.log("Handled", handled, "entries, got", diffed, "diff results,", failed, "lines failed")
    }
}
| 
 | ||||
// Script entry point: instantiates the script and invokes `run()` (presumably inherited from Script and dispatching to `main` with the command line arguments; verify in ../Script)
| new CompareWebsiteData().run() | ||||
| */ | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue