| 
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 |  |  | import fs from "fs" | 
					
						
							| 
									
										
										
										
											2024-02-29 14:54:14 +01:00
										 |  |  | import readline from "readline" | 
					
						
							| 
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 |  |  | import Script from "../Script" | 
					
						
							|  |  |  | import LinkedDataLoader from "../../src/Logic/Web/LinkedDataLoader" | 
					
						
							|  |  |  | import UrlValidator from "../../src/UI/InputElement/Validators/UrlValidator" | 
					
						
							|  |  |  | // vite-node scripts/importscripts/compareWebsiteData.ts -- ~/Downloads/ShopsWithWebsiteNodes.csv ~/data/scraped_websites/
 | 
					
						
							/**
							 * Command-line script which, for every entry of a CSV file, loads the JSON-LD of the
							 * website mentioned in the entry's tags and appends the outcome of
							 * `LinkedDataLoader.removeDuplicateData` to `diff.csv` in the working directory.
							 *
							 * Expected arguments: `<csv-file> <cache-directory>`.
							 */
							class CompareWebsiteData extends Script {
							    constructor() {
							        super(
							            "Given a csv file with 'id', 'tags' and 'website', attempts to fetch jsonld and compares the attributes. Usage: csv-file datadir"
							        )
							    }

							    private readonly urlFormatter = new UrlValidator()

							    /**
							     * Returns the JSON-LD for `url`, using `cachedir` as an on-disk cache.
							     *
							     * The cache file is named after the URI-encoded URL. If it exists, its parsed
							     * contents are returned and nothing is fetched. Otherwise the data is obtained
							     * through `LinkedDataLoader.fetchJsonLd`, logged, written to the cache and returned.
							     *
							     * @param cachedir directory holding the cache files; it is not created by this method
							     * @param url the website to load the JSON-LD from
							     * @returns the cached or freshly fetched JSON-LD
							     */
							    async getWithCache(cachedir: string, url: string): Promise<any> {
							        const filename = cachedir + "/" + encodeURIComponent(url)
							        if (fs.existsSync(filename)) {
							            return JSON.parse(fs.readFileSync(filename, "utf-8"))
							        }
							        // NOTE(review): the meaning of the `undefined` and `true` arguments is defined by
							        // LinkedDataLoader.fetchJsonLd, which is not visible from this file
							        const jsonLd = await LinkedDataLoader.fetchJsonLd(url, undefined, true)
							        console.log("Got:", jsonLd)
							        fs.writeFileSync(filename, JSON.stringify(jsonLd))
							        return jsonLd
							    }

							    /**
							     * Handles a single data line of the CSV file.
							     *
							     * The first comma-separated field is parsed as the JSON-encoded id. Everything from
							     * the quote preceding the first `{` up to the end of the line is treated as the
							     * quoted tags column, so this assumes that the tags are the last column of the line.
							     *
							     * @param line one non-header line of the CSV file
							     * @param cachedir directory which is passed on to {@link getWithCache}
							     * @param targetfile file to which `id, <diff-json>` is appended
							     * @returns `true` if a diff was appended to `targetfile`; `false` if the website
							     * yielded an empty JSON-LD object or if loading/comparing failed
							     * @throws if the id or the tags of the line cannot be parsed as JSON
							     */
							    async handleEntry(line: string, cachedir: string, targetfile: string): Promise<boolean> {
							        const id = JSON.parse(line.split(",")[0])
							        // Start one character before the first '{', i.e. at the opening quote of the column
							        let tags = line.substring(line.indexOf("{") - 1)
							        // Strip the surrounding quotes of the CSV field...
							        tags = tags.substring(1, tags.length - 1)
							        // ...and undo the CSV escaping, where a quote within a quoted field is doubled
							        tags = tags.replace(/""/g, '"')
							        const data = JSON.parse(tags)

							        try {
							            const website = this.urlFormatter.reformat(data.website)
							            console.log(website)
							            const jsonld = await this.getWithCache(cachedir, website)
							            if (Object.keys(jsonld).length === 0) {
							                return false
							            }
							            const diff = LinkedDataLoader.removeDuplicateData(jsonld, data)
							            fs.appendFileSync(targetfile, id + ", " + JSON.stringify(diff) + "\n\n")
							            return true
							        } catch (e) {
							            console.error("Could not download ", data.website)
							            // Explicitly report 'no comparison made'; previously this path resolved to
							            // `undefined`, contradicting the declared Promise<boolean>
							            return false
							        }
							    }

							    /**
							     * Reads the CSV file line by line and handles every entry sequentially.
							     *
							     * `diff.csv` is (re)created in the working directory with a header line. Lines
							     * which cannot be handled are skipped, but counted and reported in the summary.
							     *
							     * @param args `[csv-file, cache-directory]`
							     * @throws Error if fewer than two arguments are given
							     */
							    async main(args: string[]): Promise<void> {
							        if (args.length < 2) {
							            throw new Error("Not enough arguments")
							        }

							        const readInterface = readline.createInterface({
							            input: fs.createReadStream(args[0]),
							        })

							        let handled = 0
							        let diffed = 0
							        let skipped = 0
							        const targetfile = "diff.csv"
							        fs.writeFileSync(targetfile, "id, diff-json\n")
							        for await (const line of readInterface) {
							            if (line.trim() === "") {
							                // Blank lines carry no entry and are not an error
							                continue
							            }
							            try {
							                if (line.startsWith('"id"')) {
							                    // Header line
							                    continue
							                }
							                const madeComparison = await this.handleEntry(line, args[1], targetfile)
							                handled++
							                if (madeComparison) {
							                    diffed++
							                }
							                if (handled % 1000 === 0) {
							                    console.log("Handled ", handled, " got ", diffed, "diff results")
							                }
							            } catch (e) {
							                // Best effort: a malformed line must not abort the entire run. It is
							                // counted instead of being dropped silently.
							                skipped++
							            }
							        }
							        console.log(
							            "Handled ",
							            handled,
							            " got ",
							            diffed,
							            "diff results; skipped",
							            skipped,
							            "lines which could not be parsed"
							        )
							    }
							}
					
						
							|  |  |  | 
 | 
					
						
							// Entry point: instantiates the script and invokes `run()`, which is inherited from `Script`
							// (defined outside this file; presumably it forwards the command-line arguments to `main` — confirm).
							|  |  |  | new CompareWebsiteData().run() |