2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import fs from "fs"
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-29 14:54:14 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								import readline from "readline"
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import Script from "../Script"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import LinkedDataLoader from "../../src/Logic/Web/LinkedDataLoader"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import UrlValidator from "../../src/UI/InputElement/Validators/UrlValidator"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								// vite-node scripts/importscripts/compareWebsiteData.ts -- ~/Downloads/ShopsWithWebsiteNodes.csv ~/data/scraped_websites/
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								class CompareWebsiteData extends Script {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    constructor() {
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        super(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            "Given a csv file with 'id', 'tags' and 'website', attempts to fetch jsonld and compares the attributes. Usage: csv-file datadir"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        )
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    private readonly urlFormatter = new UrlValidator()
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    async getWithCache(cachedir: string, url: string): Promise<any> {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        const filename = cachedir + "/" + encodeURIComponent(url)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if (fs.existsSync(filename)) {
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return JSON.parse(fs.readFileSync(filename, "utf-8"))
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        }
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-05 17:49:31 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        const jsonLd = await LinkedDataLoader.fetchJsonLd(url, undefined, true)
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        console.log("Got:", jsonLd)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        fs.writeFileSync(filename, JSON.stringify(jsonLd))
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        return jsonLd
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    async handleEntry(line: string, cachedir: string, targetfile: string): Promise<boolean> {
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        const id = JSON.parse(line.split(",")[0])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        let tags = line.substring(line.indexOf("{") - 1)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        tags = tags.substring(1, tags.length - 1)
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        tags = tags.replace(/""/g, '"')
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        const data = JSON.parse(tags)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        try {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            const website = this.urlFormatter.reformat(data.website)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            console.log(website)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            const jsonld = await this.getWithCache(cachedir, website)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if (Object.keys(jsonld).length === 0) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                return false
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            const diff = LinkedDataLoader.removeDuplicateData(jsonld, data)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            fs.appendFileSync(targetfile, id + ", " + JSON.stringify(diff) + "\n\n")
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            return true
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        } catch (e) {
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-29 14:54:14 +01:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            console.error("Could not download ", data.website)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        }
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    async main(args: string[]): Promise<void> {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if (args.length < 2) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            throw "Not enough arguments"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        const readInterface = readline.createInterface({
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            input: fs.createReadStream(args[0]),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        })
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        let handled = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        let diffed = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        const targetfile = "diff.csv"
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        fs.writeFileSync(targetfile, "id, diff-json\n")
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for await (const line of readInterface) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            try {
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if (line.startsWith('"id"')) {
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    continue
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                const madeComparison = await this.handleEntry(line, args[1], targetfile)
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                handled++
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                diffed = diffed + (madeComparison ? 1 : 0)
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                if (handled % 1000 == 0) {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                    console.log("Handled ", handled, " got ", diffed, "diff results")
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            } catch (e) {
							 | 
						
					
						
							
								
									
										
										
										
											2024-04-13 02:40:21 +02:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                // console.error(e)
							 | 
						
					
						
							
								
									
										
										
										
											2024-02-26 02:24:46 +01:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								new CompareWebsiteData().run()
							 |