Add module to fetch JSON-LD data from a website (via a proxy)

This commit is contained in:
Pieter Vander Vennet 2024-02-26 02:24:46 +01:00
parent 1b06eee15b
commit 352414b29d
17 changed files with 388 additions and 351 deletions

View file

@ -148,7 +148,16 @@ export default class ScriptUtils {
const data = await ScriptUtils.Download(url, headers)
return JSON.parse(data["content"])
}
/**
 * Downloads `url` with the global `fetch` and returns the response body as text.
 *
 * The HTTP status is not validated: the body of an error response is returned as `content` as well.
 * Only the `{ content }` variant is produced here; the `{ redirect }` variant is part of the
 * declared return type but is never returned by this method.
 *
 * @param url the address to download
 * @param headers optional request headers, handed to `fetch` unchanged
 * @returns the complete response body, wrapped as `{ content }`
 */
public static async DownloadFetch(
    url: string,
    headers?: any
): Promise<{ content: string } | { redirect: string }> {
    console.log("Fetching", url)
    const response = await fetch(url, { headers })
    const content = await response.text()
    // Log the status and the body size only: printing the complete body floods the console
    // when many (or large) pages are downloaded.
    console.log("Fetched", url, "- status", response.status, "-", content.length, "characters")
    return { content }
}
public static Download(
url: string,
headers?: any

View file

@ -0,0 +1,80 @@
import fs from "fs"
// import readline from "readline"
import Script from "../Script"
import LinkedDataLoader from "../../src/Logic/Web/LinkedDataLoader"
import UrlValidator from "../../src/UI/InputElement/Validators/UrlValidator"
// vite-node scripts/importscripts/compareWebsiteData.ts -- ~/Downloads/ShopsWithWebsiteNodes.csv ~/data/scraped_websites/
/*
class CompareWebsiteData extends Script {
constructor() {
super("Given a csv file with 'id', 'tags' and 'website', attempts to fetch jsonld and compares the attributes. Usage: csv-file datadir")
}
private readonly urlFormatter = new UrlValidator()
async getWithCache(cachedir : string, url: string): Promise<any>{
const filename= cachedir+"/"+encodeURIComponent(url)
if(fs.existsSync(filename)){
return JSON.parse(fs.readFileSync(filename, "utf-8"))
}
const jsonLd = await LinkedDataLoader.fetchJsonLdWithProxy(url)
console.log("Got:", jsonLd)
fs.writeFileSync(filename, JSON.stringify(jsonLd))
return jsonLd
}
async handleEntry(line: string, cachedir: string, targetfile: string) : Promise<boolean>{
const id = JSON.parse(line.split(",")[0])
let tags = line.substring(line.indexOf("{") - 1)
tags = tags.substring(1, tags.length - 1)
tags = tags.replace(/""/g, "\"")
const data = JSON.parse(tags)
const website = data.website //this.urlFormatter.reformat(data.website)
if(!website.startsWith("https://stores.delhaize.be")){
return false
}
console.log(website)
const jsonld = await this.getWithCache(cachedir, website)
console.log(jsonld)
if(Object.keys(jsonld).length === 0){
return false
}
const diff = LinkedDataLoader.removeDuplicateData(jsonld, data)
fs.appendFileSync(targetfile, id +", "+ JSON.stringify(diff)+"\n")
return true
}
async main(args: string[]): Promise<void> {
if (args.length < 2) {
throw "Not enough arguments"
}
const readInterface = readline.createInterface({
input: fs.createReadStream(args[0]),
})
let handled = 0
let diffed = 0
const targetfile = "diff.csv"
fs.writeFileSync(targetfile, "id, diff-json\n")
for await (const line of readInterface) {
try {
if(line.startsWith("\"id\"")){
continue
}
const madeComparison = await this.handleEntry(line, args[1], targetfile)
handled ++
diffed = diffed + (madeComparison ? 1 : 0)
if(handled % 1000 == 0){
// console.log("Handled ",handled," got ",diffed,"diff results")
}
} catch (e) {
// console.error(e)
}
}
}
}
new CompareWebsiteData().run()
*/

0
scripts/scrapeOsm.ts Normal file
View file

View file

@ -15,6 +15,7 @@ class ServerLdScrape extends Script {
mimetype: "application/ld+json",
async handle(content, searchParams: URLSearchParams) {
const url = searchParams.get("url")
console.log("Fetching", url)
if (cache[url]) {
return JSON.stringify(cache[url])
}

View file

@ -1,39 +1,42 @@
import Script from "../Script"
import { Utils } from "../../src/Utils"
import VeloparkLoader, { VeloparkData } from "../../src/Logic/Web/VeloparkLoader"
import fs from "fs"
import { Overpass } from "../../src/Logic/Osm/Overpass"
import { RegexTag } from "../../src/Logic/Tags/RegexTag"
import Constants from "../../src/Models/Constants"
import { ImmutableStore } from "../../src/Logic/UIEventSource"
import { BBox } from "../../src/Logic/BBox"
import LinkedDataLoader from "../../src/Logic/Web/LinkedDataLoader"
class VeloParkToGeojson extends Script {
constructor() {
super(
"Downloads the latest Velopark data and converts it to a geojson, which will be saved at the current directory"
"Downloads the latest Velopark data and converts it to a geojson, which will be saved at the current directory",
)
}
exportTo(filename: string, features) {
fs.writeFileSync(
filename + "_" + new Date().toISOString() + ".geojson",
features = features.slice(0,25) // TODO REMOVE
const file = filename + "_" + /*new Date().toISOString() + */".geojson"
fs.writeFileSync(file,
JSON.stringify(
{
type: "FeatureCollection",
"#":"Only 25 features are shown!", // TODO REMOVE
features,
},
null,
" "
)
" ",
),
)
console.log("Written",file)
}
async main(args: string[]): Promise<void> {
console.log("Downloading velopark data")
// Download data for NIS-code 1000. 1000 means: all of belgium
const url = "https://www.velopark.be/api/parkings/1000"
const data = <VeloparkData[]>await Utils.downloadJson(url)
const allVelopark = await LinkedDataLoader.fetchJsonLd(url, { country: "be" })
this.exportTo("velopark_all", allVelopark)
const bboxBelgium = new BBox([
[2.51357303225, 49.5294835476],
@ -44,15 +47,13 @@ class VeloParkToGeojson extends Script {
[],
Constants.defaultOverpassUrls[0],
new ImmutableStore(60 * 5),
false
false,
)
const alreadyLinkedFeatures = await alreadyLinkedQuery.queryGeoJson(bboxBelgium)
const seenIds = new Set<string>(
alreadyLinkedFeatures[0].features.map((f) => f.properties["ref:velopark"])
alreadyLinkedFeatures[0].features.map((f) => f.properties["ref:velopark"]),
)
console.log("OpenStreetMap contains", seenIds.size, "bicycle parkings with a velopark ref")
const allVelopark = data.map((f) => VeloparkLoader.convert(f))
this.exportTo("velopark_all", allVelopark)
const features = allVelopark.filter((f) => !seenIds.has(f.properties["ref:velopark"]))