Add linked data module which scrapes websites

This commit is contained in:
Pieter Vander Vennet 2024-02-22 18:58:34 +01:00
parent 2af6af7630
commit 35c31f9861
15 changed files with 870 additions and 130 deletions

View file

@@ -0,0 +1,142 @@
import type { Geometry } from "geojson"
import jsonld from "jsonld"
import { OH, OpeningHour } from "../../UI/OpeningHours/OpeningHours"
import { Utils } from "../../Utils"
import PhoneValidator from "../../UI/InputElement/Validators/PhoneValidator"
import EmailValidator from "../../UI/InputElement/Validators/EmailValidator"
import { Validator } from "../../UI/InputElement/Validator"
import UrlValidator from "../../UI/InputElement/Validators/UrlValidator"
/**
 * Downloads the linked data (JSON-LD) that an extraction proxy finds on a website
 * and rewrites a handful of schema.org properties into the short keys declared in
 * `COMPACTING_CONTEXT` (`name`, `website`, `phone`, `email`, `image`, `opening_hours`, `geo`).
 */
export default class LinkedDataLoader {
    /**
     * JSON-LD context used to compact the downloaded graph: maps schema.org IRIs onto short keys.
     */
    private static readonly COMPACTING_CONTEXT = {
        name: "http://schema.org/name",
        website: { "@id": "http://schema.org/url", "@type": "@id" },
        phone: { "@id": "http://schema.org/telephone" },
        email: { "@id": "http://schema.org/email" },
        image: { "@id": "http://schema.org/image", "@type": "@id" },
        opening_hours: { "@id": "http://schema.org/openingHoursSpecification" },
        openingHours: { "@id": "http://schema.org/openingHours", "@container": "@set" },
        geo: { "@id": "http://schema.org/geo" },
    }

    /**
     * JSON-LD context used when flattening `openingHoursSpecification` nodes.
     */
    private static readonly COMPACTING_CONTEXT_OH = {
        dayOfWeek: { "@id": "http://schema.org/dayOfWeek", "@container": "@set" },
        closes: { "@id": "http://schema.org/closes" },
        opens: { "@id": "http://schema.org/opens" },
    }

    /**
     * Validators whose `reformat` is applied to the value of the matching (compacted) key.
     */
    private static readonly formatters: Record<string, Validator> = {
        phone: new PhoneValidator(),
        email: new EmailValidator(),
        website: new UrlValidator(undefined, undefined, true),
    }

    /**
     * Keys which are removed from the result of `fetchJsonLd`.
     */
    private static readonly ignoreKeys: ReadonlyArray<string> = [
        "http://schema.org/logo",
        "http://schema.org/address",
        "@type",
        "@id",
        "@base",
        "http://schema.org/contentUrl",
        "http://schema.org/datePublished",
        "http://schema.org/description",
        "http://schema.org/hasMap",
        "http://schema.org/priceRange",
        "http://schema.org/contactPoint",
    ]

    /**
     * Endpoint which extracts the linked-data graph of a page; the page is passed as `url` query parameter.
     * The original author noted "https://cache.mapcomplete.org/extractgraph" as alternative endpoint.
     */
    private static readonly PROXY = "http://127.0.0.1:2346/extractgraph"

    /**
     * Matches one leading `<days> <from>-<to>` rule (lowercase) followed by a space and the remainder.
     * Not global, so it carries no state between calls.
     */
    private static readonly OH_RULE_REGEX = /([a-z]+ [0-9:]+-[0-9:]+) (.*)/

    /**
     * Converts a schema.org `geo` node into a GeoJSON point.
     *
     * The node is compacted so that latitude and longitude become `lat` and `lon`.
     * Note: a missing latitude or longitude ends up as `NaN` in the coordinates, as `Number(undefined)` is `NaN`.
     *
     * @param geo the (compacted or expanded) JSON-LD node found under http://schema.org/geo
     * @returns a GeoJSON `Point` with coordinates in `[lon, lat]` order
     */
    static async geoToGeometry(geo): Promise<Geometry> {
        const context = {
            lat: {
                "@id": "http://schema.org/latitude",
            },
            lon: {
                "@id": "http://schema.org/longitude", // TODO formatting to decimal should be possible from this type?
            },
        }
        const flattened = await jsonld.compact(geo, context)
        return {
            type: "Point",
            coordinates: [Number(flattened.lon), Number(flattened.lat)],
        }
    }

    /**
     * Parses http://schema.org/openingHours
     *
     * // Weird data format from C&A
     * LinkedDataLoader.ohStringToOsmFormat("MO 09:30-18:00 TU 09:30-18:00 WE 09:30-18:00 TH 09:30-18:00 FR 09:30-18:00 SA 09:30-18:00") // => "Mo-Sa 09:30-18:00"
     */
    static ohStringToOsmFormat(oh: string): string {
        oh = oh.toLowerCase()
        if (oh === "mo-su") {
            return "24/7"
        }
        // Peel off one '<days> <from>-<to>' rule at a time; whatever is left over is the last rule
        const parts: string[] = []
        let match = oh.match(LinkedDataLoader.OH_RULE_REGEX)
        while (match) {
            parts.push(match[1])
            oh = match[2]
            match = oh.match(LinkedDataLoader.OH_RULE_REGEX)
        }
        parts.push(oh)
        // actually the same as OSM-oh
        return OH.simplify(parts.join(";"))
    }

    /**
     * Converts http://schema.org/openingHoursSpecification nodes into a single opening hours string.
     *
     * Every node of the flattened graph which has `dayOfWeek`, `opens` and `closes` is turned
     * into a rule `<days> <opens>-<closes>`; a closing time of "23:59" is written as "24:00".
     * Nodes lacking one of those properties are skipped.
     *
     * @param openingHoursSpecification the JSON-LD value found under `openingHoursSpecification`
     * @returns the merged rules, as rendered by `OH.ToString`
     */
    static async ohToOsmFormat(openingHoursSpecification): Promise<string> {
        const flattened = await jsonld.flatten(
            openingHoursSpecification,
            <any>LinkedDataLoader.COMPACTING_CONTEXT_OH
        )
        const graph: any = flattened["@graph"]
        const spec: any[] = Array.isArray(graph) ? graph : graph ? [graph] : []
        const allRules: OpeningHour[] = []
        for (const rule of spec) {
            if (!rule?.dayOfWeek || rule.opens === undefined || rule.closes === undefined) {
                continue
            }
            const days: string[] = [].concat(rule.dayOfWeek)
                .map((day) => LinkedDataLoader.dayOfWeekToOsm(day))
                .filter((day) => !!day)
            if (days.length === 0) {
                continue
            }
            const opens: string = rule.opens
            const closes: string = rule.closes === "23:59" ? "24:00" : rule.closes
            // NOTE(review): ParseRule is assumed to possibly yield a nullish value for unparsable input, hence the fallback
            allRules.push(...(OH.ParseRule(days.join(",") + " " + opens + "-" + closes) ?? []))
        }
        return OH.ToString(OH.MergeTimes(allRules))
    }

    /**
     * Downloads the linked data of `url` via the extraction proxy and normalizes it.
     *
     * - `openingHoursSpecification` and `openingHours` are both converted into `opening_hours`;
     *   if both are present, `openingHours` wins as it is handled last
     * - `geo` is replaced by a GeoJSON point
     * - empty strings and all keys in `ignoreKeys` are removed
     * - string values of `phone`, `email` and `website` are passed through the `reformat` of their validator
     *
     * @param url the page to inspect; it is URL-encoded before being handed to the proxy
     * @param country if given, passed to the validators (as a callback returning this value)
     * @returns the compacted and cleaned-up linked data
     */
    static async fetchJsonLd(url: string, country?: string): Promise<Record<string, any>> {
        // The target URL must be encoded, otherwise its own '?', '&' or '#' would be read as part of the proxy request
        const data = await Utils.downloadJson(
            `${LinkedDataLoader.PROXY}?url=${encodeURIComponent(url)}`
        )
        const compacted = await jsonld.compact(data, LinkedDataLoader.COMPACTING_CONTEXT)

        if (compacted["opening_hours"]) {
            compacted["opening_hours"] = await LinkedDataLoader.ohToOsmFormat(
                compacted["opening_hours"]
            )
        }
        if (compacted["openingHours"]) {
            const ohspec: string[] = compacted["openingHours"]
            compacted["opening_hours"] = OH.simplify(
                ohspec.map((r) => LinkedDataLoader.ohStringToOsmFormat(r)).join("; ")
            )
            delete compacted["openingHours"]
        }
        if (compacted["geo"]) {
            compacted["geo"] = <any>await LinkedDataLoader.geoToGeometry(compacted["geo"])
        }

        for (const k in compacted) {
            if (compacted[k] === "" || LinkedDataLoader.ignoreKeys.includes(k)) {
                delete compacted[k]
                continue
            }
            const formatter = LinkedDataLoader.formatters[k]
            const value = compacted[k]
            // Only plain strings are reformatted; other shapes (e.g. a list of values) are left untouched
            if (formatter && typeof value === "string") {
                compacted[k] = country
                    ? formatter.reformat(value, () => country)
                    : formatter.reformat(value)
            }
        }
        return <any>compacted
    }

    /**
     * Reduces a schema.org day of week to its lowercase two-letter abbreviation.
     *
     * Accepts a bare name ("Monday"), an IRI ("http://schema.org/Monday") or a
     * node reference (`{"@id": "http://schema.org/Monday"}`); all of them yield "mo".
     *
     * @returns the abbreviation, or `undefined` if no day name could be determined
     */
    private static dayOfWeekToOsm(day: unknown): string | undefined {
        let name: unknown = day
        if (typeof day === "object" && day !== null) {
            name = (<Record<string, unknown>>day)["@id"]
        }
        if (typeof name !== "string") {
            return undefined
        }
        // lastIndexOf yields -1 for a bare name, so the full string is kept in that case
        const local = name.substring(name.lastIndexOf("/") + 1)
        return local.toLowerCase().substring(0, 2) || undefined
    }
}