From 99cb879cfe09bcb5e5fb5f9dcae96f0a908f1f2d Mon Sep 17 00:00:00 2001 From: Pieter Vander Vennet Date: Tue, 21 Mar 2023 20:01:11 +0100 Subject: [PATCH] Some more experimentation with the conflation script --- package.json | 3 +- scripts/ScriptUtils.ts | 27 ++- scripts/conflate.ts | 393 +++++++++++++++++++++++++++++++---------- 3 files changed, 321 insertions(+), 102 deletions(-) diff --git a/package.json b/package.json index 072f4113b..7b37663fd 100644 --- a/package.json +++ b/package.json @@ -46,7 +46,8 @@ "weblate-merge": "git remote update weblate-github; git merge weblate-github/weblate-mapcomplete-core weblate-github/weblate-mapcomplete-layers weblate-github/weblate-mapcomplete-layer-translations", "weblate-fix-heavy": "git fetch weblate-hosted-layers; git fetch weblate-hosted-core; git merge weblate-hosted-layers/master weblate-hosted-core/master ", "housekeeping": "git pull && npm run weblate-fix-heavy && npm run generate && npm run generate:docs && npm run generate:contributor-list && vite-node scripts/fetchLanguages.ts && npm run format && git add assets/ langs/ Docs/ **/*.ts Docs/* && git commit -m 'chore: automated housekeeping...'", - "parseSchools": "vite-node scripts/schools/amendSchoolData.ts" + "parseSchools": "vite-node scripts/schools/amendSchoolData.ts", + "conflate": "vite-node scripts/conflate.ts -- ../onwheels-data-prep/osm_pharmacies.geojson ../onwheels-data-prep/OnWheelsData_apotheek.geojson" }, "keywords": [ "OpenStreetMap", diff --git a/scripts/ScriptUtils.ts b/scripts/ScriptUtils.ts index af36d962f..86a72a505 100644 --- a/scripts/ScriptUtils.ts +++ b/scripts/ScriptUtils.ts @@ -146,17 +146,20 @@ export default class ScriptUtils { private static async DownloadJSON(url: string, headers?: any): Promise { const data = await ScriptUtils.Download(url, headers) - return JSON.parse(data.content) + return JSON.parse(data["content"]) } - private static Download(url: string, headers?: any): Promise<{ content: string }> { + public static Download( + url: string, + headers?: any + ): Promise<{ content: string } | { redirect: string }> { return new Promise((resolve, reject) => { try { headers = headers ?? {} headers.accept = "application/json" - console.log(" > ScriptUtils.DownloadJson(", url, ")") + console.log(" > ScriptUtils.Download(", url, ")") const urlObj = new URL(url) - https.get( + const request = https.get( { host: urlObj.host, path: urlObj.pathname + urlObj.search, @@ -173,10 +176,26 @@ export default class ScriptUtils { }) res.addListener("end", function () { + if (res.statusCode === 301 || res.statusCode === 302) { + console.log("Got a redirect:", res.headers.location) + resolve({ redirect: res.headers.location }) + } + if (res.statusCode >= 400) { + console.log( + "Error while fetching ", + url, + "due to", + res.statusMessage + ) + reject(res.statusCode) + } resolve({ content: parts.join("") }) }) } ) + request.on("error", function (e) { + reject(e) + }) } catch (e) { reject(e) } diff --git a/scripts/conflate.ts b/scripts/conflate.ts index 17d84292f..175072227 100644 --- a/scripts/conflate.ts +++ b/scripts/conflate.ts @@ -4,8 +4,31 @@ import { Feature } from "geojson" import { GeoOperations } from "../Logic/GeoOperations" import { Utils } from "../Utils" import { OsmObject } from "../Logic/Osm/OsmObject" +import { PhoneTextField, UrlTextfieldDef } from "../UI/Input/ValidatedTextField" +import { OsmId } from "../Models/OsmFeature" +import ScriptUtils from "./ScriptUtils" + +interface PossibleMatch { + /** + * Distance in meter between the OSM-data and the external dataset + */ + d: number + + osm_feature: Feature + external_feature: Feature +} + +interface ReplayResult { + certainly_imported?: boolean + possibly_imported?: boolean + resting_properties?: Record +} export class Conflate extends Script { + private earliestDate: Date = undefined + private latestDate: Date = undefined + private readonly historyCacheDir = "/tmp/cache/" + constructor() { super( [ @@ -22,10 +45,88 @@ export class Conflate extends Script { ) } + async main(args: string[]): Promise { + const [osm_file_path, external_file_path] = args + let max_range = 50 + if (args.length === 3) { + max_range = Number(args[2]) + } + if ( + osm_file_path.toLowerCase().indexOf("osm") < 0 && + osm_file_path.toLowerCase().indexOf("openstreetmap") < 0 + ) { + throw "OSM File path must contain 'osm' or 'openStreetMap'" + } + + if ( + external_file_path.toLowerCase().indexOf("osm") >= 0 || + external_file_path.toLowerCase().indexOf("openstreetmap") >= 0 + ) { + throw "External File path may not contain 'osm' or 'openStreetMap'" + } + + const external_features: Feature[] = JSON.parse( + fs.readFileSync(external_file_path, { encoding: "utf-8" }) + ).features + const osm_features: Feature[] = JSON.parse( + fs.readFileSync(osm_file_path, { encoding: "utf-8" }) + ).features + + const bestMatches = await this.calculateMatches(external_features, osm_features, max_range) + const unmatched = external_features.filter( + (f) => !bestMatches.some((matched) => matched.match.external_feature === f) + ) + const match_lengths: (string | number)[][] = [ + [ + "osm_id", + "match_distance", + "osm_name", + "imported", + "status_external", + "...properties_differences", + ], + ] + for (const { match, replayed } of bestMatches) { + const { external_feature, d, osm_feature } = match + const { possibly_imported, certainly_imported, resting_properties } = replayed + const status = resting_properties["status"] + delete resting_properties["status"] + if (Object.keys(resting_properties).length === 0) { + continue + } + match_lengths.push([ + osm_feature.properties["@id"], + d, + osm_feature.properties.name, + certainly_imported ? "import" : possibly_imported ? "prob import" : "new", + status, + JSON.stringify(resting_properties), + ]) + } + + fs.writeFileSync( + "../onwheels-data-prep/matches.tsv", + match_lengths.map((l) => l.join("\t")).join("\n") + ) + + fs.writeFileSync( + "../onwheels-data-prep/unmatched.geojson", + JSON.stringify( + { + type: "FeatureCollection", + features: unmatched, + }, + + null, + " " + ) + ) + } + private async findTimeFork( externalName: string, osmName: string, - osmId: string + osmId: OsmId ): Promise<{ earliestDateOfImport; latestDateOfImport }> { const history = await OsmObject.DownloadHistory(osmId).AsPromise((h) => h.length > 0) let earliest: Date = undefined @@ -60,106 +161,204 @@ export class Conflate extends Script { return { earliestDateOfImport: earliest, latestDateOfImport: latest } } - private earliestDate: Date = undefined - private latestDate: Date = undefined + private findPossibleMatchesFor( + osm_features: Feature[], + externalFeature: Feature, + max_range: number + ): PossibleMatch[] { + const possibleMatches: PossibleMatch[] = [] + for (const osmFeature of osm_features) { + const d = GeoOperations.distanceBetween( + GeoOperations.centerpointCoordinates(externalFeature), + GeoOperations.centerpointCoordinates(osmFeature) + ) - async main(args: string[]): Promise { - const [osm_file_path, external_file_path] = args - let max_range = 50 - if (args.length === 3) { - max_range = Number(args[2]) - } - if ( - osm_file_path.toLowerCase().indexOf("osm") < 0 && - osm_file_path.toLowerCase().indexOf("openstreetmap") < 0 - ) { - throw "OSM File path must contain 'osm' or 'openStreetMap'" - } - - if ( - external_file_path.toLowerCase().indexOf("osm") >= 0 || - external_file_path.toLowerCase().indexOf("openstreetmap") >= 0 - ) { - throw "External File path may not contain 'osm' or 'openStreetMap'" - } - - const external_features: Feature[] = JSON.parse( - fs.readFileSync(external_file_path, { encoding: "utf-8" }) - ).features - const osm_features: Feature[] = JSON.parse( - fs.readFileSync(osm_file_path, { encoding: "utf-8" }) - ).features - - const match_lengths: (string | number)[][] = [ - [ - "osm_id", - "external_index", - "match_distance", - "name_levenshtein_distance", - "osm_data", - "external_data", - "status", - ], - ] - for (let i = 0; i < external_features.length; i++) { - // console.log("Inspecting " + (i + 1) + "/" + external_features.length) - const externalFeature = external_features[i] - const possibleMatches: number[] = [] - for (const osmFeature of osm_features) { - const d = GeoOperations.distanceBetween( - GeoOperations.centerpointCoordinates(externalFeature), - GeoOperations.centerpointCoordinates(osmFeature) - ) - - if (d === 0) { - console.log( - "Found an exact match (name match: ", - osmFeature.properties.name === externalFeature.properties.name, - osmFeature.properties.name, - externalFeature.properties.name - ) - continue - } - continue - if (d < max_range) { - console.log("Found a match") - match_lengths.push([ - osmFeature.properties["@id"], - (i + " " + possibleMatches.join(",")).trim(), - d, - this.levenshteinDistancePharmacy( - externalFeature.properties.name, - osmFeature.properties.name - ), - externalFeature.properties.status, - ...this.conflate(osmFeature.properties, externalFeature.properties), - ]) - possibleMatches.push(osmFeature.properties["@id"]) - /* - possibleMatches.push({ - osmFeature, - d, - nameDist: Utils.levenshteinDistance( - osmFeature.properties.name, - externalFeature.properties.name - ), - })//*/ - } - // possibleMatches.sort((a, b) => b.d - a.d) + if (d < max_range) { + possibleMatches.push({ + external_feature: externalFeature, + osm_feature: osmFeature, + d, + }) } } - match_lengths.sort((a, b) => b[1] - a[1]) - console.log( - "The import probably happened between ", - this.earliestDate?.toISOString(), - "and", - this.latestDate?.toISOString() + return possibleMatches + } + + private async stillOnline(url: string): Promise { + // return true + if (url.indexOf("facebook.com") > 0) { + return true + } + const cachePath = this.historyCacheDir + "/urls/ " + url.replace(/[/\\:]/g, "_") + if (fs.existsSync(cachePath)) { + const online = JSON.parse(fs.readFileSync(cachePath, { encoding: "utf-8" })) + return online + } + let online: boolean | string = false + try { + online = await this.stillOnlineUncached(url) + } catch (e) { + console.log(e) + const urlObj = new URL(url) + if (e === "NOT_FOUND" && urlObj.pathname.length > 0) { + console.log("Maybe trying the homepage will help?") + } + } + fs.writeFileSync(cachePath, JSON.stringify(online, null, " "), { encoding: "utf-8" }) + return online + } + + private async stillOnlineUncached(url: string): Promise { + if (!url.startsWith("http")) { + url = "https://" + url + } + url = url.replace("http://", "https://") + try { + const result = await ScriptUtils.Download(url) + if (result["redirect"]) { + if (result["redirect"].startsWith("/")) { + return true + } + return result["redirect"] + } + if (result["content"]) { + return true + } + console.error("Got a result, but no content?", url, result) + } catch (e) { + console.log("Offline (error):", url, e.message) + return false + } + } + + private async historyCached(id): Promise { + const cachePath = this.historyCacheDir + "/" + id.replace("/", "_") + if (fs.existsSync(cachePath)) { + return JSON.parse(fs.readFileSync(cachePath, { encoding: "utf-8" })) + } + const history = await OsmObject.DownloadHistory(id).AsPromise((l) => l.length > 0) + fs.writeFileSync(cachePath, JSON.stringify(history, null, " "), { encoding: "utf-8" }) + return history + } + + private async normalize(properties: Record) { + if (properties["phone"]) { + properties["phone"] = new PhoneTextField().reformat(properties["phone"], () => "be") + } + if (properties["website"]) { + let website = properties.website.toLowerCase() + website + .replace("http://http://", "http://") + .replace("https//", "https://") + .replace("http://", "https://") + const validator = new UrlTextfieldDef() + if (validator.isValid(website)) { + properties.website = new UrlTextfieldDef().reformat(website) + const stillOnline = await this.stillOnline(website) + if (stillOnline === false) { + delete properties.website + } + if (typeof stillOnline === "string") { + properties.website = stillOnline + } + } else { + console.log("Invalid url:", website) + } + } + + if (properties["healthcare"] === "pharmacy") { + // we don't care about this tag + delete properties["healthcare"] + } + } + + private async replay(match: PossibleMatch): Promise { + const history = await this.historyCached(match.osm_feature.properties["@id"]) + + let certainly_imported = match.d < 0.0001 + let possibly_imported = false + + const resting_properties = { ...match.external_feature.properties } + await this.normalize(resting_properties) + + for (const historyElement of history) { + await this.normalize(historyElement.tags) + + if (historyElement.tags.name === resting_properties.name) { + possibly_imported = true + } + + for (const key in resting_properties) { + if (this.str_compare(historyElement.tags[key], resting_properties[key])) { + delete resting_properties[key] + } + } + } + + return { + certainly_imported, + possibly_imported, + resting_properties, + } + } + + private str_compare(a, b): boolean { + if (a === undefined || b === undefined) { + return false + } + a = a.toLowerCase().replaceAll(/[éèáàüë].*$/g, "") + b = b.toLowerCase().replaceAll(/[éèáàüë].*$/g, "") + + return a === b + } + + private async calculateMatches( + external_features: Feature[], + osm_features: Feature[], + max_range: number + ): Promise<{ match: PossibleMatch; replayed: ReplayResult }[]> { + const matches: { match: PossibleMatch; replayed: ReplayResult }[] = [] + for (const f of external_features) { + const match = await this.calculateMatch(osm_features, f, max_range) + if (match) { + matches.push(match) + } + } + return matches + } + + private async calculateMatch( + osm_features: Feature[], + externalFeature: Feature, + max_range: number + ): Promise<{ match: PossibleMatch; replayed: ReplayResult }> { + const possibleMatches = this.findPossibleMatchesFor( + osm_features, + externalFeature, + max_range ) - fs.writeFileSync( - "../onwheels-data-prep/match_lengths.tsv", - match_lengths.map((l) => l.join("\t")).join("\n") - ) - console.log(match_lengths) + let bestMatch: PossibleMatch = undefined + let bestMatchReplayed: ReplayResult = undefined + for (const possibleMatch of possibleMatches) { + const replayed = await this.replay(possibleMatch) + if ( + bestMatch === undefined || + (replayed.certainly_imported && !bestMatchReplayed.possibly_imported) || + (!bestMatchReplayed.certainly_imported && + replayed.possibly_imported && + !bestMatchReplayed.possibly_imported) + ) { + bestMatch = possibleMatch + bestMatchReplayed = replayed + } + } + if (bestMatch === undefined) { + return undefined + } + return { + replayed: bestMatchReplayed, + match: bestMatch, + } } private levenshteinDistancePharmacy(a?: string, b?: string) {