From 8eda65a24f129a406c23b7a248bc7b3d3c716b09 Mon Sep 17 00:00:00 2001 From: Pieter Vander Vennet Date: Sun, 5 Mar 2023 23:42:49 +0100 Subject: [PATCH] Conflation script --- scripts/conflate.ts | 211 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 scripts/conflate.ts diff --git a/scripts/conflate.ts b/scripts/conflate.ts new file mode 100644 index 000000000..17d84292f --- /dev/null +++ b/scripts/conflate.ts @@ -0,0 +1,211 @@ +import Script from "./Script" +import fs from "fs" +import { Feature } from "geojson" +import { GeoOperations } from "../Logic/GeoOperations" +import { Utils } from "../Utils" +import { OsmObject } from "../Logic/Osm/OsmObject" + +export class Conflate extends Script { + constructor() { + super( + [ + "Conflation script", + "", + "This script is meant to be used to prepare imports. It takes one 'OSM'-dataset and one external dataset and tries to find an OSM-id for every external item.", + "", + "Arguments:", + "osm_file.geojson external_file.geojson [search_range]", + "- osm_file.geojson: a file exported from overpass, including meta (note: filename MUST contain either OSM or OpenStreetMap)", + "- external_file.geojson: the data to import. Tags should be prepared to have an OSM-name", + "- search_range: max distance at which a match will occur", + ].join("\n") + ) + } + + private async findTimeFork( + externalName: string, + osmName: string, + osmId: string + ): Promise<{ earliestDateOfImport; latestDateOfImport }> { + const history = await OsmObject.DownloadHistory(osmId).AsPromise((h) => h.length > 0) + let earliest: Date = undefined + let latest: Date = undefined + for (const historyElement of history) { + const csTime = new Date(historyElement.tags["_last_edit:timestamp"]) + if (isNaN(csTime.getTime())) { + console.error("Could not parse" + historyElement.tags["_last_edit:timestamp"]) + return undefined + } + const nameIdentical = historyElement.tags.name === externalName + if (nameIdentical) { + if (earliest == undefined) { + earliest = csTime + } + latest = csTime + } + } + + if (history.at(-1).tags.name === externalName) { + // Not changed yet, so no actual hint about when this import could have happened + latest = new Date() + } + + if (this.earliestDate === undefined || earliest?.getTime() > this.earliestDate?.getTime()) { + this.earliestDate = earliest + } + if (this.latestDate === undefined || latest?.getTime() < this.latestDate?.getTime()) { + this.latestDate = latest + } + + return { earliestDateOfImport: earliest, latestDateOfImport: latest } + } + + private earliestDate: Date = undefined + private latestDate: Date = undefined + + async main(args: string[]): Promise { + const [osm_file_path, external_file_path] = args + let max_range = 50 + if (args.length === 3) { + max_range = Number(args[2]) + } + if ( + osm_file_path.toLowerCase().indexOf("osm") < 0 && + osm_file_path.toLowerCase().indexOf("openstreetmap") < 0 + ) { + throw "OSM File path must contain 'osm' or 'openStreetMap'" + } + + if ( + external_file_path.toLowerCase().indexOf("osm") >= 0 || + external_file_path.toLowerCase().indexOf("openstreetmap") >= 0 + ) { + throw "External File path may not contain 'osm' or 'openStreetMap'" + } + + const external_features: Feature[] = JSON.parse( + fs.readFileSync(external_file_path, { encoding: "utf-8" }) + ).features + const osm_features: Feature[] = JSON.parse( + fs.readFileSync(osm_file_path, { encoding: "utf-8" }) + ).features + + const match_lengths: (string | number)[][] = [ + [ + "osm_id", + "external_index", + "match_distance", + "name_levenshtein_distance", + "osm_data", + "external_data", + "status", + ], + ] + for (let i = 0; i < external_features.length; i++) { + // console.log("Inspecting " + (i + 1) + "/" + external_features.length) + const externalFeature = external_features[i] + const possibleMatches: number[] = [] + for (const osmFeature of osm_features) { + const d = GeoOperations.distanceBetween( + GeoOperations.centerpointCoordinates(externalFeature), + GeoOperations.centerpointCoordinates(osmFeature) + ) + + if (d === 0) { + console.log( + "Found an exact match (name match: ", + osmFeature.properties.name === externalFeature.properties.name, + osmFeature.properties.name, + externalFeature.properties.name + ) + continue + } + continue + if (d < max_range) { + console.log("Found a match") + match_lengths.push([ + osmFeature.properties["@id"], + (i + " " + possibleMatches.join(",")).trim(), + d, + this.levenshteinDistancePharmacy( + externalFeature.properties.name, + osmFeature.properties.name + ), + externalFeature.properties.status, + ...this.conflate(osmFeature.properties, externalFeature.properties), + ]) + possibleMatches.push(osmFeature.properties["@id"]) + /* + possibleMatches.push({ + osmFeature, + d, + nameDist: Utils.levenshteinDistance( + osmFeature.properties.name, + externalFeature.properties.name + ), + })//*/ + } + // possibleMatches.sort((a, b) => b.d - a.d) + } + } + match_lengths.sort((a, b) => b[1] - a[1]) + console.log( + "The import probably happened between ", + this.earliestDate?.toISOString(), + "and", + this.latestDate?.toISOString() + ) + fs.writeFileSync( + "../onwheels-data-prep/match_lengths.tsv", + match_lengths.map((l) => l.join("\t")).join("\n") + ) + console.log(match_lengths) + } + + private levenshteinDistancePharmacy(a?: string, b?: string) { + a ??= "" + b ??= "" + a = a.toLowerCase() + b = b.toLowerCase() + return Math.min( + ...["", "pharmacie", "apotheek", "pharmacie de", "apotheke"].map((prefix) => + Math.min( + Utils.levenshteinDistance(a, prefix + b), + Utils.levenshteinDistance(prefix + a, b) + ) + ) + ) + } + + private conflate( + osmFeature: Record, + externalFeature: Record + ): string[] { + const r: string[] = [] + + for (const externalFeatureKey in externalFeature) { + if ( + [ + "status", + "healthcare", + "unmeasurable_reason", + "timestamp_created", + "timestamp_last_modified", + ].indexOf(externalFeatureKey) >= 0 + ) { + continue + } + const v = externalFeature[externalFeatureKey] + const osmV = osmFeature[externalFeatureKey] + if (osmV === undefined) { + r.push("+" + externalFeatureKey + "=" + v) + } else if (osmV !== v) { + r.push("~" + externalFeatureKey + "=" + v + " (osm: " + osmV + ")") + } + } + + return r.map((l) => l.replace(/\n/g, "\\n")) + } +} + +new Conflate().run()