forked from MapComplete/MapComplete
		
	Conflation script
This commit is contained in:
		
							parent
							
								
									6f7437aa46
								
							
						
					
					
						commit
						8eda65a24f
					
				
					 1 changed file with 211 additions and 0 deletions
				
			
		
							
								
								
									
										211
									
								
								scripts/conflate.ts
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										211
									
								
								scripts/conflate.ts
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,211 @@ | ||||||
|  | import Script from "./Script" | ||||||
|  | import fs from "fs" | ||||||
|  | import { Feature } from "geojson" | ||||||
|  | import { GeoOperations } from "../Logic/GeoOperations" | ||||||
|  | import { Utils } from "../Utils" | ||||||
|  | import { OsmObject } from "../Logic/Osm/OsmObject" | ||||||
|  | 
 | ||||||
/**
 * Conflation script.
 *
 * Reads an OSM export and an external dataset (both GeoJSON feature collections), pairs every
 * external feature with the OSM features whose centerpoint lies within the search range and
 * writes the candidate pairs, together with the differing tags, as a tab-separated table.
 */
export class Conflate extends Script {
    /** Keys of the external dataset which are skipped when comparing tags in `conflate` */
    private static readonly ignoredExternalKeys: ReadonlySet<string> = new Set([
        "status",
        "healthcare",
        "unmeasurable_reason",
        "timestamp_created",
        "timestamp_last_modified",
    ])

    /** Prefixes which are tried on either name before the levenshtein distance is calculated */
    private static readonly namePrefixes: readonly string[] = [
        "",
        "pharmacie",
        "apotheek",
        "pharmacie de",
        "apotheke",
    ]

    /** Location of the generated table, relative to the current working directory */
    private static readonly outputPath = "../onwheels-data-prep/match_lengths.tsv"

    /** Latest 'first seen with the external name'-moment over all inspected objects; set by `findTimeFork` */
    private earliestDate: Date | undefined = undefined
    /** Earliest 'last seen with the external name'-moment over all inspected objects; set by `findTimeFork` */
    private latestDate: Date | undefined = undefined

    constructor() {
        super(
            [
                "Conflation script",
                "",
                "This script is meant to be used to prepare imports. It takes one 'OSM'-dataset and one external dataset and tries to find an OSM-id for every external item.",
                "",
                "Arguments:",
                "osm_file.geojson external_file.geojson [search_range]",
                "- osm_file.geojson: a file exported from overpass, including meta (note: filename MUST contain either OSM or OpenStreetMap)",
                "- external_file.geojson: the data to import. Tags should be prepared to have an OSM-name",
                "- search_range: max distance at which a match will occur",
            ].join("\n")
        )
    }

    /**
     * Reads the given file and returns the `features` list of the GeoJSON it contains.
     *
     * @throws Error if the parsed JSON has no `features` array
     */
    private static readFeatures(path: string): Feature[] {
        const parsed: unknown = JSON.parse(fs.readFileSync(path, { encoding: "utf-8" }))
        const features = (parsed as { features?: unknown } | null)?.features
        if (!Array.isArray(features)) {
            throw new Error("No 'features' array found in " + path)
        }
        return features as Feature[]
    }

    /**
     * Downloads the history of the given OSM object and determines the first and the last version
     * in which the `name` tag was identical to `externalName`.
     * If the most recent version still carries the external name, 'now' is used as latest date.
     *
     * As a side effect, `this.earliestDate` and `this.latestDate` are narrowed:
     * `earliestDate` only moves forward in time, `latestDate` only moves backward.
     *
     * @param externalName the name as found in the external dataset
     * @param osmName currently unused
     * @param osmId the id of the object of which the history is downloaded
     * @returns the found dates (each `undefined` if the name never matched),
     *          or `undefined` if a timestamp in the history could not be parsed
     */
    private async findTimeFork(
        externalName: string,
        osmName: string,
        osmId: string
    ): Promise<
        { earliestDateOfImport: Date | undefined; latestDateOfImport: Date | undefined } | undefined
    > {
        // The promise only resolves once the history contains at least one version
        const history = await OsmObject.DownloadHistory(osmId).AsPromise((h) => h.length > 0)
        let earliest: Date | undefined = undefined
        let latest: Date | undefined = undefined
        for (const historyElement of history) {
            const rawTimestamp = historyElement.tags["_last_edit:timestamp"]
            const csTime = new Date(rawTimestamp)
            if (isNaN(csTime.getTime())) {
                console.error("Could not parse " + rawTimestamp)
                return undefined
            }
            if (historyElement.tags.name !== externalName) {
                continue
            }
            if (earliest === undefined) {
                earliest = csTime
            }
            latest = csTime
        }

        if (history.at(-1).tags.name === externalName) {
            // Not changed yet, so no actual hint about when this import could have happened
            latest = new Date()
        }

        if (
            earliest !== undefined &&
            (this.earliestDate === undefined || earliest.getTime() > this.earliestDate.getTime())
        ) {
            this.earliestDate = earliest
        }
        if (
            latest !== undefined &&
            (this.latestDate === undefined || latest.getTime() < this.latestDate.getTime())
        ) {
            this.latestDate = latest
        }

        return { earliestDateOfImport: earliest, latestDateOfImport: latest }
    }

    /**
     * Entry point of the script.
     *
     * @param args `[osm_file.geojson, external_file.geojson, search_range?]`; the search range defaults to 50
     * @throws Error if a path is missing, if the paths do not follow the 'osm'-naming rule
     *         or if the search range is not a non-negative number
     */
    async main(args: string[]): Promise<void> {
        const [osm_file_path, external_file_path] = args
        if (osm_file_path === undefined || external_file_path === undefined) {
            throw new Error(
                "Expected arguments: osm_file.geojson external_file.geojson [search_range]"
            )
        }

        let max_range = 50
        if (args.length >= 3) {
            max_range = Number(args[2])
            if (!Number.isFinite(max_range) || max_range < 0) {
                // A NaN range would silently match nothing at all
                throw new Error("search_range must be a non-negative number, but got: " + args[2])
            }
        }

        const osmPathLower = osm_file_path.toLowerCase()
        if (!osmPathLower.includes("osm") && !osmPathLower.includes("openstreetmap")) {
            throw new Error("OSM File path must contain 'osm' or 'openStreetMap'")
        }

        const externalPathLower = external_file_path.toLowerCase()
        if (externalPathLower.includes("osm") || externalPathLower.includes("openstreetmap")) {
            throw new Error("External File path may not contain 'osm' or 'openStreetMap'")
        }

        const external_features = Conflate.readFeatures(external_file_path)
        const osm_features = Conflate.readFeatures(osm_file_path)

        // The columns as they are pushed below; the last column repeats once per differing tag
        const header: (string | number)[] = [
            "osm_id",
            "external_index",
            "match_distance",
            "name_levenshtein_distance",
            "status",
            "tag_differences",
        ]
        const matches: { externalIndex: number; cells: (string | number)[] }[] = []

        // The centerpoint of an OSM-feature is the same for every external feature: calculate it only once
        const osmCenters = osm_features.map((osmFeature) =>
            GeoOperations.centerpointCoordinates(osmFeature)
        )

        for (let i = 0; i < external_features.length; i++) {
            const externalFeature = external_features[i]
            // GeoJSON allows 'properties' to be null
            const externalProperties = externalFeature.properties ?? {}
            const externalCenter = GeoOperations.centerpointCoordinates(externalFeature)
            // Ids of the OSM-features that were already matched with this external feature
            const possibleMatches: string[] = []

            for (let j = 0; j < osm_features.length; j++) {
                const osmProperties = osm_features[j].properties ?? {}
                const d = GeoOperations.distanceBetween(externalCenter, osmCenters[j])

                if (d === 0) {
                    // Exact matches are only reported on the console, not added to the table
                    console.log(
                        "Found an exact match (name match: ",
                        osmProperties.name === externalProperties.name,
                        osmProperties.name,
                        externalProperties.name
                    )
                    continue
                }

                if (d < max_range) {
                    console.log("Found a match")
                    matches.push({
                        externalIndex: i,
                        cells: [
                            osmProperties["@id"],
                            (i + " " + possibleMatches.join(",")).trim(),
                            d,
                            this.levenshteinDistancePharmacy(
                                externalProperties.name,
                                osmProperties.name
                            ),
                            externalProperties.status,
                            ...this.conflate(osmProperties, externalProperties),
                        ],
                    })
                    possibleMatches.push(osmProperties["@id"])
                }
            }
        }

        // Highest external index first; Array.prototype.sort is stable, so the matches of one
        // external feature stay in the order in which they were found
        matches.sort((a, b) => b.externalIndex - a.externalIndex)
        const match_lengths: (string | number)[][] = [header, ...matches.map((m) => m.cells)]

        console.log(
            "The import probably happened between ",
            this.earliestDate?.toISOString(),
            "and",
            this.latestDate?.toISOString()
        )
        fs.writeFileSync(
            Conflate.outputPath,
            match_lengths.map((l) => l.join("\t")).join("\n")
        )
        console.log(match_lengths)
    }

    /**
     * Case-insensitive levenshtein distance between two names, where either name may additionally
     * be prepended with one of `namePrefixes`; the smallest distance over all variants is returned.
     * A missing name is treated as the empty string.
     */
    private levenshteinDistancePharmacy(a?: string, b?: string): number {
        const left = (a ?? "").toLowerCase()
        const right = (b ?? "").toLowerCase()
        return Math.min(
            ...Conflate.namePrefixes.map((prefix) =>
                Math.min(
                    Utils.levenshteinDistance(left, prefix + right),
                    Utils.levenshteinDistance(prefix + left, right)
                )
            )
        )
    }

    /**
     * Compares the tags of the external feature with the tags of the OSM feature.
     *
     * @param osmFeature the properties of the OSM feature
     * @param externalFeature the properties of the external feature
     * @returns one entry per external key (except `ignoredExternalKeys`) which differs:
     *          `+key=value` if OSM does not have the key,
     *          `~key=value (osm: osmValue)` if OSM has a different value.
     *          Newlines and tabs are escaped so that every entry fits in one TSV cell
     */
    private conflate(
        osmFeature: Record<string, string>,
        externalFeature: Record<string, string>
    ): string[] {
        const differences: string[] = []

        for (const [key, externalValue] of Object.entries(externalFeature)) {
            if (Conflate.ignoredExternalKeys.has(key)) {
                continue
            }
            const osmValue = osmFeature[key]
            if (osmValue === undefined) {
                differences.push("+" + key + "=" + externalValue)
            } else if (osmValue !== externalValue) {
                differences.push("~" + key + "=" + externalValue + " (osm: " + osmValue + ")")
            }
        }

        return differences.map((l) => l.replace(/\n/g, "\\n").replace(/\t/g, "\\t"))
    }
}
|  | 
 | ||||||
// Script entry point: create the conflation script and invoke its `run()` method
const conflateScript = new Conflate()
conflateScript.run()
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue