forked from MapComplete/MapComplete
		
	Conflation script
This commit is contained in:
		
							parent
							
								
									6f7437aa46
								
							
						
					
					
						commit
						8eda65a24f
					
				
					 1 changed files with 211 additions and 0 deletions
				
			
		
							
								
								
									
										211
									
								
								scripts/conflate.ts
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										211
									
								
								scripts/conflate.ts
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,211 @@ | |||
| import Script from "./Script" | ||||
| import fs from "fs" | ||||
| import { Feature } from "geojson" | ||||
| import { GeoOperations } from "../Logic/GeoOperations" | ||||
| import { Utils } from "../Utils" | ||||
| import { OsmObject } from "../Logic/Osm/OsmObject" | ||||
| 
 | ||||
/**
 * Conflation script: prepares an import by looking up candidate OSM objects for external features.
 *
 * `main` reads an OSM export and an external dataset (both GeoJSON), pairs every external feature
 * with every OSM feature which lies closer than the search range and writes one row per pair
 * into a tab-separated file.
 */
export class Conflate extends Script {
    /** Location of the tab-separated result file, resolved against the working directory. */
    private static readonly outputPath = "../onwheels-data-prep/match_lengths.tsv"

    /** Search range which is used when no third argument is given. */
    private static readonly defaultRange = 50

    /**
     * Column names of the result file, in the order in which `main` fills the rows.
     * The last column repeats: every difference reported by `conflate` gets a cell of its own.
     */
    private static readonly header: string[] = [
        "osm_id",
        "external_index",
        "match_distance",
        "name_levenshtein_distance",
        "status",
        "tag_differences",
    ]

    /** Keys of the external dataset which are never compared against the OSM tags. */
    private static readonly ignoredKeys: ReadonlySet<string> = new Set([
        "status",
        "healthcare",
        "unmeasurable_reason",
        "timestamp_created",
        "timestamp_last_modified",
    ])

    /** Prefixes which are tried on either name before the edit distance is calculated. */
    private static readonly namePrefixes: readonly string[] = [
        "",
        "pharmacie",
        "apotheek",
        "pharmacie de",
        "apotheke",
    ]

    /** Latest 'first seen'-moment over all `findTimeFork` calls so far. */
    private earliestDate: Date | undefined = undefined
    /** Earliest 'last seen'-moment over all `findTimeFork` calls so far. */
    private latestDate: Date | undefined = undefined

    constructor() {
        super(
            [
                "Conflation script",
                "",
                "This script is meant to be used to prepare imports. It takes one 'OSM'-dataset and one external dataset and tries to find an OSM-id for every external item.",
                "",
                "Arguments:",
                "osm_file.geojson external_file.geojson [search_range]",
                "- osm_file.geojson: a file exported from overpass, including meta (note: filename MUST contain either OSM or OpenStreetMap)",
                "- external_file.geojson: the data to import. Tags should be prepared to have an OSM-name",
                "- search_range: max distance at which a match will occur",
            ].join("\n")
        )
    }

    /**
     * Matches every external feature against the OSM features and writes the candidates to disk.
     *
     * @param args `[osm_file, external_file, search_range?]`; the OSM path must contain 'osm' or
     * 'openstreetmap' (case-insensitive), the external path must not.
     * @throws Error if arguments are missing, a path violates the naming rule, the search range
     * is not a number or a file has no `features` array
     */
    async main(args: string[]): Promise<void> {
        if (args.length < 2) {
            throw new Error(
                "Expected arguments: osm_file.geojson external_file.geojson [search_range]"
            )
        }
        const [osm_file_path, external_file_path] = args

        let max_range = Conflate.defaultRange
        if (args[2] !== undefined) {
            max_range = Number(args[2])
            if (Number.isNaN(max_range)) {
                // A NaN range would silently reject every candidate (d < NaN is always false)
                throw new Error("search_range must be a number, but got '" + args[2] + "'")
            }
        }

        // The naming rule protects against accidentally swapping the two files
        const osmPathLower = osm_file_path.toLowerCase()
        if (!osmPathLower.includes("osm") && !osmPathLower.includes("openstreetmap")) {
            throw new Error("OSM File path must contain 'osm' or 'openStreetMap'")
        }
        const externalPathLower = external_file_path.toLowerCase()
        if (externalPathLower.includes("osm") || externalPathLower.includes("openstreetmap")) {
            throw new Error("External File path may not contain 'osm' or 'openStreetMap'")
        }

        const external_features = Conflate.readFeatures(external_file_path)
        const osm_features = Conflate.readFeatures(osm_file_path)

        // The sort keys are kept next to the cells, as the 'external_index'-cell is free text
        const rows: { externalIndex: number; distance: number; cells: (string | number)[] }[] = []
        for (let i = 0; i < external_features.length; i++) {
            const externalFeature = external_features[i]
            // GeoJSON allows 'properties' to be null
            const externalProperties = externalFeature.properties ?? {}
            // Does not depend on the OSM feature, thus calculated once per external feature
            const externalCenter = GeoOperations.centerpointCoordinates(externalFeature)
            // Ids of the OSM features which were already matched with this external feature
            const possibleMatches: string[] = []

            for (const osmFeature of osm_features) {
                const osmProperties = osmFeature.properties ?? {}
                const d = GeoOperations.distanceBetween(
                    externalCenter,
                    GeoOperations.centerpointCoordinates(osmFeature)
                )

                if (d === 0) {
                    // Only logged: an exact match is still recorded below like any other match
                    console.log(
                        "Found an exact match (name match: ",
                        osmProperties.name === externalProperties.name,
                        osmProperties.name,
                        externalProperties.name
                    )
                }
                if (d < max_range) {
                    console.log("Found a match")
                    const osmId = osmProperties["@id"]
                    rows.push({
                        externalIndex: i,
                        distance: d,
                        cells: [
                            osmId,
                            // The index, followed by the ids that were matched earlier on
                            (i + " " + possibleMatches.join(",")).trim(),
                            d,
                            this.levenshteinDistancePharmacy(
                                externalProperties.name,
                                osmProperties.name
                            ),
                            externalProperties.status,
                            ...this.conflate(osmProperties, externalProperties),
                        ],
                    })
                    possibleMatches.push(osmId)
                }
            }
        }

        // Highest external index first; the closest candidate comes first within one index
        rows.sort((a, b) => b.externalIndex - a.externalIndex || a.distance - b.distance)
        // The header is added after sorting, so that it always stays on the first line
        const match_lengths: (string | number)[][] = [
            Conflate.header,
            ...rows.map((row) => row.cells),
        ]

        console.log(
            "The import probably happened between ",
            this.earliestDate?.toISOString(),
            "and",
            this.latestDate?.toISOString()
        )
        fs.writeFileSync(
            Conflate.outputPath,
            match_lengths.map((l) => l.join("\t")).join("\n")
        )
        console.log(match_lengths)
    }

    /**
     * Reads a GeoJSON file from disk and returns its `features`.
     *
     * @throws Error if the parsed file has no `features` array
     */
    private static readFeatures(path: string): Feature[] {
        const parsed: unknown = JSON.parse(fs.readFileSync(path, { encoding: "utf-8" }))
        const features = (parsed as { features?: unknown } | null)?.features
        if (!Array.isArray(features)) {
            throw new Error("No 'features'-array found in " + path)
        }
        return features
    }

    /**
     * Scans the history of an OSM object for the versions whose name equals `externalName`.
     * Also narrows `earliestDate` and `latestDate` of this script with the result.
     * Note: nothing in this class calls this method.
     *
     * @param externalName the name as found in the external dataset
     * @param osmName not used
     * @param osmId id of the object of which the history is downloaded
     * @returns the timestamps of the first and the last version carrying `externalName`
     * (the current moment if the newest version still carries it), or `undefined` if a
     * timestamp of the history could not be parsed
     */
    private async findTimeFork(
        externalName: string,
        osmName: string,
        osmId: string
    ): Promise<
        | { earliestDateOfImport: Date | undefined; latestDateOfImport: Date | undefined }
        | undefined
    > {
        const history = await OsmObject.DownloadHistory(osmId).AsPromise((h) => h.length > 0)
        let earliest: Date | undefined = undefined
        let latest: Date | undefined = undefined
        for (const historyElement of history) {
            const rawTimestamp = historyElement.tags["_last_edit:timestamp"]
            const csTime = new Date(rawTimestamp)
            if (isNaN(csTime.getTime())) {
                console.error("Could not parse " + rawTimestamp)
                return undefined
            }
            if (historyElement.tags.name === externalName) {
                earliest ??= csTime
                latest = csTime
            }
        }

        if (history.at(-1)?.tags.name === externalName) {
            // Not changed yet, so no actual hint about when this import could have happened
            latest = new Date()
        }

        if (
            earliest !== undefined &&
            (this.earliestDate === undefined || earliest.getTime() > this.earliestDate.getTime())
        ) {
            this.earliestDate = earliest
        }
        if (
            latest !== undefined &&
            (this.latestDate === undefined || latest.getTime() < this.latestDate.getTime())
        ) {
            this.latestDate = latest
        }

        return { earliestDateOfImport: earliest, latestDateOfImport: latest }
    }

    /**
     * Case-insensitive edit distance between two names, where a missing name counts as "".
     * Every entry of `namePrefixes` is tried in front of either name; the smallest distance wins.
     */
    private levenshteinDistancePharmacy(a?: string, b?: string) {
        const left = (a ?? "").toLowerCase()
        const right = (b ?? "").toLowerCase()
        return Math.min(
            ...Conflate.namePrefixes.map((prefix) =>
                Math.min(
                    Utils.levenshteinDistance(left, prefix + right),
                    Utils.levenshteinDistance(prefix + left, right)
                )
            )
        )
    }

    /**
     * Lists how the external properties differ from the OSM tags.
     *
     * @param osmFeature the tags of the OSM object
     * @param externalFeature the properties of the external feature
     * @returns one entry per key which is not in `ignoredKeys`: `+key=value` if OSM has no such
     * key, `~key=value (osm: osmValue)` if the values differ. Newlines and tabs are escaped,
     * as every entry ends up in a cell of a tab-separated file.
     */
    private conflate(
        osmFeature: Record<string, string>,
        externalFeature: Record<string, string>
    ): string[] {
        const r: string[] = []

        for (const [key, v] of Object.entries(externalFeature)) {
            if (Conflate.ignoredKeys.has(key)) {
                continue
            }
            const osmV = osmFeature[key]
            if (osmV === undefined) {
                r.push("+" + key + "=" + v)
            } else if (osmV !== v) {
                r.push("~" + key + "=" + v + " (osm: " + osmV + ")")
            }
        }

        return r.map((l) => l.replace(/\n/g, "\\n").replace(/\t/g, "\\t"))
    }
}
| 
 | ||||
// Entry point: instantiate the script and start it
const conflateScript = new Conflate()
conflateScript.run()
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue