forked from MapComplete/MapComplete
		
	Some more experimentation with the conflation script
This commit is contained in:
		
							parent
							
								
									8eda65a24f
								
							
						
					
					
						commit
						99cb879cfe
					
				
					 3 changed files with 321 additions and 102 deletions
				
			
		|  | @ -46,7 +46,8 @@ | |||
|     "weblate-merge": "git remote update weblate-github; git merge weblate-github/weblate-mapcomplete-core weblate-github/weblate-mapcomplete-layers weblate-github/weblate-mapcomplete-layer-translations", | ||||
|     "weblate-fix-heavy": "git fetch weblate-hosted-layers; git fetch weblate-hosted-core; git merge weblate-hosted-layers/master weblate-hosted-core/master ", | ||||
|     "housekeeping": "git pull && npm run weblate-fix-heavy && npm run generate && npm run generate:docs && npm run generate:contributor-list && vite-node scripts/fetchLanguages.ts && npm run format && git add assets/ langs/ Docs/ **/*.ts Docs/* && git commit -m 'chore: automated housekeeping...'", | ||||
|     "parseSchools": "vite-node scripts/schools/amendSchoolData.ts" | ||||
|     "parseSchools": "vite-node scripts/schools/amendSchoolData.ts", | ||||
|     "conflate": "vite-node scripts/conflate.ts -- ../onwheels-data-prep/osm_pharmacies.geojson ../onwheels-data-prep/OnWheelsData_apotheek.geojson" | ||||
|   }, | ||||
|   "keywords": [ | ||||
|     "OpenStreetMap", | ||||
|  |  | |||
|  | @ -146,17 +146,20 @@ export default class ScriptUtils { | |||
| 
 | ||||
|     private static async DownloadJSON(url: string, headers?: any): Promise<any> { | ||||
|         const data = await ScriptUtils.Download(url, headers) | ||||
|         return JSON.parse(data.content) | ||||
|         return JSON.parse(data["content"]) | ||||
|     } | ||||
| 
 | ||||
|     private static Download(url: string, headers?: any): Promise<{ content: string }> { | ||||
|     public static Download( | ||||
|         url: string, | ||||
|         headers?: any | ||||
|     ): Promise<{ content: string } | { redirect: string }> { | ||||
|         return new Promise((resolve, reject) => { | ||||
|             try { | ||||
|                 headers = headers ?? {} | ||||
|                 headers.accept = "application/json" | ||||
|                 console.log(" > ScriptUtils.DownloadJson(", url, ")") | ||||
|                 console.log(" > ScriptUtils.Download(", url, ")") | ||||
|                 const urlObj = new URL(url) | ||||
|                 https.get( | ||||
|                 const request = https.get( | ||||
|                     { | ||||
|                         host: urlObj.host, | ||||
|                         path: urlObj.pathname + urlObj.search, | ||||
|  | @ -173,10 +176,26 @@ export default class ScriptUtils { | |||
|                         }) | ||||
| 
 | ||||
|                         res.addListener("end", function () { | ||||
|                             if (res.statusCode === 301 || res.statusCode === 302) { | ||||
|                                 console.log("Got a redirect:", res.headers.location) | ||||
|                                 resolve({ redirect: res.headers.location }) | ||||
|                             } | ||||
|                             if (res.statusCode >= 400) { | ||||
|                                 console.log( | ||||
|                                     "Error while fetching ", | ||||
|                                     url, | ||||
|                                     "due to", | ||||
|                                     res.statusMessage | ||||
|                                 ) | ||||
|                                 reject(res.statusCode) | ||||
|                             } | ||||
|                             resolve({ content: parts.join("") }) | ||||
|                         }) | ||||
|                     } | ||||
|                 ) | ||||
|                 request.on("error", function (e) { | ||||
|                     reject(e) | ||||
|                 }) | ||||
|             } catch (e) { | ||||
|                 reject(e) | ||||
|             } | ||||
|  |  | |||
|  | @ -4,8 +4,31 @@ import { Feature } from "geojson" | |||
| import { GeoOperations } from "../Logic/GeoOperations" | ||||
| import { Utils } from "../Utils" | ||||
| import { OsmObject } from "../Logic/Osm/OsmObject" | ||||
| import { PhoneTextField, UrlTextfieldDef } from "../UI/Input/ValidatedTextField" | ||||
| import { OsmId } from "../Models/OsmFeature" | ||||
| import ScriptUtils from "./ScriptUtils" | ||||
| 
 | ||||
/**
 * A candidate pairing of one feature of the external dataset with one OSM feature,
 * together with the distance between the two.
 */
| interface PossibleMatch { | ||||
|     /** | ||||
|      * Distance in meter between the OSM-data and the external dataset | ||||
|      */ | ||||
|     d: number | ||||
| 
 | ||||
    /** The OSM feature of this pairing */
|     osm_feature: Feature | ||||
    /** The feature from the external dataset of this pairing */
|     external_feature: Feature | ||||
| } | ||||
| 
 | ||||
/**
 * Outcome of comparing one external feature with the version history of an OSM feature.
 */
| interface ReplayResult { | ||||
    /** Set from the match distance: true when the two features are less than 0.0001 apart */
|     certainly_imported?: boolean | ||||
    /** True when a version in the OSM history has the same `name` as the external feature */
|     possibly_imported?: boolean | ||||
    /** The (normalized) external properties for which no equal value was found in any history version */
|     resting_properties?: Record<string, string> | ||||
| } | ||||
| 
 | ||||
| export class Conflate extends Script { | ||||
|     private earliestDate: Date = undefined | ||||
|     private latestDate: Date = undefined | ||||
|     private readonly historyCacheDir = "/tmp/cache/" | ||||
| 
 | ||||
|     constructor() { | ||||
|         super( | ||||
|             [ | ||||
|  | @ -22,10 +45,88 @@ export class Conflate extends Script { | |||
|         ) | ||||
|     } | ||||
| 
 | ||||
|     async main(args: string[]): Promise<void> { | ||||
|         const [osm_file_path, external_file_path] = args | ||||
|         let max_range = 50 | ||||
|         if (args.length === 3) { | ||||
|             max_range = Number(args[2]) | ||||
|         } | ||||
|         if ( | ||||
|             osm_file_path.toLowerCase().indexOf("osm") < 0 && | ||||
|             osm_file_path.toLowerCase().indexOf("openstreetmap") < 0 | ||||
|         ) { | ||||
|             throw "OSM File path must contain 'osm' or 'openStreetMap'" | ||||
|         } | ||||
| 
 | ||||
|         if ( | ||||
|             external_file_path.toLowerCase().indexOf("osm") >= 0 || | ||||
|             external_file_path.toLowerCase().indexOf("openstreetmap") >= 0 | ||||
|         ) { | ||||
|             throw "External File path may not contain 'osm' or 'openStreetMap'" | ||||
|         } | ||||
| 
 | ||||
|         const external_features: Feature[] = JSON.parse( | ||||
|             fs.readFileSync(external_file_path, { encoding: "utf-8" }) | ||||
|         ).features | ||||
|         const osm_features: Feature[] = JSON.parse( | ||||
|             fs.readFileSync(osm_file_path, { encoding: "utf-8" }) | ||||
|         ).features | ||||
| 
 | ||||
|         const bestMatches = await this.calculateMatches(external_features, osm_features, max_range) | ||||
|         const unmatched = external_features.filter( | ||||
|             (f) => !bestMatches.some((matched) => matched.match.external_feature === f) | ||||
|         ) | ||||
|         const match_lengths: (string | number)[][] = [ | ||||
|             [ | ||||
|                 "osm_id", | ||||
|                 "match_distance", | ||||
|                 "osm_name", | ||||
|                 "imported", | ||||
|                 "status_external", | ||||
|                 "...properties_differences", | ||||
|             ], | ||||
|         ] | ||||
|         for (const { match, replayed } of bestMatches) { | ||||
|             const { external_feature, d, osm_feature } = match | ||||
|             const { possibly_imported, certainly_imported, resting_properties } = replayed | ||||
|             const status = resting_properties["status"] | ||||
|             delete resting_properties["status"] | ||||
|             if (Object.keys(resting_properties).length === 0) { | ||||
|                 continue | ||||
|             } | ||||
|             match_lengths.push([ | ||||
|                 osm_feature.properties["@id"], | ||||
|                 d, | ||||
|                 osm_feature.properties.name, | ||||
|                 certainly_imported ? "import" : possibly_imported ? "prob import" : "new", | ||||
|                 status, | ||||
|                 JSON.stringify(resting_properties), | ||||
|             ]) | ||||
|         } | ||||
| 
 | ||||
|         fs.writeFileSync( | ||||
|             "../onwheels-data-prep/matches.tsv", | ||||
|             match_lengths.map((l) => l.join("\t")).join("\n") | ||||
|         ) | ||||
| 
 | ||||
|         fs.writeFileSync( | ||||
|             "../onwheels-data-prep/unmatched.geojson", | ||||
|             JSON.stringify( | ||||
|                 { | ||||
|                     type: "FeatureCollection", | ||||
|                     features: unmatched, | ||||
|                 }, | ||||
| 
 | ||||
|                 null, | ||||
|                 "  " | ||||
|             ) | ||||
|         ) | ||||
|     } | ||||
| 
 | ||||
|     private async findTimeFork( | ||||
|         externalName: string, | ||||
|         osmName: string, | ||||
|         osmId: string | ||||
|         osmId: OsmId | ||||
|     ): Promise<{ earliestDateOfImport; latestDateOfImport }> { | ||||
|         const history = await OsmObject.DownloadHistory(osmId).AsPromise((h) => h.length > 0) | ||||
|         let earliest: Date = undefined | ||||
|  | @ -60,106 +161,204 @@ export class Conflate extends Script { | |||
|         return { earliestDateOfImport: earliest, latestDateOfImport: latest } | ||||
|     } | ||||
| 
 | ||||
|     private earliestDate: Date = undefined | ||||
|     private latestDate: Date = undefined | ||||
| 
 | ||||
|     async main(args: string[]): Promise<void> { | ||||
|         const [osm_file_path, external_file_path] = args | ||||
|         let max_range = 50 | ||||
|         if (args.length === 3) { | ||||
|             max_range = Number(args[2]) | ||||
|         } | ||||
|         if ( | ||||
|             osm_file_path.toLowerCase().indexOf("osm") < 0 && | ||||
|             osm_file_path.toLowerCase().indexOf("openstreetmap") < 0 | ||||
|         ) { | ||||
|             throw "OSM File path must contain 'osm' or 'openStreetMap'" | ||||
|         } | ||||
| 
 | ||||
|         if ( | ||||
|             external_file_path.toLowerCase().indexOf("osm") >= 0 || | ||||
|             external_file_path.toLowerCase().indexOf("openstreetmap") >= 0 | ||||
|         ) { | ||||
|             throw "External File path may not contain 'osm' or 'openStreetMap'" | ||||
|         } | ||||
| 
 | ||||
|         const external_features: Feature[] = JSON.parse( | ||||
|             fs.readFileSync(external_file_path, { encoding: "utf-8" }) | ||||
|         ).features | ||||
|         const osm_features: Feature[] = JSON.parse( | ||||
|             fs.readFileSync(osm_file_path, { encoding: "utf-8" }) | ||||
|         ).features | ||||
| 
 | ||||
|         const match_lengths: (string | number)[][] = [ | ||||
|             [ | ||||
|                 "osm_id", | ||||
|                 "external_index", | ||||
|                 "match_distance", | ||||
|                 "name_levenshtein_distance", | ||||
|                 "osm_data", | ||||
|                 "external_data", | ||||
|                 "status", | ||||
|             ], | ||||
|         ] | ||||
|         for (let i = 0; i < external_features.length; i++) { | ||||
|             // console.log("Inspecting " + (i + 1) + "/" + external_features.length)
 | ||||
|             const externalFeature = external_features[i] | ||||
|             const possibleMatches: number[] = [] | ||||
|     private findPossibleMatchesFor( | ||||
|         osm_features: Feature[], | ||||
|         externalFeature: Feature, | ||||
|         max_range: number | ||||
|     ): PossibleMatch[] { | ||||
|         const possibleMatches: PossibleMatch[] = [] | ||||
|         for (const osmFeature of osm_features) { | ||||
|             const d = GeoOperations.distanceBetween( | ||||
|                 GeoOperations.centerpointCoordinates(externalFeature), | ||||
|                 GeoOperations.centerpointCoordinates(osmFeature) | ||||
|             ) | ||||
| 
 | ||||
|                 if (d === 0) { | ||||
|                     console.log( | ||||
|                         "Found an exact match (name match: ", | ||||
|                         osmFeature.properties.name === externalFeature.properties.name, | ||||
|                         osmFeature.properties.name, | ||||
|                         externalFeature.properties.name | ||||
|                     ) | ||||
|                     continue | ||||
|                 } | ||||
|                 continue | ||||
|             if (d < max_range) { | ||||
|                     console.log("Found a match") | ||||
|                     match_lengths.push([ | ||||
|                         osmFeature.properties["@id"], | ||||
|                         (i + " " + possibleMatches.join(",")).trim(), | ||||
|                         d, | ||||
|                         this.levenshteinDistancePharmacy( | ||||
|                             externalFeature.properties.name, | ||||
|                             osmFeature.properties.name | ||||
|                         ), | ||||
|                         externalFeature.properties.status, | ||||
|                         ...this.conflate(osmFeature.properties, externalFeature.properties), | ||||
|                     ]) | ||||
|                     possibleMatches.push(osmFeature.properties["@id"]) | ||||
|                     /* | ||||
|                 possibleMatches.push({ | ||||
|                         osmFeature, | ||||
|                     external_feature: externalFeature, | ||||
|                     osm_feature: osmFeature, | ||||
|                     d, | ||||
|                         nameDist: Utils.levenshteinDistance( | ||||
|                             osmFeature.properties.name, | ||||
|                             externalFeature.properties.name | ||||
|                         ), | ||||
|                     })//*/
 | ||||
|                 } | ||||
|                 // possibleMatches.sort((a, b) => b.d - a.d)
 | ||||
|                 }) | ||||
|             } | ||||
|         } | ||||
|         match_lengths.sort((a, b) => <number>b[1] - <number>a[1]) | ||||
|         console.log( | ||||
|             "The import probably happened between ", | ||||
|             this.earliestDate?.toISOString(), | ||||
|             "and", | ||||
|             this.latestDate?.toISOString() | ||||
|         return possibleMatches | ||||
|     } | ||||
| 
 | ||||
|     private async stillOnline(url: string): Promise<boolean | string> { | ||||
|         // return true
 | ||||
|         if (url.indexOf("facebook.com") > 0) { | ||||
|             return true | ||||
|         } | ||||
|         const cachePath = this.historyCacheDir + "/urls/    " + url.replace(/[/\\:]/g, "_") | ||||
|         if (fs.existsSync(cachePath)) { | ||||
|             const online = JSON.parse(fs.readFileSync(cachePath, { encoding: "utf-8" })) | ||||
|             return online | ||||
|         } | ||||
|         let online: boolean | string = false | ||||
|         try { | ||||
|             online = await this.stillOnlineUncached(url) | ||||
|         } catch (e) { | ||||
|             console.log(e) | ||||
|             const urlObj = new URL(url) | ||||
|             if (e === "NOT_FOUND" && urlObj.pathname.length > 0) { | ||||
|                 console.log("Maybe trying the homepage will help?") | ||||
|             } | ||||
|         } | ||||
|         fs.writeFileSync(cachePath, JSON.stringify(online, null, "  "), { encoding: "utf-8" }) | ||||
|         return online | ||||
|     } | ||||
| 
 | ||||
|     private async stillOnlineUncached(url: string): Promise<boolean | string> { | ||||
|         if (!url.startsWith("http")) { | ||||
|             url = "https://" + url | ||||
|         } | ||||
|         url = url.replace("http://", "https://") | ||||
|         try { | ||||
|             const result = await ScriptUtils.Download(url) | ||||
|             if (result["redirect"]) { | ||||
|                 if (result["redirect"].startsWith("/")) { | ||||
|                     return true | ||||
|                 } | ||||
|                 return result["redirect"] | ||||
|             } | ||||
|             if (result["content"]) { | ||||
|                 return true | ||||
|             } | ||||
|             console.error("Got a result, but no content?", url, result) | ||||
|         } catch (e) { | ||||
|             console.log("Offline (error):", url, e.message) | ||||
|             return false | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     private async historyCached(id): Promise<OsmObject[]> { | ||||
|         const cachePath = this.historyCacheDir + "/" + id.replace("/", "_") | ||||
|         if (fs.existsSync(cachePath)) { | ||||
|             return JSON.parse(fs.readFileSync(cachePath, { encoding: "utf-8" })) | ||||
|         } | ||||
|         const history = await OsmObject.DownloadHistory(id).AsPromise((l) => l.length > 0) | ||||
|         fs.writeFileSync(cachePath, JSON.stringify(history, null, "  "), { encoding: "utf-8" }) | ||||
|         return history | ||||
|     } | ||||
| 
 | ||||
|     private async normalize(properties: Record<string, string>) { | ||||
|         if (properties["phone"]) { | ||||
|             properties["phone"] = new PhoneTextField().reformat(properties["phone"], () => "be") | ||||
|         } | ||||
|         if (properties["website"]) { | ||||
|             let website = properties.website.toLowerCase() | ||||
|             website | ||||
|                 .replace("http://http://", "http://") | ||||
|                 .replace("https//", "https://") | ||||
|                 .replace("http://", "https://") | ||||
|             const validator = new UrlTextfieldDef() | ||||
|             if (validator.isValid(website)) { | ||||
|                 properties.website = new UrlTextfieldDef().reformat(website) | ||||
|                 const stillOnline = await this.stillOnline(website) | ||||
|                 if (stillOnline === false) { | ||||
|                     delete properties.website | ||||
|                 } | ||||
|                 if (typeof stillOnline === "string") { | ||||
|                     properties.website = stillOnline | ||||
|                 } | ||||
|             } else { | ||||
|                 console.log("Invalid url:", website) | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         if (properties["healthcare"] === "pharmacy") { | ||||
|             // we don't care about this tag
 | ||||
|             delete properties["healthcare"] | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     private async replay(match: PossibleMatch): Promise<ReplayResult> { | ||||
|         const history = await this.historyCached(match.osm_feature.properties["@id"]) | ||||
| 
 | ||||
|         let certainly_imported = match.d < 0.0001 | ||||
|         let possibly_imported = false | ||||
| 
 | ||||
|         const resting_properties = { ...match.external_feature.properties } | ||||
|         await this.normalize(resting_properties) | ||||
| 
 | ||||
|         for (const historyElement of history) { | ||||
|             await this.normalize(historyElement.tags) | ||||
| 
 | ||||
|             if (historyElement.tags.name === resting_properties.name) { | ||||
|                 possibly_imported = true | ||||
|             } | ||||
| 
 | ||||
|             for (const key in resting_properties) { | ||||
|                 if (this.str_compare(historyElement.tags[key], resting_properties[key])) { | ||||
|                     delete resting_properties[key] | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         return { | ||||
|             certainly_imported, | ||||
|             possibly_imported, | ||||
|             resting_properties, | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     private str_compare(a, b): boolean { | ||||
|         if (a === undefined || b === undefined) { | ||||
|             return false | ||||
|         } | ||||
|         a = a.toLowerCase().replaceAll(/[éèáàüë].*$/g, "") | ||||
|         b = b.toLowerCase().replaceAll(/[éèáàüë].*$/g, "") | ||||
| 
 | ||||
|         return a === b | ||||
|     } | ||||
| 
 | ||||
|     private async calculateMatches( | ||||
|         external_features: Feature[], | ||||
|         osm_features: Feature[], | ||||
|         max_range: number | ||||
|     ): Promise<{ match: PossibleMatch; replayed: ReplayResult }[]> { | ||||
|         const matches: { match: PossibleMatch; replayed: ReplayResult }[] = [] | ||||
|         for (const f of external_features) { | ||||
|             const match = await this.calculateMatch(osm_features, f, max_range) | ||||
|             if (match) { | ||||
|                 matches.push(match) | ||||
|             } | ||||
|         } | ||||
|         return matches | ||||
|     } | ||||
| 
 | ||||
|     private async calculateMatch( | ||||
|         osm_features: Feature[], | ||||
|         externalFeature: Feature, | ||||
|         max_range: number | ||||
|     ): Promise<{ match: PossibleMatch; replayed: ReplayResult }> { | ||||
|         const possibleMatches = this.findPossibleMatchesFor( | ||||
|             osm_features, | ||||
|             externalFeature, | ||||
|             max_range | ||||
|         ) | ||||
|         fs.writeFileSync( | ||||
|             "../onwheels-data-prep/match_lengths.tsv", | ||||
|             match_lengths.map((l) => l.join("\t")).join("\n") | ||||
|         ) | ||||
|         console.log(match_lengths) | ||||
|         let bestMatch: PossibleMatch = undefined | ||||
|         let bestMatchReplayed: ReplayResult = undefined | ||||
|         for (const possibleMatch of possibleMatches) { | ||||
|             const replayed = await this.replay(possibleMatch) | ||||
|             if ( | ||||
|                 bestMatch === undefined || | ||||
|                 (replayed.certainly_imported && !bestMatchReplayed.possibly_imported) || | ||||
|                 (!bestMatchReplayed.certainly_imported && | ||||
|                     replayed.possibly_imported && | ||||
|                     !bestMatchReplayed.possibly_imported) | ||||
|             ) { | ||||
|                 bestMatch = possibleMatch | ||||
|                 bestMatchReplayed = replayed | ||||
|             } | ||||
|         } | ||||
|         if (bestMatch === undefined) { | ||||
|             return undefined | ||||
|         } | ||||
|         return { | ||||
|             replayed: bestMatchReplayed, | ||||
|             match: bestMatch, | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     private levenshteinDistancePharmacy(a?: string, b?: string) { | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue