Some more experimentation with the conflation script

This commit is contained in:
Pieter Vander Vennet 2023-03-21 20:01:11 +01:00
parent 8eda65a24f
commit 99cb879cfe
3 changed files with 321 additions and 102 deletions

View file

@ -46,7 +46,8 @@
"weblate-merge": "git remote update weblate-github; git merge weblate-github/weblate-mapcomplete-core weblate-github/weblate-mapcomplete-layers weblate-github/weblate-mapcomplete-layer-translations", "weblate-merge": "git remote update weblate-github; git merge weblate-github/weblate-mapcomplete-core weblate-github/weblate-mapcomplete-layers weblate-github/weblate-mapcomplete-layer-translations",
"weblate-fix-heavy": "git fetch weblate-hosted-layers; git fetch weblate-hosted-core; git merge weblate-hosted-layers/master weblate-hosted-core/master ", "weblate-fix-heavy": "git fetch weblate-hosted-layers; git fetch weblate-hosted-core; git merge weblate-hosted-layers/master weblate-hosted-core/master ",
"housekeeping": "git pull && npm run weblate-fix-heavy && npm run generate && npm run generate:docs && npm run generate:contributor-list && vite-node scripts/fetchLanguages.ts && npm run format && git add assets/ langs/ Docs/ **/*.ts Docs/* && git commit -m 'chore: automated housekeeping...'", "housekeeping": "git pull && npm run weblate-fix-heavy && npm run generate && npm run generate:docs && npm run generate:contributor-list && vite-node scripts/fetchLanguages.ts && npm run format && git add assets/ langs/ Docs/ **/*.ts Docs/* && git commit -m 'chore: automated housekeeping...'",
"parseSchools": "vite-node scripts/schools/amendSchoolData.ts" "parseSchools": "vite-node scripts/schools/amendSchoolData.ts",
"conflate": "vite-node scripts/conflate.ts -- ../onwheels-data-prep/osm_pharmacies.geojson ../onwheels-data-prep/OnWheelsData_apotheek.geojson"
}, },
"keywords": [ "keywords": [
"OpenStreetMap", "OpenStreetMap",

View file

@ -146,17 +146,20 @@ export default class ScriptUtils {
private static async DownloadJSON(url: string, headers?: any): Promise<any> { private static async DownloadJSON(url: string, headers?: any): Promise<any> {
const data = await ScriptUtils.Download(url, headers) const data = await ScriptUtils.Download(url, headers)
return JSON.parse(data.content) return JSON.parse(data["content"])
} }
private static Download(url: string, headers?: any): Promise<{ content: string }> { public static Download(
url: string,
headers?: any
): Promise<{ content: string } | { redirect: string }> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
try { try {
headers = headers ?? {} headers = headers ?? {}
headers.accept = "application/json" headers.accept = "application/json"
console.log(" > ScriptUtils.DownloadJson(", url, ")") console.log(" > ScriptUtils.Download(", url, ")")
const urlObj = new URL(url) const urlObj = new URL(url)
https.get( const request = https.get(
{ {
host: urlObj.host, host: urlObj.host,
path: urlObj.pathname + urlObj.search, path: urlObj.pathname + urlObj.search,
@ -173,10 +176,26 @@ export default class ScriptUtils {
}) })
res.addListener("end", function () { res.addListener("end", function () {
if (res.statusCode === 301 || res.statusCode === 302) {
console.log("Got a redirect:", res.headers.location)
resolve({ redirect: res.headers.location })
}
if (res.statusCode >= 400) {
console.log(
"Error while fetching ",
url,
"due to",
res.statusMessage
)
reject(res.statusCode)
}
resolve({ content: parts.join("") }) resolve({ content: parts.join("") })
}) })
} }
) )
request.on("error", function (e) {
reject(e)
})
} catch (e) { } catch (e) {
reject(e) reject(e)
} }

View file

@ -4,8 +4,31 @@ import { Feature } from "geojson"
import { GeoOperations } from "../Logic/GeoOperations" import { GeoOperations } from "../Logic/GeoOperations"
import { Utils } from "../Utils" import { Utils } from "../Utils"
import { OsmObject } from "../Logic/Osm/OsmObject" import { OsmObject } from "../Logic/Osm/OsmObject"
import { PhoneTextField, UrlTextfieldDef } from "../UI/Input/ValidatedTextField"
import { OsmId } from "../Models/OsmFeature"
import ScriptUtils from "./ScriptUtils"
/**
 * A candidate pairing of one feature from the external dataset with one OSM feature
 * that lies within the search range of it.
 */
interface PossibleMatch {
    /**
     * Distance in meter between the OSM-data and the external dataset
     */
    d: number
    /** The candidate feature from the OSM dataset */
    osm_feature: Feature
    /** The feature from the external dataset that is being matched */
    external_feature: Feature
}
/**
 * Outcome of comparing the properties of an external feature against the
 * edit history of the OSM feature it was paired with.
 */
interface ReplayResult {
    /** Set when the external feature and the OSM feature lie (practically) on the very same spot */
    certainly_imported?: boolean
    /** Set when a version in the OSM history carried the same name as the external feature */
    possibly_imported?: boolean
    /**
     * The properties of the external feature which were not found, with a matching value,
     * in any version of the OSM history
     */
    resting_properties?: Record<string, string>
}
export class Conflate extends Script { export class Conflate extends Script {
private earliestDate: Date = undefined
private latestDate: Date = undefined
private readonly historyCacheDir = "/tmp/cache/"
constructor() { constructor() {
super( super(
[ [
@ -22,10 +45,88 @@ export class Conflate extends Script {
) )
} }
/**
 * Matches every feature of an external GeoJSON dataset against the features of an OSM extract
 * and writes two reports:
 * - `../onwheels-data-prep/matches.tsv`: one row per matched pair which still has external
 *   properties that were never seen in the history of the OSM object
 * - `../onwheels-data-prep/unmatched.geojson`: the external features without any match
 *
 * @param args `[osm_file_path, external_file_path, max_range?]`. The optional third argument
 *             is the maximum match distance (same unit as `PossibleMatch.d`) and defaults to 50.
 * @throws a string if the range is not a number or if one of the paths looks like it was swapped
 */
async main(args: string[]): Promise<void> {
    const [osm_file_path, external_file_path] = args
    let max_range = 50
    if (args.length === 3) {
        max_range = Number(args[2])
        if (Number.isNaN(max_range)) {
            // With NaN every `d < max_range` test is false, which would silently yield zero matches
            throw "The maximum range must be a number, but got '" + args[2] + "'"
        }
    }

    // Guard against accidentally swapping both input files
    const osmPathLower = osm_file_path.toLowerCase()
    if (!osmPathLower.includes("osm") && !osmPathLower.includes("openstreetmap")) {
        throw "OSM File path must contain 'osm' or 'openStreetMap'"
    }
    const externalPathLower = external_file_path.toLowerCase()
    if (externalPathLower.includes("osm") || externalPathLower.includes("openstreetmap")) {
        throw "External File path may not contain 'osm' or 'openStreetMap'"
    }

    const external_features: Feature[] = JSON.parse(
        fs.readFileSync(external_file_path, { encoding: "utf-8" })
    ).features
    const osm_features: Feature[] = JSON.parse(
        fs.readFileSync(osm_file_path, { encoding: "utf-8" })
    ).features

    const bestMatches = await this.calculateMatches(external_features, osm_features, max_range)

    // A Set gives a constant-time lookup instead of rescanning all matches for every feature
    const matchedExternalFeatures = new Set<Feature>(
        bestMatches.map((matched) => matched.match.external_feature)
    )
    const unmatched = external_features.filter((f) => !matchedExternalFeatures.has(f))

    const match_lengths: (string | number)[][] = [
        [
            "osm_id",
            "match_distance",
            "osm_name",
            "imported",
            "status_external",
            "...properties_differences",
        ],
    ]
    for (const { match, replayed } of bestMatches) {
        const { d, osm_feature } = match
        const { possibly_imported, certainly_imported, resting_properties } = replayed
        // 'status' gets its own column, so it must not count as a remaining difference
        const status = resting_properties["status"]
        delete resting_properties["status"]
        if (Object.keys(resting_properties).length === 0) {
            // Everything the external dataset knows was already seen in OSM: nothing to report
            continue
        }
        match_lengths.push([
            osm_feature.properties["@id"],
            d,
            osm_feature.properties.name,
            certainly_imported ? "import" : possibly_imported ? "prob import" : "new",
            status,
            JSON.stringify(resting_properties),
        ])
    }

    fs.writeFileSync(
        "../onwheels-data-prep/matches.tsv",
        match_lengths.map((l) => l.join("\t")).join("\n")
    )

    fs.writeFileSync(
        "../onwheels-data-prep/unmatched.geojson",
        JSON.stringify(
            {
                type: "FeatureCollection",
                features: unmatched,
            },
            null,
            " "
        )
    )
}
private async findTimeFork( private async findTimeFork(
externalName: string, externalName: string,
osmName: string, osmName: string,
osmId: string osmId: OsmId
): Promise<{ earliestDateOfImport; latestDateOfImport }> { ): Promise<{ earliestDateOfImport; latestDateOfImport }> {
const history = await OsmObject.DownloadHistory(osmId).AsPromise((h) => h.length > 0) const history = await OsmObject.DownloadHistory(osmId).AsPromise((h) => h.length > 0)
let earliest: Date = undefined let earliest: Date = undefined
@ -60,106 +161,204 @@ export class Conflate extends Script {
return { earliestDateOfImport: earliest, latestDateOfImport: latest } return { earliestDateOfImport: earliest, latestDateOfImport: latest }
} }
private earliestDate: Date = undefined private findPossibleMatchesFor(
private latestDate: Date = undefined osm_features: Feature[],
externalFeature: Feature,
max_range: number
): PossibleMatch[] {
const possibleMatches: PossibleMatch[] = []
for (const osmFeature of osm_features) {
const d = GeoOperations.distanceBetween(
GeoOperations.centerpointCoordinates(externalFeature),
GeoOperations.centerpointCoordinates(osmFeature)
)
async main(args: string[]): Promise<void> { if (d < max_range) {
const [osm_file_path, external_file_path] = args possibleMatches.push({
let max_range = 50 external_feature: externalFeature,
if (args.length === 3) { osm_feature: osmFeature,
max_range = Number(args[2]) d,
} })
if (
osm_file_path.toLowerCase().indexOf("osm") < 0 &&
osm_file_path.toLowerCase().indexOf("openstreetmap") < 0
) {
throw "OSM File path must contain 'osm' or 'openStreetMap'"
}
if (
external_file_path.toLowerCase().indexOf("osm") >= 0 ||
external_file_path.toLowerCase().indexOf("openstreetmap") >= 0
) {
throw "External File path may not contain 'osm' or 'openStreetMap'"
}
const external_features: Feature[] = JSON.parse(
fs.readFileSync(external_file_path, { encoding: "utf-8" })
).features
const osm_features: Feature[] = JSON.parse(
fs.readFileSync(osm_file_path, { encoding: "utf-8" })
).features
const match_lengths: (string | number)[][] = [
[
"osm_id",
"external_index",
"match_distance",
"name_levenshtein_distance",
"osm_data",
"external_data",
"status",
],
]
for (let i = 0; i < external_features.length; i++) {
// console.log("Inspecting " + (i + 1) + "/" + external_features.length)
const externalFeature = external_features[i]
const possibleMatches: number[] = []
for (const osmFeature of osm_features) {
const d = GeoOperations.distanceBetween(
GeoOperations.centerpointCoordinates(externalFeature),
GeoOperations.centerpointCoordinates(osmFeature)
)
if (d === 0) {
console.log(
"Found an exact match (name match: ",
osmFeature.properties.name === externalFeature.properties.name,
osmFeature.properties.name,
externalFeature.properties.name
)
continue
}
continue
if (d < max_range) {
console.log("Found a match")
match_lengths.push([
osmFeature.properties["@id"],
(i + " " + possibleMatches.join(",")).trim(),
d,
this.levenshteinDistancePharmacy(
externalFeature.properties.name,
osmFeature.properties.name
),
externalFeature.properties.status,
...this.conflate(osmFeature.properties, externalFeature.properties),
])
possibleMatches.push(osmFeature.properties["@id"])
/*
possibleMatches.push({
osmFeature,
d,
nameDist: Utils.levenshteinDistance(
osmFeature.properties.name,
externalFeature.properties.name
),
})//*/
}
// possibleMatches.sort((a, b) => b.d - a.d)
} }
} }
match_lengths.sort((a, b) => <number>b[1] - <number>a[1]) return possibleMatches
console.log( }
"The import probably happened between ",
this.earliestDate?.toISOString(), private async stillOnline(url: string): Promise<boolean | string> {
"and", // return true
this.latestDate?.toISOString() if (url.indexOf("facebook.com") > 0) {
return true
}
const cachePath = this.historyCacheDir + "/urls/ " + url.replace(/[/\\:]/g, "_")
if (fs.existsSync(cachePath)) {
const online = JSON.parse(fs.readFileSync(cachePath, { encoding: "utf-8" }))
return online
}
let online: boolean | string = false
try {
online = await this.stillOnlineUncached(url)
} catch (e) {
console.log(e)
const urlObj = new URL(url)
if (e === "NOT_FOUND" && urlObj.pathname.length > 0) {
console.log("Maybe trying the homepage will help?")
}
}
fs.writeFileSync(cachePath, JSON.stringify(online, null, " "), { encoding: "utf-8" })
return online
}
/**
 * Checks, without consulting the cache, whether `url` can still be downloaded.
 * The URL is always probed over https: a missing scheme is prefixed and `http://` is upgraded.
 *
 * @param url the website to probe
 * @returns the redirect target if the download reported a redirect to another location
 *          (a redirect starting with "/" stays on the same host and counts as online: `true`),
 *          `true` if the download yielded content,
 *          `false` if the download failed or gave an unusable result
 */
private async stillOnlineUncached(url: string): Promise<boolean | string> {
    if (!url.startsWith("http")) {
        url = "https://" + url
    }
    url = url.replace("http://", "https://")
    try {
        const result = await ScriptUtils.Download(url)
        if (result["redirect"]) {
            if (result["redirect"].startsWith("/")) {
                return true
            }
            return result["redirect"]
        }
        if (result["content"]) {
            return true
        }
        console.error("Got a result, but no content?", url, result)
        // Always return a defined value: the caller serialises the outcome with JSON.stringify,
        // which yields `undefined` for `undefined` and makes the cache write throw.
        // An answer with an empty body still means that the server responded.
        return "content" in result
    } catch (e) {
        // The download may reject with a bare status code instead of an Error
        console.log("Offline (error):", url, e?.message ?? e)
        return false
    }
}
/**
 * Gives the version history of an OSM object, using a file in `historyCacheDir` as cache.
 *
 * Note that a cache hit returns the JSON-parsed file contents: plain data objects,
 * not the objects which `OsmObject.DownloadHistory` produced.
 *
 * @param id the id of the OSM object, e.g. `node/123`; the first "/" is replaced by "_"
 *           to build the name of the cache file
 * @returns the history of the object
 */
private async historyCached(id): Promise<OsmObject[]> {
    const cachePath = this.historyCacheDir + "/" + id.replace("/", "_")
    if (fs.existsSync(cachePath)) {
        return JSON.parse(fs.readFileSync(cachePath, { encoding: "utf-8" }))
    }
    const history = await OsmObject.DownloadHistory(id).AsPromise((l) => l.length > 0)
    // On a fresh machine the cache directory does not exist yet and the write would fail with ENOENT
    fs.mkdirSync(this.historyCacheDir, { recursive: true })
    fs.writeFileSync(cachePath, JSON.stringify(history, null, " "), { encoding: "utf-8" })
    return history
}
/**
 * Brings a set of tags into a canonical form, so that values from different sources can be compared.
 * The given object is modified in place:
 * - `phone` is reformatted (with "be" as country)
 * - `website` is lowercased, stripped of common scheme typos, upgraded to https and reformatted;
 *   it is removed if the site is offline and replaced by the redirect target if it redirects
 * - `healthcare=pharmacy` is dropped
 *
 * @param properties the tags to normalize; mutated
 */
private async normalize(properties: Record<string, string>) {
    if (properties["phone"]) {
        properties["phone"] = new PhoneTextField().reformat(properties["phone"], () => "be")
    }
    if (properties["website"]) {
        // Strings are immutable: the result of the replace-chain must be kept,
        // otherwise the clean-up below has no effect at all
        const website = properties.website
            .toLowerCase()
            .replace("http://http://", "http://")
            .replace("https//", "https://")
            .replace("http://", "https://")
        const validator = new UrlTextfieldDef()
        if (validator.isValid(website)) {
            properties.website = new UrlTextfieldDef().reformat(website)
            const stillOnline = await this.stillOnline(website)
            if (stillOnline === false) {
                delete properties.website
            }
            if (typeof stillOnline === "string") {
                // The site moved: keep the location it redirects to
                properties.website = stillOnline
            }
        } else {
            console.log("Invalid url:", website)
        }
    }

    if (properties["healthcare"] === "pharmacy") {
        // we don't care about this tag
        delete properties["healthcare"]
    }
}
/**
 * Walks over the history of the matched OSM object and strikes every property of the
 * external feature which was, with a matching value, present in some version of that history.
 *
 * @param match the pairing of an external feature and an OSM feature to inspect
 * @returns - `certainly_imported`: the two features are less than 0.0001 apart
 *          - `possibly_imported`: some version in the history has exactly the name of the external feature
 *          - `resting_properties`: the (normalized) external properties which were never seen in the history
 */
private async replay(match: PossibleMatch): Promise<ReplayResult> {
    const history = await this.historyCached(match.osm_feature.properties["@id"])

    const certainly_imported = match.d < 0.0001
    let possibly_imported = false

    // Work on a copy: the properties of the external feature itself stay untouched
    const resting_properties = { ...match.external_feature.properties }
    await this.normalize(resting_properties)

    // Captured up front: the loop below may delete `name` from resting_properties, after which
    // a comparison against it would equal `undefined` for every nameless history version
    const externalName = resting_properties.name

    for (const historyElement of history) {
        await this.normalize(historyElement.tags)

        if (externalName !== undefined && historyElement.tags.name === externalName) {
            possibly_imported = true
        }

        for (const key of Object.keys(resting_properties)) {
            if (this.str_compare(historyElement.tags[key], resting_properties[key])) {
                delete resting_properties[key]
            }
        }
    }

    return {
        certainly_imported,
        possibly_imported,
        resting_properties,
    }
}
/**
 * Loosely compares two tag values: case-insensitive, and only up to the first
 * accented character (one of éèáàüë) if the value contains one.
 *
 * @param a first value; may be missing
 * @param b second value; may be missing
 * @returns `false` if either value is `undefined` or `null`, otherwise whether the simplified values are equal
 */
private str_compare(a, b): boolean {
    // GeoJSON properties may be null, so null is treated just like a missing value
    if (a === undefined || a === null || b === undefined || b === null) {
        return false
    }
    // GeoJSON properties may also be numbers or booleans, hence the String(...) conversion.
    // NOTE(review): the pattern drops everything from the first accented character up to the end
    // of the value, so only the part before it is compared - confirm that this is intended
    // rather than removing just the accented characters.
    const simplify = (value: unknown): string =>
        String(value)
            .toLowerCase()
            .replace(/[éèáàüë].*$/g, "")
    return simplify(a) === simplify(b)
}
/**
 * Determines, one external feature at a time, the best matching OSM feature.
 * External features for which no match is found are left out of the result.
 *
 * @param external_features the features of the external dataset
 * @param osm_features the OSM features to match against
 * @param max_range the maximum distance between two features to be considered a match
 * @returns the best match (and its replay result) for every external feature which has one
 */
private async calculateMatches(
    external_features: Feature[],
    osm_features: Feature[],
    max_range: number
): Promise<{ match: PossibleMatch; replayed: ReplayResult }[]> {
    const results: { match: PossibleMatch; replayed: ReplayResult }[] = []
    for (let i = 0; i < external_features.length; i++) {
        // Awaited one by one on purpose: the features are processed strictly in sequence
        const best = await this.calculateMatch(osm_features, external_features[i], max_range)
        if (!best) {
            continue
        }
        results.push(best)
    }
    return results
}
private async calculateMatch(
osm_features: Feature[],
externalFeature: Feature,
max_range: number
): Promise<{ match: PossibleMatch; replayed: ReplayResult }> {
const possibleMatches = this.findPossibleMatchesFor(
osm_features,
externalFeature,
max_range
) )
fs.writeFileSync( let bestMatch: PossibleMatch = undefined
"../onwheels-data-prep/match_lengths.tsv", let bestMatchReplayed: ReplayResult = undefined
match_lengths.map((l) => l.join("\t")).join("\n") for (const possibleMatch of possibleMatches) {
) const replayed = await this.replay(possibleMatch)
console.log(match_lengths) if (
bestMatch === undefined ||
(replayed.certainly_imported && !bestMatchReplayed.possibly_imported) ||
(!bestMatchReplayed.certainly_imported &&
replayed.possibly_imported &&
!bestMatchReplayed.possibly_imported)
) {
bestMatch = possibleMatch
bestMatchReplayed = replayed
}
}
if (bestMatch === undefined) {
return undefined
}
return {
replayed: bestMatchReplayed,
match: bestMatch,
}
} }
private levenshteinDistancePharmacy(a?: string, b?: string) { private levenshteinDistancePharmacy(a?: string, b?: string) {