1
0
Fork 0

Compare commits

...

10 commits

Author SHA1 Message Date
Mateusz Konieczny
ce1844befe more useful log, closer to intended deployable code 2025-04-18 08:26:51 +02:00
Mateusz Konieczny
388d9a3333 crowning part of matching on missing names 2025-04-17 12:45:44 +02:00
Mateusz Konieczny
c6727b19fb explain better 2025-04-17 12:38:53 +02:00
Mateusz Konieczny
7362c1aff8 remove mismatch root cause 2025-04-17 12:38:43 +02:00
Mateusz Konieczny
b447d1b501 drop dead code 2025-04-17 11:06:08 +02:00
Mateusz Konieczny
81948b4725 point relevant code 2025-04-17 11:03:24 +02:00
Mateusz Konieczny
cb76e93d8e remove operator hack
I fixed Poczta Polska data in Poland
2025-04-17 11:03:05 +02:00
Mateusz Konieczny
1545e90fd9 add test to cover one more case 2025-04-17 10:57:17 +02:00
Mateusz Konieczny
91670013ae TODO for future 2025-04-17 09:47:56 +02:00
Mateusz Konieczny
9e67921a71 better docs 2025-04-17 09:46:24 +02:00
6 changed files with 49 additions and 33 deletions

View file

@@ -1054,8 +1054,6 @@ def name_from_tags(tags):
name = tags.get("brand", None)
if name == None:
name = tags.get("short_name", None)
if name == None:
name = tags.get("operator", None)
if name == None:
name = str(tags)
return name

View file

@@ -34,7 +34,7 @@ def spider_codes_iterator_with_data():
continue
def all_spider_codes_iterator():
def all_spider_codes_iterator(): # TODO rename: actually, it skips ignored ones
for _item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
yield atp_code
@@ -42,6 +42,8 @@ def all_spider_codes_iterator():
def spider_codes_and_filepaths_iterator_including_broken_data_ones():
"""
this one is not parsing .geojson files so will be faster
skips ignored spiders
"""
directory_path_with_unpacked_spider_data = obtain_atp_data.latest_atp_unpacked_folder()
# TODO: there is no full match between spider codes and their filenames

View file

@@ -19,16 +19,21 @@ def filter_osm_data_with_dict(current_osm, osm_data_tag_filter):
returned.append(osm)
return returned
def name_giving_keys():
return ["name", "short_name", "brand", "brand:en"]
def is_matching_any_name_part_to_osm_tags(name_part_list, osm_tags):
# if changing anything here, please also change this_tag_lists_match debug code
# if changing anything here, please also change this_tag_lists_match debug code in tests
for part in name_part_list:
for key in ["name", "short_name", "brand", "brand:en",
"operator", # https://www.openstreetmap.org/node/1922605509/history - TODO, remove, see https://community.openstreetmap.org/t/brand-poczta-polska-propozycja-automatycznej-edycji-kasowanie-operator-dodanego-na-slepo-przez-nsi/116091
]:
name_giving_key_present = False
for key in name_giving_keys():
if key in osm_tags:
name_giving_key_present = True
if matching_name_part(part, osm_tags[key]):
return True
if name_giving_key_present == False:
return True # nameless objects are matching, as matcher is supposed to be eager
# (we want false positives rather than false negatives at that stage)
return False
@@ -170,7 +175,6 @@ def get_matches(osm_data, atp_data):
# TODO: handle nearby objects with matching feature type or vacant ones
filtered_osm = filter_with_fuzzy_name_match(osm_data, filter_names)
osm_index = spatial_index.SpatialIndex(filtered_osm)
#print("filtering reduced entry count to", len(filtered_osm), "candidates based on names, now checking", len(atp_data), "ATP candidates by distance")
for atp in atp_data:
distance_scan_in_kilometers = config.maximum_missing_shop_distance_in_kilometers()
best_match = None

View file

@@ -84,9 +84,8 @@ class MismatchingNameReportCreator:
print()
def is_there_clear_name_match(self, entry):
# TODO get rid of operator, honestly it is a hack for Poczta Polska
atp_name_claims = {entry.atp_tags.get("name"), entry.atp_tags.get("brand"), entry.atp_tags.get("short_name"), entry.atp_tags.get("operator"), entry.atp_tags.get("atp_listing_name")}
osm_name_claims = {entry.osm_match_tags.get("name"), entry.osm_match_tags.get("brand"), entry.osm_match_tags.get("short_name"), entry.atp_tags.get("operator")}
atp_name_claims = {entry.atp_tags.get("name"), entry.atp_tags.get("brand"), entry.atp_tags.get("short_name"), entry.atp_tags.get("atp_listing_name")}
osm_name_claims = {entry.osm_match_tags.get("name"), entry.osm_match_tags.get("brand"), entry.osm_match_tags.get("short_name")}
atp_osm_full_name_match = atp_name_claims & osm_name_claims - {None}
return len(atp_osm_full_name_match) > 0

View file

@@ -47,10 +47,6 @@ class MatchingTests(unittest.TestCase):
name_part_list = matcher.get_filter_names_from_atp_dataset([{'tags': {'brand': 'denns b'}}])
self.assertEqual(matcher.is_matching_any_name_part_to_osm_tags(name_part_list, {'name': "lebioda"}), False)
def test_match_between_operator_and_brand(self):
name_part_list = matcher.get_filter_names_from_atp_dataset([{'tags': {'brand': 'Poczta Polska'}}])
self.assertEqual(matcher.is_matching_any_name_part_to_osm_tags(name_part_list, {'name': "FUP Kraków 28", "operator": "Poczta Polska"}), True)
def test_match_on_short_names(self):
name_part_list = matcher.get_filter_names_from_atp_dataset([{'tags': {'brand': 'GAP'}}])
self.assertEqual(matcher.is_matching_any_name_part_to_osm_tags(name_part_list, {'brand': "GAP"}), True)
@@ -89,6 +85,8 @@ class MatchingTests(unittest.TestCase):
osm_data = [MatchingTests.package_tags_into_mock(osm_tags)]
matches = matcher.get_matches(osm_data, atp_data)
if debug:
# this recreates matcher.get_matches but has some extra logging that is supposed to make debugging easier
# when tests fail for some reason
print()
# based on matcher.get_matches internals
filter_names = matcher.get_filter_names_from_atp_dataset(atp_data)
@@ -96,13 +94,15 @@ class MatchingTests(unittest.TestCase):
print("claimed name match", matcher.is_matching_any_name_part_to_osm_tags(filter_names, osm_tags))
# replicates matcher.is_matching_any_name_part_to_osm_tags
name_part_list = filter_names
name_giving_key_present = False
for part in name_part_list:
for key in ["name", "brand", "brand:en",
"operator",
]:
for key in matcher.name_giving_keys():
if key in osm_tags:
name_giving_key_present = True
if matcher.matching_name_part(part, osm_tags[key]):
print(part, "matches to osm_tags[key] with value", osm_tags[key], "for key", key)
if name_giving_key_present == False:
print("matched as nameless one")
print(matcher.filter_with_fuzzy_name_match(osm_data, filter_names))
print(str(matches[0]))
@@ -114,6 +114,12 @@ class MatchingTests(unittest.TestCase):
def test_reject_mismatches_based_on_object_type(self):
self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'shop': 'pastry'}, {'brand': "Titan", 'amenity': 'fuel'}), False)
def test_reject_mismatches_based_on_object_name(self):
self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'shop': 'pastry'}, {'brand': "Sauron", 'shop': 'pastry'}), False)
def test_accept_matches_for_nameless_ones(self):
self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'amenity': 'ice_cream'}, {'shop': 'ice_cream'}), True)
def test_accept_matches_for_ice_cream_synonyms(self):
self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'amenity': 'ice_cream'}, {'brand': "Titan", 'shop': 'ice_cream'}), True)

View file

@@ -5,6 +5,7 @@ import os
import json
import compute_local_time
import datetime
from collections import defaultdict
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
@@ -13,7 +14,6 @@ obtain_atp_data = __import__("2_obtain_atp_data")
def download():
for entry in obtain_atp_data.get_atp_history()[::-1]:
run_id = entry['run_id']
print(run_id)
obtain_atp_data.download_specific_atp_run(run_id)
@@ -35,6 +35,7 @@ def view():
# for zabka see
# https://github.com/alltheplaces/alltheplaces/issues/5888 https://github.com/alltheplaces/alltheplaces/pull/5586
atp_code = "zabka_pl"
collected_for_given_weekday = defaultdict(list)
for entry in obtain_atp_data.get_atp_history()[::-1]:
run_id = entry['run_id']
run_metadata = obtain_atp_data.get_atp_specific_spider_run_metadata(run_id, atp_code)
@@ -42,26 +43,19 @@ def view():
print(run_id)
folder = obtain_atp_data.atp_unpacked_folder(run_id)
source_atp_filepath = folder + atp_code + ".geojson"
if os.path.isfile(source_atp_filepath) == False:
print("file", source_atp_filepath, "is missing")
continue
with open(source_atp_filepath) as file:
try:
data = json.load(file)
rich.print("spider:collection_time", data['dataset_attributes'].get("spider:collection_time"))
for feature in data['features']:
if relevant_one(feature):
# rich.print(feature['properties']['addr:city'])
print()
opening_hours = feature['properties']['opening_hours']
rich.print(opening_hours)
oh = parser.OpeningHours(opening_hours)
oh.display()
for day in parser.OpeningHours.DAYS_OF_WEEK:
stringified = []
for time_selector in oh.parsed[day]:
stringified.append(str(time_selector)) # .from_hours, .from_minutes, .to_hours, .to_minutes
reduced_to_canonical_string = ",".join(stringified)
print(day, reduced_to_canonical_string)
print(opening_hours)
# rich.print(feature)
rich.print(feature['geometry']['coordinates'])
longitude = feature['geometry']['coordinates'][0]
latitude = feature['geometry']['coordinates'][1]
@@ -71,10 +65,23 @@ def view():
print('in local t', start_time)
print('in local t', start_time.date())
weekday_index = start_time.date().weekday()
oh = parser.OpeningHours(opening_hours)
oh.display()
for offset in range(7):
date = start_time.date() + datetime.timedelta(days=offset)
day_code = parser.OpeningHours.day_of_week_code(weekday_index + offset)
print(offset, date, day_code)
stringified = []
for time_selector in oh.parsed[day_code]:
stringified.append(str(time_selector)) # .from_hours, .from_minutes, .to_hours, .to_minutes
reduced_to_canonical_string = ",".join(stringified)
print(offset, date, day_code, reduced_to_canonical_string)
collected_for_given_weekday[day_code].append({"opening_hours": reduced_to_canonical_string, "date": date})
print('finish_time', run_metadata['finish_time'].replace("T", " "))
finish_time = compute_local_time.convert_utc_timestamp_string_to_local(run_metadata['finish_time'], latitude, longitude)
print('in local ti', finish_time)
@@ -86,7 +93,7 @@ def view():
except json.decoder.JSONDecodeError as e:
print(e)
rich.print(collected_for_given_weekday)
# {"type": "Feature", "id": "y0P1sd7RSJB-6RBe-vTNFRUDZEg=", "properties": {"ref": "Z2220", "@source_uri": "https://www.zabka.pl/app/uploads/locator-store-data.json", "@spider": "zabka_pl", "name:pl": "\u017babka", "name:uk": "\u0416\u0430\u0431\u043a\u0430", "shop": "convenience", "addr:street_address": "ul. Miko\u0142ajczyka/os. Kalinowe 12c", "addr:city": "Krak\u00f3w", "addr:country": "PL", "name": "\u017babka", "website": "https://www.zabka.pl/znajdz-sklep/ID02220-krakow-ul-mikolajczyka-os-kalinowe-12c/", "opening_hours": "Mo-Sa 06:00-23:00", "brand": "\u017babka", "brand:wikidata": "Q2589061", "nsi_id": "zabka-f55ead"}, "geometry": {"type": "Point", "coordinates": [20.014811, 50.092896]}},