1
0
Fork 0

Compare commits

...

10 commits

Author SHA1 Message Date
Mateusz Konieczny
ce1844befe more useful log, closer to intended deployable code 2025-04-18 08:26:51 +02:00
Mateusz Konieczny
388d9a3333 crowning part of matching on missing names 2025-04-17 12:45:44 +02:00
Mateusz Konieczny
c6727b19fb explain better 2025-04-17 12:38:53 +02:00
Mateusz Konieczny
7362c1aff8 remove mismatch root cause 2025-04-17 12:38:43 +02:00
Mateusz Konieczny
b447d1b501 drop dead code 2025-04-17 11:06:08 +02:00
Mateusz Konieczny
81948b4725 point relevant code 2025-04-17 11:03:24 +02:00
Mateusz Konieczny
cb76e93d8e remove operator hack
I fixed Poczta Polska data in Poland
2025-04-17 11:03:05 +02:00
Mateusz Konieczny
1545e90fd9 add test to cover one more case 2025-04-17 10:57:17 +02:00
Mateusz Konieczny
91670013ae TODO for future 2025-04-17 09:47:56 +02:00
Mateusz Konieczny
9e67921a71 better docs 2025-04-17 09:46:24 +02:00
6 changed files with 49 additions and 33 deletions

View file

@@ -1054,8 +1054,6 @@ def name_from_tags(tags):
name = tags.get("brand", None)
if name == None:
name = tags.get("short_name", None)
if name == None:
name = tags.get("operator", None)
if name == None:
name = str(tags)
return name

View file

@@ -34,7 +34,7 @@ def spider_codes_iterator_with_data():
continue
def all_spider_codes_iterator():
def all_spider_codes_iterator(): # TODO rename: actually, it skips ignored ones
for _item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
yield atp_code
@@ -42,6 +42,8 @@ def all_spider_codes_iterator():
def spider_codes_and_filepaths_iterator_including_broken_data_ones():
"""
this one is not parsing .geojson files so will be faster
skips ignored spiders
"""
directory_path_with_unpacked_spider_data = obtain_atp_data.latest_atp_unpacked_folder()
# TODO: there is no full match between spider codes and their filenames

View file

@@ -19,16 +19,21 @@ def filter_osm_data_with_dict(current_osm, osm_data_tag_filter):
returned.append(osm)
return returned
def name_giving_keys():
return ["name", "short_name", "brand", "brand:en"]
def is_matching_any_name_part_to_osm_tags(name_part_list, osm_tags):
# if changing anything here, please also change this_tag_lists_match debug code
# if changing anything here, please also change this_tag_lists_match debug code in tests
for part in name_part_list:
for key in ["name", "short_name", "brand", "brand:en",
"operator", # https://www.openstreetmap.org/node/1922605509/history - TODO, remove, see https://community.openstreetmap.org/t/brand-poczta-polska-propozycja-automatycznej-edycji-kasowanie-operator-dodanego-na-slepo-przez-nsi/116091
]:
name_giving_key_present = False
for key in name_giving_keys():
if key in osm_tags:
name_giving_key_present = True
if matching_name_part(part, osm_tags[key]):
return True
if name_giving_key_present == False:
return True # nameless objects are matching, as matcher is supposed to be eager
# (we want false positives rather than false negatives at that stage)
return False
@@ -170,7 +175,6 @@ def get_matches(osm_data, atp_data):
# TODO: handle nearby objects with matching feature type or vacant ones
filtered_osm = filter_with_fuzzy_name_match(osm_data, filter_names)
osm_index = spatial_index.SpatialIndex(filtered_osm)
#print("filtering reduced entry count to", len(filtered_osm), "candidates based on names, now checking", len(atp_data), "ATP candidates by distance")
for atp in atp_data:
distance_scan_in_kilometers = config.maximum_missing_shop_distance_in_kilometers()
best_match = None

View file

@@ -84,9 +84,8 @@ class MismatchingNameReportCreator:
print()
def is_there_clear_name_match(self, entry):
# TODO get rid of operator, honestly it is a hack for Poczta Polska
atp_name_claims = {entry.atp_tags.get("name"), entry.atp_tags.get("brand"), entry.atp_tags.get("short_name"), entry.atp_tags.get("operator"), entry.atp_tags.get("atp_listing_name")}
osm_name_claims = {entry.osm_match_tags.get("name"), entry.osm_match_tags.get("brand"), entry.osm_match_tags.get("short_name"), entry.atp_tags.get("operator")}
atp_name_claims = {entry.atp_tags.get("name"), entry.atp_tags.get("brand"), entry.atp_tags.get("short_name"), entry.atp_tags.get("atp_listing_name")}
osm_name_claims = {entry.osm_match_tags.get("name"), entry.osm_match_tags.get("brand"), entry.osm_match_tags.get("short_name")}
atp_osm_full_name_match = atp_name_claims & osm_name_claims - {None}
return len(atp_osm_full_name_match) > 0

View file

@@ -47,10 +47,6 @@ class MatchingTests(unittest.TestCase):
name_part_list = matcher.get_filter_names_from_atp_dataset([{'tags': {'brand': 'denns b'}}])
self.assertEqual(matcher.is_matching_any_name_part_to_osm_tags(name_part_list, {'name': "lebioda"}), False)
def test_match_between_operator_and_brand(self):
name_part_list = matcher.get_filter_names_from_atp_dataset([{'tags': {'brand': 'Poczta Polska'}}])
self.assertEqual(matcher.is_matching_any_name_part_to_osm_tags(name_part_list, {'name': "FUP Kraków 28", "operator": "Poczta Polska"}), True)
def test_match_on_short_names(self):
name_part_list = matcher.get_filter_names_from_atp_dataset([{'tags': {'brand': 'GAP'}}])
self.assertEqual(matcher.is_matching_any_name_part_to_osm_tags(name_part_list, {'brand': "GAP"}), True)
@@ -89,6 +85,8 @@ class MatchingTests(unittest.TestCase):
osm_data = [MatchingTests.package_tags_into_mock(osm_tags)]
matches = matcher.get_matches(osm_data, atp_data)
if debug:
# this recreates matcher.get_matches but has some extra logging that is supposed to make debugging easier
# when tests fail for some reason
print()
# based on matcher.get_matches internals
filter_names = matcher.get_filter_names_from_atp_dataset(atp_data)
@@ -96,13 +94,15 @@ class MatchingTests(unittest.TestCase):
print("claimed name match", matcher.is_matching_any_name_part_to_osm_tags(filter_names, osm_tags))
# replicates matcher.is_matching_any_name_part_to_osm_tags
name_part_list = filter_names
name_giving_key_present = False
for part in name_part_list:
for key in ["name", "brand", "brand:en",
"operator",
]:
for key in matcher.name_giving_keys():
if key in osm_tags:
name_giving_key_present = True
if matcher.matching_name_part(part, osm_tags[key]):
print(part, "matches to osm_tags[key] with value", osm_tags[key], "for key", key)
if name_giving_key_present == False:
print("matched as nameless one")
print(matcher.filter_with_fuzzy_name_match(osm_data, filter_names))
print(str(matches[0]))
@@ -114,6 +114,12 @@ class MatchingTests(unittest.TestCase):
def test_reject_mismatches_based_on_object_type(self):
self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'shop': 'pastry'}, {'brand': "Titan", 'amenity': 'fuel'}), False)
def test_reject_mismatches_based_on_object_name(self):
self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'shop': 'pastry'}, {'brand': "Sauron", 'shop': 'pastry'}), False)
def test_accept_matches_for_nameless_ones(self):
self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'amenity': 'ice_cream'}, {'shop': 'ice_cream'}), True)
def test_accept_matches_for_ice_cream_synonyms(self):
self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'amenity': 'ice_cream'}, {'brand': "Titan", 'shop': 'ice_cream'}), True)

View file

@@ -5,6 +5,7 @@ import os
import json
import compute_local_time
import datetime
from collections import defaultdict
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
@@ -13,7 +14,6 @@ obtain_atp_data = __import__("2_obtain_atp_data")
def download():
for entry in obtain_atp_data.get_atp_history()[::-1]:
run_id = entry['run_id']
print(run_id)
obtain_atp_data.download_specific_atp_run(run_id)
@@ -35,6 +35,7 @@ def view():
# for zabka see
# https://github.com/alltheplaces/alltheplaces/issues/5888 https://github.com/alltheplaces/alltheplaces/pull/5586
atp_code = "zabka_pl"
collected_for_given_weekday = defaultdict(list)
for entry in obtain_atp_data.get_atp_history()[::-1]:
run_id = entry['run_id']
run_metadata = obtain_atp_data.get_atp_specific_spider_run_metadata(run_id, atp_code)
@@ -42,26 +43,19 @@ def view():
print(run_id)
folder = obtain_atp_data.atp_unpacked_folder(run_id)
source_atp_filepath = folder + atp_code + ".geojson"
if os.path.isfile(source_atp_filepath) == False:
print("file", source_atp_filepath, "is missing")
continue
with open(source_atp_filepath) as file:
try:
data = json.load(file)
rich.print("spider:collection_time", data['dataset_attributes'].get("spider:collection_time"))
for feature in data['features']:
if relevant_one(feature):
# rich.print(feature['properties']['addr:city'])
print()
opening_hours = feature['properties']['opening_hours']
rich.print(opening_hours)
oh = parser.OpeningHours(opening_hours)
oh.display()
for day in parser.OpeningHours.DAYS_OF_WEEK:
stringified = []
for time_selector in oh.parsed[day]:
stringified.append(str(time_selector)) # .from_hours, .from_minutes, .to_hours, .to_minutes
reduced_to_canonical_string = ",".join(stringified)
print(day, reduced_to_canonical_string)
print(opening_hours)
# rich.print(feature)
rich.print(feature['geometry']['coordinates'])
longitude = feature['geometry']['coordinates'][0]
latitude = feature['geometry']['coordinates'][1]
@@ -71,10 +65,23 @@ def view():
print('in local t', start_time)
print('in local t', start_time.date())
weekday_index = start_time.date().weekday()
oh = parser.OpeningHours(opening_hours)
oh.display()
for offset in range(7):
date = start_time.date() + datetime.timedelta(days=offset)
day_code = parser.OpeningHours.day_of_week_code(weekday_index + offset)
print(offset, date, day_code)
stringified = []
for time_selector in oh.parsed[day_code]:
stringified.append(str(time_selector)) # .from_hours, .from_minutes, .to_hours, .to_minutes
reduced_to_canonical_string = ",".join(stringified)
print(offset, date, day_code, reduced_to_canonical_string)
collected_for_given_weekday[day_code].append({"opening_hours": reduced_to_canonical_string, "date": date})
print('finish_time', run_metadata['finish_time'].replace("T", " "))
finish_time = compute_local_time.convert_utc_timestamp_string_to_local(run_metadata['finish_time'], latitude, longitude)
print('in local ti', finish_time)
@@ -86,7 +93,7 @@ def view():
except json.decoder.JSONDecodeError as e:
print(e)
rich.print(collected_for_given_weekday)
# {"type": "Feature", "id": "y0P1sd7RSJB-6RBe-vTNFRUDZEg=", "properties": {"ref": "Z2220", "@source_uri": "https://www.zabka.pl/app/uploads/locator-store-data.json", "@spider": "zabka_pl", "name:pl": "\u017babka", "name:uk": "\u0416\u0430\u0431\u043a\u0430", "shop": "convenience", "addr:street_address": "ul. Miko\u0142ajczyka/os. Kalinowe 12c", "addr:city": "Krak\u00f3w", "addr:country": "PL", "name": "\u017babka", "website": "https://www.zabka.pl/znajdz-sklep/ID02220-krakow-ul-mikolajczyka-os-kalinowe-12c/", "opening_hours": "Mo-Sa 06:00-23:00", "brand": "\u017babka", "brand:wikidata": "Q2589061", "nsi_id": "zabka-f55ead"}, "geometry": {"type": "Point", "coordinates": [20.014811, 50.092896]}},