1
0
Fork 0

split out finding repeated, not-yet-banned names

this way the fully automated process will not fail too early in processing
yes, these names should probably be listed
but not listing them is not the end of the world
and processing must work even with such false positives
This commit is contained in:
Mateusz Konieczny 2025-02-19 04:30:34 +01:00
parent a3fbe9f1d3
commit e17d42c5da
2 changed files with 55 additions and 51 deletions

View file

@ -0,0 +1,55 @@
import qa
import data_iterator
import shared
import matcher
def main():
    """Entry point: report commonly shared name parts missing from matcher.py."""
    find_missing_listing_of_commonly_shared_name_parts()
def find_missing_listing_of_commonly_shared_name_parts():
    """Scan every ATP spider and report name parts shared across many spiders
    that are missing from matcher.common_shared_name_parts().

    Prints each offending name part together with the spiders using it.

    Raises:
        Exception: if at least one sufficiently popular name part is unlisted.
    """
    # These spiders list multiple fuel station brands, so counting them
    # would double-count brands that also have their own spider.
    skipped_multibrand_spiders = {'gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva'}
    # name part -> list of pure spider codes using it
    name_parts_by_popularity = {}
    for atp_code in data_iterator.all_spider_codes_iterator():
        pure_code = remove_country_codes_from_spider_code(atp_code)
        if pure_code in skipped_multibrand_spiders:
            continue
        for_inspection = []
        for atp in matcher.load_atp_without_qa(atp_code):
            # avoid doing full qa for performance reasons
            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
            if atp['tags'] is not None:  # fixed: was the non-idiomatic "!= None"
                for_inspection.append(atp)
        for part in matcher.get_filter_names_from_atp_dataset(for_inspection):
            spiders = name_parts_by_popularity.setdefault(part, [])
            if pure_code not in spiders:
                spiders.append(pure_code)
    found_count = 0
    header_not_shown_yet = True
    # hoisted out of the loop — presumably a pure accessor (TODO confirm)
    common_parts = matcher.common_shared_name_parts()
    for part, spider_list in name_parts_by_popularity.items():
        # generic words need to appear in more spiders before being flagged
        threshold = 10 if part in ('super', 'big', 'mart', 'plus') else 6
        if len(spider_list) >= threshold and part not in common_parts:
            if header_not_shown_yet:
                print("entries missing in common_shared_name_parts() in matcher.py")
                header_not_shown_yet = False
            print(part, spider_list)
            found_count += 1
    if found_count > 0:
        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
def remove_country_codes_from_spider_code(atp_code):
    """Strip country-code segments (per shared.valid_country_codes()) from an
    underscore-separated ATP spider code and return the remainder."""
    kept_segments = [
        segment
        for segment in atp_code.split("_")
        if segment not in shared.valid_country_codes()
    ]
    return "_".join(kept_segments)
# Allow running this check directly as a script.
if __name__ == "__main__":
    main()

View file

@ -1,6 +1,3 @@
import qa
import shared
import matcher
import rich
import osm_bot_abstraction_layer.util_download_file
import json
@ -12,54 +9,6 @@ config = __import__("0_config")
def main():
    """Download the full ATP dataset, then check for name parts that are
    commonly shared between spiders but not listed in matcher.py."""
    download_entire_atp_dataset()
    find_missing_listing_of_commonly_shared_name_parts()
import data_iterator
def find_missing_listing_of_commonly_shared_name_parts():
    """Scan every ATP spider and report name parts shared across many spiders
    that are missing from matcher.common_shared_name_parts().

    Prints each offending name part together with the spiders using it.

    Raises:
        Exception: if at least one sufficiently popular name part is unlisted.
    """
    # These spiders list multiple fuel station brands, so counting them
    # would double-count brands that also have their own spider.
    skipped_multibrand_spiders = {'gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva'}
    # name part -> list of pure spider codes using it
    name_parts_by_popularity = {}
    for atp_code in data_iterator.all_spider_codes_iterator():
        pure_code = remove_country_codes_from_spider_code(atp_code)
        if pure_code in skipped_multibrand_spiders:
            continue
        for_inspection = []
        for atp in matcher.load_atp_without_qa(atp_code):
            # avoid doing full qa for performance reasons
            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
            if atp['tags'] is not None:  # fixed: was the non-idiomatic "!= None"
                for_inspection.append(atp)
        for part in matcher.get_filter_names_from_atp_dataset(for_inspection):
            spiders = name_parts_by_popularity.setdefault(part, [])
            if pure_code not in spiders:
                spiders.append(pure_code)
    found_count = 0
    header_not_shown_yet = True
    # hoisted out of the loop — presumably a pure accessor (TODO confirm)
    common_parts = matcher.common_shared_name_parts()
    for part, spider_list in name_parts_by_popularity.items():
        # generic words need to appear in more spiders before being flagged
        threshold = 10 if part in ('super', 'big', 'mart', 'plus') else 6
        if len(spider_list) >= threshold and part not in common_parts:
            if header_not_shown_yet:
                print("entries missing in common_shared_name_parts() in matcher.py")
                header_not_shown_yet = False
            print(part, spider_list)
            found_count += 1
    if found_count > 0:
        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
def remove_country_codes_from_spider_code(atp_code):
    """Strip country-code segments (per shared.valid_country_codes()) from an
    underscore-separated ATP spider code and return the remainder."""
    kept_segments = [
        segment
        for segment in atp_code.split("_")
        if segment not in shared.valid_country_codes()
    ]
    return "_".join(kept_segments)
def download_entire_atp_dataset():
FULL_ATP_FOLDER = config.atp_cache_folder()