From e17d42c5da41ef2b66925da3eb9a770998177a13 Mon Sep 17 00:00:00 2001
From: Mateusz Konieczny <matkoniecz@gmail.com>
Date: Wed, 19 Feb 2025 04:30:34 +0100
Subject: [PATCH] split finding repeated, not yet banned name parts into its
 own script

this way the fully automated process will not fail too early in processing

yes, these names should probably be listed, but not listing them is not
the end of the world and processing must work even with such false
positives
---
 10_find_missing_banned_named_parts.py | 55 +++++++++++++++++++++++++++
 2_obtain_atp_data.py                  | 51 -------------------------
 2 files changed, 55 insertions(+), 51 deletions(-)
 create mode 100644 10_find_missing_banned_named_parts.py

diff --git a/10_find_missing_banned_named_parts.py b/10_find_missing_banned_named_parts.py
new file mode 100644
index 0000000..b9d3ec1
--- /dev/null
+++ b/10_find_missing_banned_named_parts.py
@@ -0,0 +1,55 @@
+import qa
+import data_iterator
+import shared
+import matcher
+
+def main():
+    find_missing_listing_of_commonly_shared_name_parts()
+
+def find_missing_listing_of_commonly_shared_name_parts():
+    name_parts_by_popularity = {}
+    for atp_code in data_iterator.all_spider_codes_iterator():
+        pure_code = remove_country_codes_from_spider_code(atp_code)
+        if pure_code in ['gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva']:
+            # lists multiple fuel station brands, resulting in double counting
+            continue
+        loaded_atp_data = matcher.load_atp_without_qa(atp_code)
+        for_inspection = []
+        for atp in loaded_atp_data:
+            # avoid doing full qa for performance reasons
+            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
+            if atp['tags'] != None:
+                for_inspection.append(atp)
+        split_name = matcher.get_filter_names_from_atp_dataset(for_inspection)
+        for part in split_name:
+            if part not in name_parts_by_popularity:
+                name_parts_by_popularity[part] = []
+            if pure_code not in name_parts_by_popularity[part]:
+                name_parts_by_popularity[part].append(pure_code)
+    found_count = 0
+    header_not_shown_yet = True
+    for part, spider_list in name_parts_by_popularity.items():
+        threshold = 6
+        if part in ['super', 'big', 'mart', 'plus']:
+            threshold = 10
+        if len(spider_list) >= threshold:
+            if part not in matcher.common_shared_name_parts():
+                if header_not_shown_yet:
+                    print("entries missing in common_shared_name_parts() in matcher.py")
+                    header_not_shown_yet = False
+                print(part, spider_list)
+                found_count += 1
+    if found_count > 0:
+        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
+
+
+def remove_country_codes_from_spider_code(atp_code):
+    returned_parts = []
+    for part in atp_code.split("_"):
+        if part not in shared.valid_country_codes():
+            returned_parts.append(part)
+    return "_".join(returned_parts)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/2_obtain_atp_data.py b/2_obtain_atp_data.py
index ad89bdb..6499153 100644
--- a/2_obtain_atp_data.py
+++ b/2_obtain_atp_data.py
@@ -1,6 +1,3 @@
-import qa
-import shared
-import matcher
 import rich
 import osm_bot_abstraction_layer.util_download_file
 import json
@@ -12,54 +9,6 @@ config = __import__("0_config")
 
 def main():
     download_entire_atp_dataset()
-    find_missing_listing_of_commonly_shared_name_parts()
-
-import data_iterator
-
-def find_missing_listing_of_commonly_shared_name_parts():
-    name_parts_by_popularity = {}
-    for atp_code in data_iterator.all_spider_codes_iterator():
-        pure_code = remove_country_codes_from_spider_code(atp_code)
-        if pure_code in ['gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva']:
-            # lists multiple fuel station brands, resulting in double counting
-            continue
-        loaded_atp_data = matcher.load_atp_without_qa(atp_code)
-        for_inspection = []
-        for atp in loaded_atp_data:
-            # avoid doing full qa for performance reasons
-            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
-            if atp['tags'] != None:
-                for_inspection.append(atp)
-        split_name = matcher.get_filter_names_from_atp_dataset(for_inspection)
-        for part in split_name:
-            if part not in name_parts_by_popularity:
-                name_parts_by_popularity[part] = []
-            if pure_code not in name_parts_by_popularity[part]:
-                name_parts_by_popularity[part].append(pure_code)
-    found_count = 0
-    header_not_shown_yet = True
-    for part, spider_list in name_parts_by_popularity.items():
-        threshold = 6
-        if part in ['super', 'big', 'mart', 'plus']:
-            threshold = 10
-        if len(spider_list) >= threshold:
-            if part not in matcher.common_shared_name_parts():
-                if header_not_shown_yet:
-                    print("entries missing in common_shared_name_parts() in matcher.py")
-                    header_not_shown_yet = False
-                print(part, spider_list)
-                found_count += 1
-    if found_count > 0:
-        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
-
-
-def remove_country_codes_from_spider_code(atp_code):
-    returned_parts = []
-    for part in atp_code.split("_"):
-        if part not in shared.valid_country_codes():
-            returned_parts.append(part)
-    return "_".join(returned_parts)
-
 
 def download_entire_atp_dataset():
     FULL_ATP_FOLDER = config.atp_cache_folder()