From e17d42c5da41ef2b66925da3eb9a770998177a13 Mon Sep 17 00:00:00 2001
From: Mateusz Konieczny <matkoniecz@gmail.com>
Date: Wed, 19 Feb 2025 04:30:34 +0100
Subject: [PATCH] split finding repeated, not yet banned name parts into its
 own script

this way the fully automated process will not fail too early in processing

yes, these names should probably be listed, but not listing them is not
the end of the world and processing must work even with such false
positives
---
 10_find_missing_banned_named_parts.py | 55 +++++++++++++++++++++++++++
 2_obtain_atp_data.py                  | 51 -------------------------
 2 files changed, 55 insertions(+), 51 deletions(-)
 create mode 100644 10_find_missing_banned_named_parts.py

diff --git a/10_find_missing_banned_named_parts.py b/10_find_missing_banned_named_parts.py
new file mode 100644
index 0000000..b9d3ec1
--- /dev/null
+++ b/10_find_missing_banned_named_parts.py
@@ -0,0 +1,55 @@
+import qa
+import data_iterator
+import shared
+import matcher
+
+def main():
+    find_missing_listing_of_commonly_shared_name_parts()
+
+def find_missing_listing_of_commonly_shared_name_parts():
+    name_parts_by_popularity = {}
+    for atp_code in data_iterator.all_spider_codes_iterator():
+        pure_code = remove_country_codes_from_spider_code(atp_code)
+        if pure_code in ['gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva']:
+            # lists multiple fuel station brands, resulting in double counting
+            continue
+        loaded_atp_data = matcher.load_atp_without_qa(atp_code)
+        for_inspection = []
+        for atp in loaded_atp_data:
+            # avoid doing full qa for performance reasons
+            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
+            if atp['tags'] != None:
+                for_inspection.append(atp)
+        split_name = matcher.get_filter_names_from_atp_dataset(for_inspection)
+        for part in split_name:
+            if part not in name_parts_by_popularity:
+                name_parts_by_popularity[part] = []
+            if pure_code not in name_parts_by_popularity[part]:
+                name_parts_by_popularity[part].append(pure_code)
+    found_count = 0
+    header_not_shown_yet = True
+    for part, spider_list in name_parts_by_popularity.items():
+        threshold = 6
+        if part in ['super', 'big', 'mart', 'plus']:
+            threshold = 10
+        if len(spider_list) >= threshold:
+            if part not in matcher.common_shared_name_parts():
+                if header_not_shown_yet:
+                    print("entries missing in common_shared_name_parts() in matcher.py")
+                    header_not_shown_yet = False
+                print(part, spider_list)
+                found_count += 1
+    if found_count > 0:
+        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
+
+
+def remove_country_codes_from_spider_code(atp_code):
+    returned_parts = []
+    for part in atp_code.split("_"):
+        if part not in shared.valid_country_codes():
+            returned_parts.append(part)
+    return "_".join(returned_parts)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/2_obtain_atp_data.py b/2_obtain_atp_data.py
index ad89bdb..6499153 100644
--- a/2_obtain_atp_data.py
+++ b/2_obtain_atp_data.py
@@ -1,6 +1,3 @@
-import qa
-import shared
-import matcher
 import rich
 import osm_bot_abstraction_layer.util_download_file
 import json
@@ -12,54 +9,6 @@ config = __import__("0_config")
 
 def main():
     download_entire_atp_dataset()
-    find_missing_listing_of_commonly_shared_name_parts()
-
-import data_iterator
-
-def find_missing_listing_of_commonly_shared_name_parts():
-    name_parts_by_popularity = {}
-    for atp_code in data_iterator.all_spider_codes_iterator():
-        pure_code = remove_country_codes_from_spider_code(atp_code)
-        if pure_code in ['gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva']:
-            # lists multiple fuel station brands, resulting in double counting
-            continue
-        loaded_atp_data = matcher.load_atp_without_qa(atp_code)
-        for_inspection = []
-        for atp in loaded_atp_data:
-            # avoid doing full qa for performance reasons
-            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
-            if atp['tags'] != None:
-                for_inspection.append(atp)
-        split_name = matcher.get_filter_names_from_atp_dataset(for_inspection)
-        for part in split_name:
-            if part not in name_parts_by_popularity:
-                name_parts_by_popularity[part] = []
-            if pure_code not in name_parts_by_popularity[part]:
-                name_parts_by_popularity[part].append(pure_code)
-    found_count = 0
-    header_not_shown_yet = True
-    for part, spider_list in name_parts_by_popularity.items():
-        threshold = 6
-        if part in ['super', 'big', 'mart', 'plus']:
-            threshold = 10
-        if len(spider_list) >= threshold:
-            if part not in matcher.common_shared_name_parts():
-                if header_not_shown_yet:
-                    print("entries missing in common_shared_name_parts() in matcher.py")
-                    header_not_shown_yet = False
-                print(part, spider_list)
-                found_count += 1
-    if found_count > 0:
-        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
-
-
-def remove_country_codes_from_spider_code(atp_code):
-    returned_parts = []
-    for part in atp_code.split("_"):
-        if part not in shared.valid_country_codes():
-            returned_parts.append(part)
-    return "_".join(returned_parts)
-
 
 def download_entire_atp_dataset():
     FULL_ATP_FOLDER = config.atp_cache_folder()