diff --git a/10_find_missing_banned_named_parts.py b/10_find_missing_banned_named_parts.py
new file mode 100644
index 0000000..b9d3ec1
--- /dev/null
+++ b/10_find_missing_banned_named_parts.py
@@ -0,0 +1,55 @@
+import qa
+import data_iterator
+import shared
+import matcher
+
+def main():
+    find_missing_listing_of_commonly_shared_name_parts()
+
+def find_missing_listing_of_commonly_shared_name_parts():
+    name_parts_by_popularity = {}
+    for atp_code in data_iterator.all_spider_codes_iterator():
+        pure_code = remove_country_codes_from_spider_code(atp_code)
+        if pure_code in ['gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva']:
+            # lists multiple fuel station brands, resulting in double counting
+            continue
+        loaded_atp_data = matcher.load_atp_without_qa(atp_code)
+        for_inspection = []
+        for atp in loaded_atp_data:
+            # avoid doing full qa for performance reasons
+            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
+            if atp['tags'] is not None:
+                for_inspection.append(atp)
+        split_name = matcher.get_filter_names_from_atp_dataset(for_inspection)
+        for part in split_name:
+            if part not in name_parts_by_popularity:
+                name_parts_by_popularity[part] = []
+            if pure_code not in name_parts_by_popularity[part]:
+                name_parts_by_popularity[part].append(pure_code)
+    found_count = 0
+    header_not_shown_yet = True
+    for part, spider_list in name_parts_by_popularity.items():
+        threshold = 6
+        if part in ['super', 'big', 'mart', 'plus']:
+            threshold = 10
+        if len(spider_list) >= threshold:
+            if part not in matcher.common_shared_name_parts():
+                if header_not_shown_yet:
+                    print("entries missing in common_shared_name_parts() in matcher.py")
+                    header_not_shown_yet = False
+                print(part, spider_list)
+                found_count += 1
+    if found_count > 0:
+        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
+
+
+def remove_country_codes_from_spider_code(atp_code):
+    returned_parts = []
+    for part in atp_code.split("_"):
+        if part not in shared.valid_country_codes():
+            returned_parts.append(part)
+    return "_".join(returned_parts)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/2_obtain_atp_data.py b/2_obtain_atp_data.py
index ad89bdb..6499153 100644
--- a/2_obtain_atp_data.py
+++ b/2_obtain_atp_data.py
@@ -1,6 +1,3 @@
-import qa
-import shared
-import matcher
 import rich
 import osm_bot_abstraction_layer.util_download_file
 import json
@@ -12,54 +9,6 @@ config = __import__("0_config")
 
 def main():
     download_entire_atp_dataset()
-    find_missing_listing_of_commonly_shared_name_parts()
-
-import data_iterator
-
-def find_missing_listing_of_commonly_shared_name_parts():
-    name_parts_by_popularity = {}
-    for atp_code in data_iterator.all_spider_codes_iterator():
-        pure_code = remove_country_codes_from_spider_code(atp_code)
-        if pure_code in ['gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva']:
-            # lists multiple fuel station brands, resulting in double counting
-            continue
-        loaded_atp_data = matcher.load_atp_without_qa(atp_code)
-        for_inspection = []
-        for atp in loaded_atp_data:
-            # avoid doing full qa for performance reasons
-            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
-            if atp['tags'] != None:
-                for_inspection.append(atp)
-        split_name = matcher.get_filter_names_from_atp_dataset(for_inspection)
-        for part in split_name:
-            if part not in name_parts_by_popularity:
-                name_parts_by_popularity[part] = []
-            if pure_code not in name_parts_by_popularity[part]:
-                name_parts_by_popularity[part].append(pure_code)
-    found_count = 0
-    header_not_shown_yet = True
-    for part, spider_list in name_parts_by_popularity.items():
-        threshold = 6
-        if part in ['super', 'big', 'mart', 'plus']:
-            threshold = 10
-        if len(spider_list) >= threshold:
-            if part not in matcher.common_shared_name_parts():
-                if header_not_shown_yet:
-                    print("entries missing in common_shared_name_parts() in matcher.py")
-                    header_not_shown_yet = False
-                print(part, spider_list)
-                found_count += 1
-    if found_count > 0:
-        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
-
-
-def remove_country_codes_from_spider_code(atp_code):
-    returned_parts = []
-    for part in atp_code.split("_"):
-        if part not in shared.valid_country_codes():
-            returned_parts.append(part)
-    return "_".join(returned_parts)
-
 
 def download_entire_atp_dataset():
     FULL_ATP_FOLDER = config.atp_cache_folder()