1
0
Fork 0

split out finding repeated, not-yet-banned names

this way the fully automated process will not fail too early in processing
yes, these names should probably be listed
but not listing them is not the end of the world
and processing must work even with such false positives
This commit is contained in:
Mateusz Konieczny 2025-02-19 04:30:34 +01:00
parent a3fbe9f1d3
commit e17d42c5da
2 changed files with 55 additions and 51 deletions

View file

@ -0,0 +1,55 @@
import qa
import data_iterator
import shared
import matcher
def main():
    """Entry point: report commonly shared name parts missing from matcher.py."""
    find_missing_listing_of_commonly_shared_name_parts()
def find_missing_listing_of_commonly_shared_name_parts():
    """Scan every ATP spider and report name parts shared across many spiders
    that are missing from matcher.common_shared_name_parts().

    Prints each offending name part together with the spiders using it.

    Raises:
        Exception: if at least one sufficiently popular name part is unlisted.
    """
    # These spiders list multiple fuel station brands, so counting them
    # would double-count brands that also have their own spider.
    skipped_multibrand_spiders = {'gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva'}
    # name part -> list of pure spider codes using it
    name_parts_by_popularity = {}
    for atp_code in data_iterator.all_spider_codes_iterator():
        pure_code = remove_country_codes_from_spider_code(atp_code)
        if pure_code in skipped_multibrand_spiders:
            continue
        for_inspection = []
        for atp in matcher.load_atp_without_qa(atp_code):
            # avoid doing full qa for performance reasons
            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
            if atp['tags'] is not None:  # fixed: was the non-idiomatic "!= None"
                for_inspection.append(atp)
        for part in matcher.get_filter_names_from_atp_dataset(for_inspection):
            spiders = name_parts_by_popularity.setdefault(part, [])
            if pure_code not in spiders:
                spiders.append(pure_code)
    found_count = 0
    header_not_shown_yet = True
    # hoisted out of the loop — presumably a pure accessor (TODO confirm)
    common_parts = matcher.common_shared_name_parts()
    for part, spider_list in name_parts_by_popularity.items():
        # generic words need to appear in more spiders before being flagged
        threshold = 10 if part in ('super', 'big', 'mart', 'plus') else 6
        if len(spider_list) >= threshold and part not in common_parts:
            if header_not_shown_yet:
                print("entries missing in common_shared_name_parts() in matcher.py")
                header_not_shown_yet = False
            print(part, spider_list)
            found_count += 1
    if found_count > 0:
        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
def remove_country_codes_from_spider_code(atp_code):
    """Strip country-code segments (per shared.valid_country_codes()) from an
    underscore-separated ATP spider code and return the remainder."""
    kept_segments = [
        segment
        for segment in atp_code.split("_")
        if segment not in shared.valid_country_codes()
    ]
    return "_".join(kept_segments)
# Allow running this check directly as a script.
if __name__ == "__main__":
    main()

View file

@ -1,6 +1,3 @@
import qa
import shared
import matcher
import rich
import osm_bot_abstraction_layer.util_download_file
import json
@ -12,54 +9,6 @@ config = __import__("0_config")
def main():
    """Download the full ATP dataset, then check for name parts that are
    commonly shared between spiders but not listed in matcher.py."""
    download_entire_atp_dataset()
    find_missing_listing_of_commonly_shared_name_parts()
import data_iterator
def find_missing_listing_of_commonly_shared_name_parts():
    """Scan every ATP spider and report name parts shared across many spiders
    that are missing from matcher.common_shared_name_parts().

    Prints each offending name part together with the spiders using it.

    Raises:
        Exception: if at least one sufficiently popular name part is unlisted.
    """
    # These spiders list multiple fuel station brands, so counting them
    # would double-count brands that also have their own spider.
    skipped_multibrand_spiders = {'gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva'}
    # name part -> list of pure spider codes using it
    name_parts_by_popularity = {}
    for atp_code in data_iterator.all_spider_codes_iterator():
        pure_code = remove_country_codes_from_spider_code(atp_code)
        if pure_code in skipped_multibrand_spiders:
            continue
        for_inspection = []
        for atp in matcher.load_atp_without_qa(atp_code):
            # avoid doing full qa for performance reasons
            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
            if atp['tags'] is not None:  # fixed: was the non-idiomatic "!= None"
                for_inspection.append(atp)
        for part in matcher.get_filter_names_from_atp_dataset(for_inspection):
            spiders = name_parts_by_popularity.setdefault(part, [])
            if pure_code not in spiders:
                spiders.append(pure_code)
    found_count = 0
    header_not_shown_yet = True
    # hoisted out of the loop — presumably a pure accessor (TODO confirm)
    common_parts = matcher.common_shared_name_parts()
    for part, spider_list in name_parts_by_popularity.items():
        # generic words need to appear in more spiders before being flagged
        threshold = 10 if part in ('super', 'big', 'mart', 'plus') else 6
        if len(spider_list) >= threshold and part not in common_parts:
            if header_not_shown_yet:
                print("entries missing in common_shared_name_parts() in matcher.py")
                header_not_shown_yet = False
            print(part, spider_list)
            found_count += 1
    if found_count > 0:
        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
def remove_country_codes_from_spider_code(atp_code):
    """Strip country-code segments (per shared.valid_country_codes()) from an
    underscore-separated ATP spider code and return the remainder."""
    kept_segments = [
        segment
        for segment in atp_code.split("_")
        if segment not in shared.valid_country_codes()
    ]
    return "_".join(kept_segments)
def download_entire_atp_dataset():
FULL_ATP_FOLDER = config.atp_cache_folder()