mirror of
https://codeberg.org/matkoniecz/list_how_openstreetmap_can_be_improved_with_alltheplaces_data.git
synced 2025-04-11 10:09:29 +02:00
split finding repeated not banned names
this way the fully automated process will not fail too early in processing; yes, these names should probably be listed, but not listing them is not the end of the world, and processing must work even with such false positives
This commit is contained in:
parent
a3fbe9f1d3
commit
e17d42c5da
2 changed files with 55 additions and 51 deletions
55
10_find_missing_banned_named_parts.py
Normal file
55
10_find_missing_banned_named_parts.py
Normal file
|
@ -0,0 +1,55 @@
|
|||
import qa
|
||||
import data_iterator
|
||||
import shared
|
||||
import matcher
|
||||
|
||||
def main():
    """Entry point: verify that matcher.common_shared_name_parts() is complete."""
    find_missing_listing_of_commonly_shared_name_parts()
|
||||
|
||||
def find_missing_listing_of_commonly_shared_name_parts():
    """Report name parts shared across many spiders but missing from
    matcher.common_shared_name_parts().

    Iterates over every ATP spider, collects name parts produced by
    matcher.get_filter_names_from_atp_dataset(), counts how many distinct
    (country-code-stripped) spiders share each part, and prints any part
    popular enough that it should be listed in matcher.py.

    Raises:
        Exception: when at least one missing entry was found, so the
            caller notices that matcher.py needs updating.
    """
    # name part -> list of deduplicated spider codes (without country codes)
    name_parts_by_popularity = {}
    for atp_code in data_iterator.all_spider_codes_iterator():
        pure_code = remove_country_codes_from_spider_code(atp_code)
        if pure_code in ['gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva']:
            # lists multiple fuel station brands, resulting in double counting
            continue
        loaded_atp_data = matcher.load_atp_without_qa(atp_code)
        for_inspection = []
        for atp in loaded_atp_data:
            # avoid doing full qa for performance reasons
            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
            # fixed: identity comparison with None (PEP 8), not != None
            if atp['tags'] is not None:
                for_inspection.append(atp)
        split_name = matcher.get_filter_names_from_atp_dataset(for_inspection)
        for part in split_name:
            # setdefault replaces the manual "if part not in dict" dance
            spiders = name_parts_by_popularity.setdefault(part, [])
            if pure_code not in spiders:
                spiders.append(pure_code)
    found_count = 0
    header_not_shown_yet = True
    for part, spider_list in name_parts_by_popularity.items():
        # very generic words must be shared by more spiders before they
        # count as noteworthy
        threshold = 10 if part in ['super', 'big', 'mart', 'plus'] else 6
        if len(spider_list) >= threshold:
            if part not in matcher.common_shared_name_parts():
                if header_not_shown_yet:
                    print("entries missing in common_shared_name_parts() in matcher.py")
                    header_not_shown_yet = False
                print(part, spider_list)
                found_count += 1
    if found_count > 0:
        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
|
||||
|
||||
|
||||
def remove_country_codes_from_spider_code(atp_code):
    """Strip country-code segments from a spider code.

    Splits *atp_code* on underscores and drops every segment that is a
    valid country code according to shared.valid_country_codes(),
    e.g. "mcdonalds_us" -> "mcdonalds".

    Args:
        atp_code: spider code string, segments separated by underscores.

    Returns:
        The spider code with all country-code segments removed.
    """
    # hoisted out of the loop: the original called this once per segment
    country_codes = shared.valid_country_codes()
    kept_segments = [segment for segment in atp_code.split("_") if segment not in country_codes]
    return "_".join(kept_segments)
|
||||
|
||||
|
||||
# run the check only when executed as a script, not when imported
if __name__ == "__main__":
    main()
|
|
@ -1,6 +1,3 @@
|
|||
import qa
|
||||
import shared
|
||||
import matcher
|
||||
import rich
|
||||
import osm_bot_abstraction_layer.util_download_file
|
||||
import json
|
||||
|
@ -12,54 +9,6 @@ config = __import__("0_config")
|
|||
|
||||
def main():
    """Entry point: fetch the full ATP dataset, then verify shared name parts."""
    download_entire_atp_dataset()
    find_missing_listing_of_commonly_shared_name_parts()
|
||||
|
||||
import data_iterator
|
||||
|
||||
def find_missing_listing_of_commonly_shared_name_parts():
    """Report name parts shared across many spiders but missing from
    matcher.common_shared_name_parts().

    Iterates over every ATP spider, collects name parts produced by
    matcher.get_filter_names_from_atp_dataset(), counts how many distinct
    (country-code-stripped) spiders share each part, and prints any part
    popular enough that it should be listed in matcher.py.

    Raises:
        Exception: when at least one missing entry was found, so the
            caller notices that matcher.py needs updating.
    """
    # name part -> list of deduplicated spider codes (without country codes)
    name_parts_by_popularity = {}
    for atp_code in data_iterator.all_spider_codes_iterator():
        pure_code = remove_country_codes_from_spider_code(atp_code)
        if pure_code in ['gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva']:
            # lists multiple fuel station brands, resulting in double counting
            continue
        loaded_atp_data = matcher.load_atp_without_qa(atp_code)
        for_inspection = []
        for atp in loaded_atp_data:
            # avoid doing full qa for performance reasons
            atp['tags'] = qa.handle_name_and_brand_tags(atp['tags'], atp_code)
            # fixed: identity comparison with None (PEP 8), not != None
            if atp['tags'] is not None:
                for_inspection.append(atp)
        split_name = matcher.get_filter_names_from_atp_dataset(for_inspection)
        for part in split_name:
            # setdefault replaces the manual "if part not in dict" dance
            spiders = name_parts_by_popularity.setdefault(part, [])
            if pure_code not in spiders:
                spiders.append(pure_code)
    found_count = 0
    header_not_shown_yet = True
    for part, spider_list in name_parts_by_popularity.items():
        # very generic words must be shared by more spiders before they
        # count as noteworthy
        threshold = 10 if part in ['super', 'big', 'mart', 'plus'] else 6
        if len(spider_list) >= threshold:
            if part not in matcher.common_shared_name_parts():
                if header_not_shown_yet:
                    print("entries missing in common_shared_name_parts() in matcher.py")
                    header_not_shown_yet = False
                print(part, spider_list)
                found_count += 1
    if found_count > 0:
        raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
|
||||
|
||||
|
||||
def remove_country_codes_from_spider_code(atp_code):
    """Strip country-code segments from a spider code.

    Splits *atp_code* on underscores and drops every segment that is a
    valid country code according to shared.valid_country_codes(),
    e.g. "mcdonalds_us" -> "mcdonalds".

    Args:
        atp_code: spider code string, segments separated by underscores.

    Returns:
        The spider code with all country-code segments removed.
    """
    # hoisted out of the loop: the original called this once per segment
    country_codes = shared.valid_country_codes()
    kept_segments = [segment for segment in atp_code.split("_") if segment not in country_codes]
    return "_".join(kept_segments)
|
||||
|
||||
|
||||
def download_entire_atp_dataset():
|
||||
FULL_ATP_FOLDER = config.atp_cache_folder()
|
||||
|
|
Loading…
Add table
Reference in a new issue