From a3fbe9f1d3cbec44d118c0a6cfe4ee509dbc32fc Mon Sep 17 00:00:00 2001
From: Mateusz Konieczny <matkoniecz@gmail.com>
Date: Wed, 19 Feb 2025 04:17:42 +0100
Subject: [PATCH] move iterator to iterators

---
 12_generate_organic_map_bookmarks.py          |  3 +-
 17_list_mismatching_brand_wikidata.py         |  3 +-
 ...ascading_values_for_canonical_poi_types.py |  3 +-
 20_detect_unhandled_closed_poi.py             |  4 +-
 21_list_import_status.py                      |  3 +-
 2_obtain_atp_data.py                          | 45 +------------------
 5_generate_graticule_reports.py               |  8 ++--
 81_generate_atp_issue_tracker_report.py       |  4 +-
 82_list_unusual_tags_present_in_atp.py        |  4 +-
 ...ue_reports_about_poorly_matched_entries.py |  3 +-
 ...erate_atp_issue_reports_about_bad_names.py |  3 +-
 ...d_data_across_atp_to_trigger_log_output.py |  5 +--
 data_iterator.py                              | 42 +++++++++++++++++
 dump_all_atp_in_area.py                       |  4 +-
 14 files changed, 64 insertions(+), 70 deletions(-)

diff --git a/12_generate_organic_map_bookmarks.py b/12_generate_organic_map_bookmarks.py
index 52183ee..2f48a4e 100644
--- a/12_generate_organic_map_bookmarks.py
+++ b/12_generate_organic_map_bookmarks.py
@@ -5,7 +5,6 @@ import json
 import nominatim
 import shared
 import data_iterator
-obtain_atp_data = __import__("2_obtain_atp_data")
 config = __import__("0_config")
 graticule_report = __import__("5_generate_graticule_reports")
 
@@ -44,7 +43,7 @@ def main():
         missing_object_per_area[name] = []
         missing_opening_hours_per_area[name] = []
 
-    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+    for atp_code in data_iterator.all_spider_codes_iterator():
         for atp in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
             if atp.match_distance != None and atp.match_distance < config.missing_shop_distance_in_kilometers_for_specific_case(atp.atp_tags):
                 # not considered as clearly missing
diff --git a/17_list_mismatching_brand_wikidata.py b/17_list_mismatching_brand_wikidata.py
index 1d008ee..498660e 100644
--- a/17_list_mismatching_brand_wikidata.py
+++ b/17_list_mismatching_brand_wikidata.py
@@ -4,7 +4,6 @@ import os
 import serializing
 import show_data
 graticule_report = __import__("5_generate_graticule_reports")
-obtain_atp_data = __import__("2_obtain_atp_data")
 config = __import__("0_config")
 import shared
 import wikidata
@@ -126,7 +125,7 @@ def main():
     matching_via_parentage = 0
     no_longer_matching_at_all_according_to_matcher = 0
     wikidata_mismatch_brand_match_by_claimed_wikidata_id_in_atp = defaultdict(list)
-    for atp_code in obtain_atp_data.all_spider_codes_iterator(): # note, data is fetched from matches which may be based on a different version of ATP, with different spiders being broken or some missing/not yet present. But looking at files directly seems to be an overkill for such diagnosis tool.
+    for atp_code in data_iterator.all_spider_codes_iterator(): # note, data is fetched from matches which may be based on a different version of ATP, with different spiders being broken or some missing/not yet present. But looking at files directly seems to be an overkill for such diagnosis tool.
         for entry in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
             if entry.match_distance != None:
                 if entry.match_distance < config.missing_shop_distance_in_kilometers_for_specific_case(entry.atp_tags):
diff --git a/19_detect_unhandled_cascading_values_for_canonical_poi_types.py b/19_detect_unhandled_cascading_values_for_canonical_poi_types.py
index b5e57f3..6a916ac 100644
--- a/19_detect_unhandled_cascading_values_for_canonical_poi_types.py
+++ b/19_detect_unhandled_cascading_values_for_canonical_poi_types.py
@@ -1,7 +1,6 @@
 import rich
 import json
 config = __import__("0_config")
-obtain_atp_data = __import__("2_obtain_atp_data")
 
 reported = {}
 
@@ -32,7 +31,7 @@ def log_if_unhandled_cascading_found(tags):
 
 
 def main():
-    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+    for atp_code in data_iterator.all_spider_codes_iterator():
         if atp_code in config.ignored_atp_codes():
             continue
         filename = config.atp_unpacked_folder() + atp_code + '.geojson'
diff --git a/20_detect_unhandled_closed_poi.py b/20_detect_unhandled_closed_poi.py
index fb49625..6241dd5 100644
--- a/20_detect_unhandled_closed_poi.py
+++ b/20_detect_unhandled_closed_poi.py
@@ -2,7 +2,7 @@ import rich
 import json
 import qa
 config = __import__("0_config")
-obtain_atp_data = __import__("2_obtain_atp_data")
+import data_iterator
 
 def log_if_unhandled_closing_found(tags):
     for key, value in tags.items():
@@ -35,7 +35,7 @@ def log_if_unhandled_closing_found(tags):
 
 
 def main():
-    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+    for atp_code in data_iterator.all_spider_codes_iterator():
         if atp_code in config.ignored_atp_codes():
             continue
         filename = config.atp_unpacked_folder() + atp_code + '.geojson'
diff --git a/21_list_import_status.py b/21_list_import_status.py
index 0f52af9..c8ceb62 100644
--- a/21_list_import_status.py
+++ b/21_list_import_status.py
@@ -11,7 +11,6 @@ import generate_html
 import json
 import opening_hours_parser
 graticule_report = __import__("5_generate_graticule_reports")
-obtain_atp_data = __import__("2_obtain_atp_data")
 config = __import__("0_config")
 import wikidata
 import nominatim
@@ -97,7 +96,7 @@ def get_import_listing_configuration_for_atp_spiders():
     checked_keys_per_atp = {}
     known_unavailable_listings = {'opening_hours': {}, 'website': {}}
 
-    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+    for atp_code in data_iterator.all_spider_codes_iterator():
         if allowed_spider(atp_code) == False:
             continue
         checked_keys = []
diff --git a/2_obtain_atp_data.py b/2_obtain_atp_data.py
index 77255a6..ad89bdb 100644
--- a/2_obtain_atp_data.py
+++ b/2_obtain_atp_data.py
@@ -1,5 +1,4 @@
 import qa
-import random
 import shared
 import matcher
 import rich
@@ -15,10 +14,11 @@ def main():
     download_entire_atp_dataset()
     find_missing_listing_of_commonly_shared_name_parts()
 
+import data_iterator
 
 def find_missing_listing_of_commonly_shared_name_parts():
     name_parts_by_popularity = {}
-    for atp_code in all_spider_codes_iterator():
+    for atp_code in data_iterator.all_spider_codes_iterator():
         pure_code = remove_country_codes_from_spider_code(atp_code)
         if pure_code in ['gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva']:
             # lists multiple fuel station brands, resulting in double counting
@@ -61,47 +61,6 @@ def remove_country_codes_from_spider_code(atp_code):
     return "_".join(returned_parts)
-
-def spider_codes_and_filepaths_iterator_including_broken_data_ones():
-    """
-    this one is not parsing .geojson files so will be faster
-    """
-    directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
-    # TODO: there is no full match between spider codes and their filenames
-    # see https://github.com/alltheplaces/alltheplaces/issues/9687
-    file_list = []
-    for item in os.listdir(directory_path_with_unpacked_spider_data):
-        item_path = os.path.join(directory_path_with_unpacked_spider_data, item)
-        if os.path.isfile(item_path):
-            file_list.append(item_path)
-    # shuffling as having consistent order does not help
-    # and in cases where script reports error results in initial cases
-    # being fixed first and subsequent reruns waiting long for
-    # new logged problems
-    random.shuffle(file_list)
-    for item_path in file_list:
-        atp_code = item_path.replace(directory_path_with_unpacked_spider_data, "").replace('.geojson', "")
-        if atp_code in config.ignored_atp_codes():
-            continue
-        yield item_path, atp_code
-
-
-def all_spider_codes_iterator():
-    for _item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
-        yield atp_code
-
-
-def spider_codes_iterator_with_data():
-    for item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
-        with open(item_path, 'r') as file:
-            content = file.read()
-            try:
-                features = json.loads(content)["features"]
-                yield atp_code, features
-            except json.decoder.JSONDecodeError:
-                continue
-
-
 
 
 def download_entire_atp_dataset():
     FULL_ATP_FOLDER = config.atp_cache_folder()
     if os.path.isdir(FULL_ATP_FOLDER) == False:
diff --git a/5_generate_graticule_reports.py b/5_generate_graticule_reports.py
index d8efd1c..217d394 100644
--- a/5_generate_graticule_reports.py
+++ b/5_generate_graticule_reports.py
@@ -9,7 +9,7 @@ import leafleter.generator
 import os
 import math
 import shutil
-obtain_atp_data = __import__("2_obtain_atp_data")
+import data_iterator
 process_planet = __import__("4_process_planet_file")
 config = __import__("0_config")
 
@@ -225,7 +225,7 @@ def graticule_data_subpart_atp_data_merge_success_marker_filepath(area):
 
 
 def prepare_atp_graticule_files(graticule_coverage):
-    atp_codes = list(obtain_atp_data.all_spider_codes_iterator())
+    atp_codes = list(data_iterator.all_spider_codes_iterator())
     for index, atp_code in enumerate(atp_codes):
         spider_data_split_success_marker_filepath = graticule_data_specific_spider_subpart_success_marker_filepath(graticule_coverage, atp_code)
         if os.path.isfile(spider_data_split_success_marker_filepath):
@@ -269,7 +269,7 @@ def split_specific_spider_across_graticules(graticule_coverage, atp_code):
             raise
 
 def merge_atp_graticule_files(area):
-    atp_codes = list(obtain_atp_data.all_spider_codes_iterator())
+    atp_codes = list(data_iterator.all_spider_codes_iterator())
     delete_merged_graticule_files_for_atp_data_if_any(area)
     for lat_anchor in range(area['min_lat'], area['max_lat']):
         for lon_anchor in range(area['min_lon'], area['max_lon']):
@@ -432,7 +432,7 @@ def generate_report_for_given_graticule_or_return_cache_if_present(area_name, ar
 def generate_report_for_given_graticule(area_name, area, lat_anchor, lon_anchor):
     generated_filepaths = []
     report_generators = {}
-    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+    for atp_code in data_iterator.all_spider_codes_iterator():
         potential_match_results = match_output_for_spider_and_graticule(area, atp_code, lat_anchor, lon_anchor)
         if os.path.isfile(potential_match_results):
             matched = serializing.load_list_of_matches_from_csv(potential_match_results)
diff --git a/81_generate_atp_issue_tracker_report.py b/81_generate_atp_issue_tracker_report.py
index e6e8280..565f936 100644
--- a/81_generate_atp_issue_tracker_report.py
+++ b/81_generate_atp_issue_tracker_report.py
@@ -6,7 +6,7 @@ import json
 import os
 import rich
 import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
-obtain_atp_data = __import__("2_obtain_atp_data")
+import data_iterator
 qa = __import__("qa")
 config = __import__("0_config")
 
@@ -47,7 +47,7 @@ def main():
         reports['repeated_for_atp_issue_tracker'][key] = ""
         reports['repeated_machine_readable_for_config_updates'][key] = []
 
-    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+    for atp_code in data_iterator.all_spider_codes_iterator():
         reports = process_atp(atp_code, reports)
     show_reports(reports)
 
diff --git a/82_list_unusual_tags_present_in_atp.py b/82_list_unusual_tags_present_in_atp.py
index 32e6e55..4869cfb 100644
--- a/82_list_unusual_tags_present_in_atp.py
+++ b/82_list_unusual_tags_present_in_atp.py
@@ -3,8 +3,8 @@ import rich
 import serializing
 import taginfo
 import os
+import data_iterator
 config = __import__("0_config")
-obtain_atp_data = __import__("2_obtain_atp_data")
 graticule_report = __import__("5_generate_graticule_reports")
 import data_iterator
 
@@ -42,7 +42,7 @@ def collect_data():
     used_keys = {}
     used_tags = {}
     area = graticule_report.global_graticule_coverage()
-    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+    for atp_code in data_iterator.all_spider_codes_iterator():
         if atp_code in config.ignored_atp_codes():
             continue
         for atp in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
diff --git a/83_generate_atp_issue_reports_about_poorly_matched_entries.py b/83_generate_atp_issue_reports_about_poorly_matched_entries.py
index 58aeee4..16a5944 100644
--- a/83_generate_atp_issue_reports_about_poorly_matched_entries.py
+++ b/83_generate_atp_issue_reports_about_poorly_matched_entries.py
@@ -2,7 +2,6 @@ import os
 import serializing
 import show_data
 graticule_report = __import__("5_generate_graticule_reports")
-obtain_atp_data = __import__("2_obtain_atp_data")
 config = __import__("0_config")
 import data_iterator
 
@@ -200,7 +199,7 @@ def nothing_to_report_marker_filepath(atp_code):
 
 area = graticule_report.global_graticule_coverage()
 skipped_as_on_ignore_list_or_empty = []
-for atp_code in obtain_atp_data.all_spider_codes_iterator():
+for atp_code in data_iterator.all_spider_codes_iterator():
     if atp_code in ignored_atp_codes():
         skipped_as_on_ignore_list_or_empty.append(atp_code)
         continue
diff --git a/84_generate_atp_issue_reports_about_bad_names.py b/84_generate_atp_issue_reports_about_bad_names.py
index 74b7062..e110743 100644
--- a/84_generate_atp_issue_reports_about_bad_names.py
+++ b/84_generate_atp_issue_reports_about_bad_names.py
@@ -3,7 +3,6 @@ import os
 import serializing
 import show_data
 graticule_report = __import__("5_generate_graticule_reports")
-obtain_atp_data = __import__("2_obtain_atp_data")
 config = __import__("0_config")
 import data_iterator
 
@@ -15,7 +14,7 @@ def main():
     print("has name from ATP that seems to mismatch what is in OSM, while brand matches well. Note, this is based on subset of ATP data matched to OSM using brand field! So at least in theory there could be many cases where `name` from ATP matches while `brand` mismatches. But from specific cases you see a lot of what is rather `branch` and similar.")
     area = graticule_report.global_graticule_coverage()
-    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+    for atp_code in data_iterator.all_spider_codes_iterator():
         if atp_code in [
         ]:
             continue
 
diff --git a/89_run_remove_bad_data_across_atp_to_trigger_log_output.py b/89_run_remove_bad_data_across_atp_to_trigger_log_output.py
index d444066..6ec7aba 100644
--- a/89_run_remove_bad_data_across_atp_to_trigger_log_output.py
+++ b/89_run_remove_bad_data_across_atp_to_trigger_log_output.py
@@ -1,9 +1,8 @@
 import qa
-obtain_atp_data = __import__("2_obtain_atp_data")
-
+import data_iterator
 
 def main():
-    for atp_code, parsed_content in obtain_atp_data.spider_codes_iterator_with_data():
+    for atp_code, parsed_content in data_iterator.spider_codes_iterator_with_data():
         for entry in parsed_content:
             if entry['geometry'] == None:
                 # no point in bothering with such cases...
diff --git a/data_iterator.py b/data_iterator.py
index 256c65c..7196ac2 100644
--- a/data_iterator.py
+++ b/data_iterator.py
@@ -1,6 +1,9 @@
 graticule_report = __import__("5_generate_graticule_reports")
 import serializing
 import os
+import random
+import json
+config = __import__("0_config")
 
 def iterate_over_all_matches_for_specific_spider(area, atp_code):
     for lat_anchor in range(area['min_lat'], area['max_lat']):
@@ -10,3 +13,42 @@ def iterate_over_all_matches_for_specific_spider(area, atp_code):
             match_list = serializing.load_list_of_matches_from_csv(output)
             for entry in match_list:
                 yield entry
+
+def spider_codes_iterator_with_data():
+    for item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
+        with open(item_path, 'r') as file:
+            content = file.read()
+            try:
+                features = json.loads(content)["features"]
+                yield atp_code, features
+            except json.decoder.JSONDecodeError:
+                continue
+
+
+def all_spider_codes_iterator():
+    for _item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
+        yield atp_code
+
+
+def spider_codes_and_filepaths_iterator_including_broken_data_ones():
+    """
+    this one is not parsing .geojson files so will be faster
+    """
+    directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
+    # TODO: there is no full match between spider codes and their filenames
+    # see https://github.com/alltheplaces/alltheplaces/issues/9687
+    file_list = []
+    for item in os.listdir(directory_path_with_unpacked_spider_data):
+        item_path = os.path.join(directory_path_with_unpacked_spider_data, item)
+        if os.path.isfile(item_path):
+            file_list.append(item_path)
+    # shuffling as having consistent order does not help
+    # and in cases where script reports error results in initial cases
+    # being fixed first and subsequent reruns waiting long for
+    # new logged problems
+    random.shuffle(file_list)
+    for item_path in file_list:
+        atp_code = item_path.replace(directory_path_with_unpacked_spider_data, "").replace('.geojson', "")
+        if atp_code in config.ignored_atp_codes():
+            continue
+        yield item_path, atp_code
diff --git a/dump_all_atp_in_area.py b/dump_all_atp_in_area.py
index 3a043c4..2065538 100644
--- a/dump_all_atp_in_area.py
+++ b/dump_all_atp_in_area.py
@@ -6,8 +6,8 @@ import csv
 import json
 import nominatim
 import shared
+import data_iterator
 
-obtain_atp_data = __import__("2_obtain_atp_data")
 config = __import__("0_config")
 bookmarks = __import__("5_generate_organic_map_bookmarks")
 
@@ -21,7 +21,7 @@ def main():
     }
     for area_name, area in areas.items():
         collected = []
-        for code, data in obtain_atp_data.spider_codes_iterator_with_data():
+        for code, data in data_iterator.spider_codes_iterator_with_data():
             print(code)
             for entry in data:
                 if 'opening_hours' in entry['properties']:
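
After this patch the shared iterators live in data_iterator.py, so callers import data_iterator instead of 2_obtain_atp_data. A minimal usage sketch, assuming the data_iterator.py added above is on the import path; count_features_per_spider is a hypothetical helper written for illustration, not part of this commit:

    import data_iterator

    def count_features_per_spider():
        # hypothetical helper, not part of the patched codebase
        # spider_codes_iterator_with_data() yields (atp_code, features) pairs
        # in random order, skipping spiders listed in config.ignored_atp_codes()
        # and .geojson files that fail to parse as JSON
        counts = {}
        for atp_code, features in data_iterator.spider_codes_iterator_with_data():
            counts[atp_code] = len(features)
        return counts

    if __name__ == "__main__":
        print(count_features_per_spider())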