
move iterator to iterators

Mateusz Konieczny 2025-02-19 04:17:42 +01:00
parent e7feeb0d02
commit a3fbe9f1d3
14 changed files with 64 additions and 70 deletions

View file

@@ -5,7 +5,6 @@ import json
import nominatim
import shared
import data_iterator
-obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
graticule_report = __import__("5_generate_graticule_reports")
@@ -44,7 +43,7 @@ def main():
missing_object_per_area[name] = []
missing_opening_hours_per_area[name] = []
-for atp_code in obtain_atp_data.all_spider_codes_iterator():
+for atp_code in data_iterator.all_spider_codes_iterator():
for atp in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
if atp.match_distance != None and atp.match_distance < config.missing_shop_distance_in_kilometers_for_specific_case(atp.atp_tags):
# not considered as clearly missing

View file

@@ -4,7 +4,6 @@ import os
import serializing
import show_data
graticule_report = __import__("5_generate_graticule_reports")
-obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
import shared
import wikidata
@@ -126,7 +125,7 @@ def main():
matching_via_parentage = 0
no_longer_matching_at_all_according_to_matcher = 0
wikidata_mismatch_brand_match_by_claimed_wikidata_id_in_atp = defaultdict(list)
-for atp_code in obtain_atp_data.all_spider_codes_iterator(): # note, data is fetched from matches which may be based on a different version of ATP, with different spiders being broken or some missing/not yet present. But looking at files directly seems to be an overkill for such diagnosis tool.
+for atp_code in data_iterator.all_spider_codes_iterator(): # note, data is fetched from matches which may be based on a different version of ATP, with different spiders being broken or some missing/not yet present. But looking at files directly seems to be an overkill for such diagnosis tool.
for entry in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
if entry.match_distance != None:
if entry.match_distance < config.missing_shop_distance_in_kilometers_for_specific_case(entry.atp_tags):

View file

@@ -1,7 +1,6 @@
import rich
import json
config = __import__("0_config")
-obtain_atp_data = __import__("2_obtain_atp_data")
reported = {}
@@ -32,7 +31,7 @@ def log_if_unhandled_cascading_found(tags):
def main():
-for atp_code in obtain_atp_data.all_spider_codes_iterator():
+for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
continue
filename = config.atp_unpacked_folder() + atp_code + '.geojson'

View file

@@ -2,7 +2,7 @@ import rich
import json
import qa
config = __import__("0_config")
-obtain_atp_data = __import__("2_obtain_atp_data")
+import data_iterator
def log_if_unhandled_closing_found(tags):
for key, value in tags.items():
@@ -35,7 +35,7 @@ def log_if_unhandled_closing_found(tags):
def main():
-for atp_code in obtain_atp_data.all_spider_codes_iterator():
+for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
continue
filename = config.atp_unpacked_folder() + atp_code + '.geojson'

View file

@@ -11,7 +11,6 @@ import generate_html
import json
import opening_hours_parser
graticule_report = __import__("5_generate_graticule_reports")
-obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
import wikidata
import nominatim
@@ -97,7 +96,7 @@ def get_import_listing_configuration_for_atp_spiders():
checked_keys_per_atp = {}
known_unavailable_listings = {'opening_hours': {}, 'website': {}}
-for atp_code in obtain_atp_data.all_spider_codes_iterator():
+for atp_code in data_iterator.all_spider_codes_iterator():
if allowed_spider(atp_code) == False:
continue
checked_keys = []

View file

@@ -1,5 +1,4 @@
import qa
-import random
import shared
import matcher
import rich
@@ -15,10 +14,11 @@ def main():
download_entire_atp_dataset()
find_missing_listing_of_commonly_shared_name_parts()
+import data_iterator
def find_missing_listing_of_commonly_shared_name_parts():
name_parts_by_popularity = {}
-for atp_code in all_spider_codes_iterator():
+for atp_code in data_iterator.all_spider_codes_iterator():
pure_code = remove_country_codes_from_spider_code(atp_code)
if pure_code in ['gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva']:
# lists multiple fuel station brands, resulting in double counting
@@ -61,47 +61,6 @@ def remove_country_codes_from_spider_code(atp_code):
return "_".join(returned_parts)
-def spider_codes_and_filepaths_iterator_including_broken_data_ones():
-"""
-this one is not parsing .geojson files so will be faster
-"""
-directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
-# TODO: there is no full match between spider codes and their filenames
-# see https://github.com/alltheplaces/alltheplaces/issues/9687
-file_list = []
-for item in os.listdir(directory_path_with_unpacked_spider_data):
-item_path = os.path.join(directory_path_with_unpacked_spider_data, item)
-if os.path.isfile(item_path):
-file_list.append(item_path)
-# shuffling as having consistent order does not help
-# and in cases where script reports error results in initial cases
-# being fixed first and subsequent reruns waiting long for
-# new logged problems
-random.shuffle(file_list)
-for item_path in file_list:
-atp_code = item_path.replace(directory_path_with_unpacked_spider_data, "").replace('.geojson', "")
-if atp_code in config.ignored_atp_codes():
-continue
-yield item_path, atp_code
-def all_spider_codes_iterator():
-for _item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
-yield atp_code
-def spider_codes_iterator_with_data():
-for item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
-with open(item_path, 'r') as file:
-content = file.read()
-try:
-features = json.loads(content)["features"]
-yield atp_code, features
-except json.decoder.JSONDecodeError:
-continue
def download_entire_atp_dataset():
FULL_ATP_FOLDER = config.atp_cache_folder()
if os.path.isdir(FULL_ATP_FOLDER) == False:

View file

@@ -9,7 +9,7 @@ import leafleter.generator
import os
import math
import shutil
-obtain_atp_data = __import__("2_obtain_atp_data")
+import data_iterator
process_planet = __import__("4_process_planet_file")
config = __import__("0_config")
@@ -225,7 +225,7 @@ def graticule_data_subpart_atp_data_merge_success_marker_filepath(area):
def prepare_atp_graticule_files(graticule_coverage):
-atp_codes = list(obtain_atp_data.all_spider_codes_iterator())
+atp_codes = list(data_iterator.all_spider_codes_iterator())
for index, atp_code in enumerate(atp_codes):
spider_data_split_success_marker_filepath = graticule_data_specific_spider_subpart_success_marker_filepath(graticule_coverage, atp_code)
if os.path.isfile(spider_data_split_success_marker_filepath):
@@ -269,7 +269,7 @@ def split_specific_spider_across_graticules(graticule_coverage, atp_code):
raise
def merge_atp_graticule_files(area):
-atp_codes = list(obtain_atp_data.all_spider_codes_iterator())
+atp_codes = list(data_iterator.all_spider_codes_iterator())
delete_merged_graticule_files_for_atp_data_if_any(area)
for lat_anchor in range(area['min_lat'], area['max_lat']):
for lon_anchor in range(area['min_lon'], area['max_lon']):
@@ -432,7 +432,7 @@ def generate_report_for_given_graticule_or_return_cache_if_present(area_name, ar
def generate_report_for_given_graticule(area_name, area, lat_anchor, lon_anchor):
generated_filepaths = []
report_generators = {}
-for atp_code in obtain_atp_data.all_spider_codes_iterator():
+for atp_code in data_iterator.all_spider_codes_iterator():
potential_match_results = match_output_for_spider_and_graticule(area, atp_code, lat_anchor, lon_anchor)
if os.path.isfile(potential_match_results):
matched = serializing.load_list_of_matches_from_csv(potential_match_results)

View file

@@ -6,7 +6,7 @@ import json
import os
import rich
import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
-obtain_atp_data = __import__("2_obtain_atp_data")
+import data_iterator
qa = __import__("qa")
config = __import__("0_config")
@@ -47,7 +47,7 @@ def main():
reports['repeated_for_atp_issue_tracker'][key] = ""
reports['repeated_machine_readable_for_config_updates'][key] = []
-for atp_code in obtain_atp_data.all_spider_codes_iterator():
+for atp_code in data_iterator.all_spider_codes_iterator():
reports = process_atp(atp_code, reports)
show_reports(reports)

View file

@@ -3,8 +3,8 @@ import rich
import serializing
import taginfo
import os
+import data_iterator
config = __import__("0_config")
-obtain_atp_data = __import__("2_obtain_atp_data")
graticule_report = __import__("5_generate_graticule_reports")
import data_iterator
@@ -42,7 +42,7 @@ def collect_data():
used_keys = {}
used_tags = {}
area = graticule_report.global_graticule_coverage()
-for atp_code in obtain_atp_data.all_spider_codes_iterator():
+for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
continue
for atp in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):

View file

@@ -2,7 +2,6 @@ import os
import serializing
import show_data
graticule_report = __import__("5_generate_graticule_reports")
-obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
import data_iterator
@@ -200,7 +199,7 @@ def nothing_to_report_marker_filepath(atp_code):
area = graticule_report.global_graticule_coverage()
skipped_as_on_ignore_list_or_empty = []
-for atp_code in obtain_atp_data.all_spider_codes_iterator():
+for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in ignored_atp_codes():
skipped_as_on_ignore_list_or_empty.append(atp_code)
continue

View file

@@ -3,7 +3,6 @@ import os
import serializing
import show_data
graticule_report = __import__("5_generate_graticule_reports")
-obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
import data_iterator
@@ -15,7 +14,7 @@ def main():
print("has name from ATP that seems to mismatch what is in OSM, while brand matches well. Note, this is based on subset of ATP data matched to OSM using brand field! So at least in theory there could be many cases where `name` from ATP matches while `brand` mismatches. But from specific cases you see a lot of what is rather `branch` and similar.")
area = graticule_report.global_graticule_coverage()
-for atp_code in obtain_atp_data.all_spider_codes_iterator():
+for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in [
]:
continue

View file

@@ -1,9 +1,8 @@
import qa
-obtain_atp_data = __import__("2_obtain_atp_data")
+import data_iterator
def main():
-for atp_code, parsed_content in obtain_atp_data.spider_codes_iterator_with_data():
+for atp_code, parsed_content in data_iterator.spider_codes_iterator_with_data():
for entry in parsed_content:
if entry['geometry'] == None:
# no point in bothering with such cases...

View file

@@ -1,6 +1,9 @@
graticule_report = __import__("5_generate_graticule_reports")
import serializing
import os
+import random
+import json
config = __import__("0_config")
def iterate_over_all_matches_for_specific_spider(area, atp_code):
for lat_anchor in range(area['min_lat'], area['max_lat']):
@@ -10,3 +13,42 @@ def iterate_over_all_matches_for_specific_spider(area, atp_code):
match_list = serializing.load_list_of_matches_from_csv(output)
for entry in match_list:
yield entry
+def spider_codes_iterator_with_data():
+for item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
+with open(item_path, 'r') as file:
+content = file.read()
+try:
+features = json.loads(content)["features"]
+yield atp_code, features
+except json.decoder.JSONDecodeError:
+continue
+def all_spider_codes_iterator():
+for _item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
+yield atp_code
+def spider_codes_and_filepaths_iterator_including_broken_data_ones():
+"""
+this one is not parsing .geojson files so will be faster
+"""
+directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
+# TODO: there is no full match between spider codes and their filenames
+# see https://github.com/alltheplaces/alltheplaces/issues/9687
+file_list = []
+for item in os.listdir(directory_path_with_unpacked_spider_data):
+item_path = os.path.join(directory_path_with_unpacked_spider_data, item)
+if os.path.isfile(item_path):
+file_list.append(item_path)
+# shuffling as having consistent order does not help
+# and in cases where script reports error results in initial cases
+# being fixed first and subsequent reruns waiting long for
+# new logged problems
+random.shuffle(file_list)
+for item_path in file_list:
+atp_code = item_path.replace(directory_path_with_unpacked_spider_data, "").replace('.geojson', "")
+if atp_code in config.ignored_atp_codes():
+continue
+yield item_path, atp_code

View file

@@ -6,8 +6,8 @@ import csv
import json
import nominatim
import shared
+import data_iterator
-obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
bookmarks = __import__("5_generate_organic_map_bookmarks")
@@ -21,7 +21,7 @@ def main():
}
for area_name, area in areas.items():
collected = []
-for code, data in obtain_atp_data.spider_codes_iterator_with_data():
+for code, data in data_iterator.spider_codes_iterator_with_data():
print(code)
for entry in data:
if 'opening_hours' in entry['properties']: