Mirror of https://codeberg.org/matkoniecz/list_how_openstreetmap_can_be_improved_with_alltheplaces_data.git, synced 2025-04-11 01:59:30 +02:00
move iterator to iterators

parent e7feeb0d02
commit a3fbe9f1d3

14 changed files with 64 additions and 70 deletions
12_generate_organic_map_bookmarks.py
17_list_mismatching_brand_wikidata.py
19_detect_unhandled_cascading_values_for_canonical_poi_types.py
20_detect_unhandled_closed_poi.py
21_list_import_status.py
2_obtain_atp_data.py
5_generate_graticule_reports.py
81_generate_atp_issue_tracker_report.py
82_list_unusual_tags_present_in_atp.py
83_generate_atp_issue_reports_about_poorly_matched_entries.py
84_generate_atp_issue_reports_about_bad_names.py
89_run_remove_bad_data_across_atp_to_trigger_log_output.py
data_iterator.py
dump_all_atp_in_area.py
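The commit moves the spider iterators out of 2_obtain_atp_data.py into data_iterator.py and updates every call site accordingly. A minimal sketch of the call-site change repeated across the scripts below (run from the repository checkout so the sibling modules are importable):

# before this commit: the iterators lived in the download script
obtain_atp_data = __import__("2_obtain_atp_data")
for atp_code in obtain_atp_data.all_spider_codes_iterator():
    print(atp_code)

# after this commit: the iterators live in a dedicated module
import data_iterator
for atp_code in data_iterator.all_spider_codes_iterator():
    print(atp_code)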
@@ -5,7 +5,6 @@ import json
import nominatim
import shared
import data_iterator
obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
graticule_report = __import__("5_generate_graticule_reports")

@@ -44,7 +43,7 @@ def main():
missing_object_per_area[name] = []
missing_opening_hours_per_area[name] = []

for atp_code in obtain_atp_data.all_spider_codes_iterator():
for atp_code in data_iterator.all_spider_codes_iterator():
for atp in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
if atp.match_distance != None and atp.match_distance < config.missing_shop_distance_in_kilometers_for_specific_case(atp.atp_tags):
# not considered as clearly missing
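Several of these scripts load their sibling modules with __import__(...) rather than a plain import statement: file names such as 2_obtain_atp_data.py start with a digit, which is not a valid Python identifier, so the import statement cannot name them. A minimal sketch of the pattern, using module names taken from this repository:

# "import 2_obtain_atp_data" would be a SyntaxError because the name starts with a digit;
# __import__ takes the module name as a string and returns the module object instead
obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")

# modules whose names are valid identifiers are imported normally
import data_iterator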
@@ -4,7 +4,6 @@ import os
import serializing
import show_data
graticule_report = __import__("5_generate_graticule_reports")
obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
import shared
import wikidata

@@ -126,7 +125,7 @@ def main():
matching_via_parentage = 0
no_longer_matching_at_all_according_to_matcher = 0
wikidata_mismatch_brand_match_by_claimed_wikidata_id_in_atp = defaultdict(list)
for atp_code in obtain_atp_data.all_spider_codes_iterator(): # note, data is fetched from matches which may be based on a different version of ATP, with different spiders being broken or some missing/not yet present. But looking at files directly seems to be an overkill for such diagnosis tool.
for atp_code in data_iterator.all_spider_codes_iterator(): # note, data is fetched from matches which may be based on a different version of ATP, with different spiders being broken or some missing/not yet present. But looking at files directly seems to be an overkill for such diagnosis tool.
for entry in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
if entry.match_distance != None:
if entry.match_distance < config.missing_shop_distance_in_kilometers_for_specific_case(entry.atp_tags):
@@ -1,7 +1,6 @@
import rich
import json
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")

reported = {}

@@ -32,7 +31,7 @@ def log_if_unhandled_cascading_found(tags):

def main():
for atp_code in obtain_atp_data.all_spider_codes_iterator():
for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
continue
filename = config.atp_unpacked_folder() + atp_code + '.geojson'
@@ -2,7 +2,7 @@ import rich
import json
import qa
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
import data_iterator

def log_if_unhandled_closing_found(tags):
for key, value in tags.items():

@@ -35,7 +35,7 @@ def log_if_unhandled_closing_found(tags):

def main():
for atp_code in obtain_atp_data.all_spider_codes_iterator():
for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
continue
filename = config.atp_unpacked_folder() + atp_code + '.geojson'
@@ -11,7 +11,6 @@ import generate_html
import json
import opening_hours_parser
graticule_report = __import__("5_generate_graticule_reports")
obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
import wikidata
import nominatim

@@ -97,7 +96,7 @@ def get_import_listing_configuration_for_atp_spiders():
checked_keys_per_atp = {}
known_unavailable_listings = {'opening_hours': {}, 'website': {}}

for atp_code in obtain_atp_data.all_spider_codes_iterator():
for atp_code in data_iterator.all_spider_codes_iterator():
if allowed_spider(atp_code) == False:
continue
checked_keys = []
@@ -1,5 +1,4 @@
import qa
import random
import shared
import matcher
import rich

@@ -15,10 +14,11 @@ def main():
download_entire_atp_dataset()
find_missing_listing_of_commonly_shared_name_parts()

import data_iterator

def find_missing_listing_of_commonly_shared_name_parts():
name_parts_by_popularity = {}
for atp_code in all_spider_codes_iterator():
for atp_code in data_iterator.all_spider_codes_iterator():
pure_code = remove_country_codes_from_spider_code(atp_code)
if pure_code in ['gov_cma_fuel', 'gov_osservaprezzi_carburanti', 'gov_goriva']:
# lists multiple fuel station brands, resulting in double counting

@@ -61,47 +61,6 @@ def remove_country_codes_from_spider_code(atp_code):
return "_".join(returned_parts)


def spider_codes_and_filepaths_iterator_including_broken_data_ones():
"""
this one is not parsing .geojson files so will be faster
"""
directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
# TODO: there is no full match between spider codes and their filenames
# see https://github.com/alltheplaces/alltheplaces/issues/9687
file_list = []
for item in os.listdir(directory_path_with_unpacked_spider_data):
item_path = os.path.join(directory_path_with_unpacked_spider_data, item)
if os.path.isfile(item_path):
file_list.append(item_path)
# shuffling as having consistent order does not help
# and in cases where script reports error results in initial cases
# being fixed first and subsequent reruns waiting long for
# new logged problems
random.shuffle(file_list)
for item_path in file_list:
atp_code = item_path.replace(directory_path_with_unpacked_spider_data, "").replace('.geojson', "")
if atp_code in config.ignored_atp_codes():
continue
yield item_path, atp_code


def all_spider_codes_iterator():
for _item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
yield atp_code


def spider_codes_iterator_with_data():
for item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
with open(item_path, 'r') as file:
content = file.read()
try:
features = json.loads(content)["features"]
yield atp_code, features
except json.decoder.JSONDecodeError:
continue


def download_entire_atp_dataset():
FULL_ATP_FOLDER = config.atp_cache_folder()
if os.path.isdir(FULL_ATP_FOLDER) == False:
@@ -9,7 +9,7 @@ import leafleter.generator
import os
import math
import shutil
obtain_atp_data = __import__("2_obtain_atp_data")
import data_iterator
process_planet = __import__("4_process_planet_file")
config = __import__("0_config")

@@ -225,7 +225,7 @@ def graticule_data_subpart_atp_data_merge_success_marker_filepath(area):

def prepare_atp_graticule_files(graticule_coverage):
atp_codes = list(obtain_atp_data.all_spider_codes_iterator())
atp_codes = list(data_iterator.all_spider_codes_iterator())
for index, atp_code in enumerate(atp_codes):
spider_data_split_success_marker_filepath = graticule_data_specific_spider_subpart_success_marker_filepath(graticule_coverage, atp_code)
if os.path.isfile(spider_data_split_success_marker_filepath):

@@ -269,7 +269,7 @@ def split_specific_spider_across_graticules(graticule_coverage, atp_code):
raise

def merge_atp_graticule_files(area):
atp_codes = list(obtain_atp_data.all_spider_codes_iterator())
atp_codes = list(data_iterator.all_spider_codes_iterator())
delete_merged_graticule_files_for_atp_data_if_any(area)
for lat_anchor in range(area['min_lat'], area['max_lat']):
for lon_anchor in range(area['min_lon'], area['max_lon']):

@@ -432,7 +432,7 @@ def generate_report_for_given_graticule_or_return_cache_if_present(area_name, ar
def generate_report_for_given_graticule(area_name, area, lat_anchor, lon_anchor):
generated_filepaths = []
report_generators = {}
for atp_code in obtain_atp_data.all_spider_codes_iterator():
for atp_code in data_iterator.all_spider_codes_iterator():
potential_match_results = match_output_for_spider_and_graticule(area, atp_code, lat_anchor, lon_anchor)
if os.path.isfile(potential_match_results):
matched = serializing.load_list_of_matches_from_csv(potential_match_results)
@@ -6,7 +6,7 @@ import json
import os
import rich
import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
obtain_atp_data = __import__("2_obtain_atp_data")
import data_iterator
qa = __import__("qa")
config = __import__("0_config")

@@ -47,7 +47,7 @@ def main():
reports['repeated_for_atp_issue_tracker'][key] = ""
reports['repeated_machine_readable_for_config_updates'][key] = []

for atp_code in obtain_atp_data.all_spider_codes_iterator():
for atp_code in data_iterator.all_spider_codes_iterator():
reports = process_atp(atp_code, reports)
show_reports(reports)
@@ -3,8 +3,8 @@ import rich
import serializing
import taginfo
import os
import data_iterator
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
graticule_report = __import__("5_generate_graticule_reports")
import data_iterator

@@ -42,7 +42,7 @@ def collect_data():
used_keys = {}
used_tags = {}
area = graticule_report.global_graticule_coverage()
for atp_code in obtain_atp_data.all_spider_codes_iterator():
for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
continue
for atp in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
@@ -2,7 +2,6 @@ import os
import serializing
import show_data
graticule_report = __import__("5_generate_graticule_reports")
obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
import data_iterator

@@ -200,7 +199,7 @@ def nothing_to_report_marker_filepath(atp_code):

area = graticule_report.global_graticule_coverage()
skipped_as_on_ignore_list_or_empty = []
for atp_code in obtain_atp_data.all_spider_codes_iterator():
for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in ignored_atp_codes():
skipped_as_on_ignore_list_or_empty.append(atp_code)
continue
@@ -3,7 +3,6 @@ import os
import serializing
import show_data
graticule_report = __import__("5_generate_graticule_reports")
obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
import data_iterator

@@ -15,7 +14,7 @@ def main():

print("has name from ATP that seems to mismatch what is in OSM, while brand matches well. Note, this is based on subset of ATP data matched to OSM using brand field! So at least in theory there could be many cases where `name` from ATP matches while `brand` mismatches. But from specific cases you see a lot of what is rather `branch` and similar.")
area = graticule_report.global_graticule_coverage()
for atp_code in obtain_atp_data.all_spider_codes_iterator():
for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in [
]:
continue
@@ -1,9 +1,8 @@
import qa
obtain_atp_data = __import__("2_obtain_atp_data")

import data_iterator

def main():
for atp_code, parsed_content in obtain_atp_data.spider_codes_iterator_with_data():
for atp_code, parsed_content in data_iterator.spider_codes_iterator_with_data():
for entry in parsed_content:
if entry['geometry'] == None:
# no point in bothering with such cases...
@@ -1,6 +1,9 @@
graticule_report = __import__("5_generate_graticule_reports")
import serializing
import os
import random
import json
config = __import__("0_config")

def iterate_over_all_matches_for_specific_spider(area, atp_code):
for lat_anchor in range(area['min_lat'], area['max_lat']):

@@ -10,3 +13,42 @@ def iterate_over_all_matches_for_specific_spider(area, atp_code):
match_list = serializing.load_list_of_matches_from_csv(output)
for entry in match_list:
yield entry

def spider_codes_iterator_with_data():
for item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
with open(item_path, 'r') as file:
content = file.read()
try:
features = json.loads(content)["features"]
yield atp_code, features
except json.decoder.JSONDecodeError:
continue


def all_spider_codes_iterator():
for _item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
yield atp_code


def spider_codes_and_filepaths_iterator_including_broken_data_ones():
"""
this one is not parsing .geojson files so will be faster
"""
directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
# TODO: there is no full match between spider codes and their filenames
# see https://github.com/alltheplaces/alltheplaces/issues/9687
file_list = []
for item in os.listdir(directory_path_with_unpacked_spider_data):
item_path = os.path.join(directory_path_with_unpacked_spider_data, item)
if os.path.isfile(item_path):
file_list.append(item_path)
# shuffling as having consistent order does not help
# and in cases where script reports error results in initial cases
# being fixed first and subsequent reruns waiting long for
# new logged problems
random.shuffle(file_list)
for item_path in file_list:
atp_code = item_path.replace(directory_path_with_unpacked_spider_data, "").replace('.geojson', "")
if atp_code in config.ignored_atp_codes():
continue
yield item_path, atp_code
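With the iterators collected in data_iterator.py as above, a calling script only needs that one module. A minimal usage sketch, assuming matches have already been generated and using a placeholder spider code ("some_spider_code"); the function names and the match_distance / atp_tags fields follow what the other hunks in this commit show:

import data_iterator
graticule_report = __import__("5_generate_graticule_reports")

# walk every spider known from the unpacked ATP data
for atp_code in data_iterator.all_spider_codes_iterator():
    print(atp_code)

# walk the matches of one spider across the whole covered area
area = graticule_report.global_graticule_coverage()
for entry in data_iterator.iterate_over_all_matches_for_specific_spider(area, "some_spider_code"):
    if entry.match_distance is not None:
        print(entry.match_distance, entry.atp_tags)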
@@ -6,8 +6,8 @@ import csv
import json
import nominatim
import shared
import data_iterator

obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
bookmarks = __import__("5_generate_organic_map_bookmarks")

@@ -21,7 +21,7 @@ def main():
}
for area_name, area in areas.items():
collected = []
for code, data in obtain_atp_data.spider_codes_iterator_with_data():
for code, data in data_iterator.spider_codes_iterator_with_data():
print(code)
for entry in data:
if 'opening_hours' in entry['properties']: