diff --git a/0_config.py b/0_config.py
index 251a106..377e0fd 100644
--- a/0_config.py
+++ b/0_config.py
@@ -552,10 +552,6 @@ def atp_cache_folder():
     return cache_folder() + "entire_atp/"
 
 
-def atp_unpacked_folder():
-    return atp_cache_folder() + "output/"
-
-
 def build_storage_folder():
     return cache_folder() + "build_temporary_files/"
 
@@ -6946,7 +6942,10 @@ def show_info_about_spider_to_debug_it(atp_code):
 
 def return_info_about_spider_to_debug_it(atp_code):
     returned = get_github_link_to_spider(atp_code)
     returned += "\n"
-    returned += atp_unpacked_folder() + atp_code + '.geojson'
+    # TODO: how to get data from 2_obtain_atp_data.py without creating circular imports?
+    # maybe just drop the line below?
+    # or move ATP data access into a separate file, with 2_obtain_atp_data becoming a tiny wrapper?
+    returned += atp_cache_folder() + "????" + atp_code + '.geojson'
     return returned
 
diff --git a/19_detect_unhandled_cascading_values_for_canonical_poi_types.py b/19_detect_unhandled_cascading_values_for_canonical_poi_types.py
index 6a916ac..6ddfee4 100644
--- a/19_detect_unhandled_cascading_values_for_canonical_poi_types.py
+++ b/19_detect_unhandled_cascading_values_for_canonical_poi_types.py
@@ -1,7 +1,7 @@
 import rich
 import json
 config = __import__("0_config")
-
+obtain_atp_data = __import__("2_obtain_atp_data")
 
 reported = {}
 
@@ -34,7 +34,7 @@ def main():
     for atp_code in data_iterator.all_spider_codes_iterator():
         if atp_code in config.ignored_atp_codes():
             continue
-        filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+        filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
         with open(filename) as file:
             try:
                 atp_data = json.load(file)
diff --git a/20_detect_unhandled_closed_poi.py b/20_detect_unhandled_closed_poi.py
index 6241dd5..cfe8443 100644
--- a/20_detect_unhandled_closed_poi.py
+++ b/20_detect_unhandled_closed_poi.py
@@ -2,6 +2,7 @@ import rich
 import json
 import qa
 config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")
 import data_iterator
 
 def log_if_unhandled_closing_found(tags):
@@ -38,7 +39,7 @@ def main():
     for atp_code in data_iterator.all_spider_codes_iterator():
         if atp_code in config.ignored_atp_codes():
             continue
-        filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+        filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
         with open(filename) as file:
             try:
                 atp_data = json.load(file)
diff --git a/21_list_import_status.py b/21_list_import_status.py
index db5ec71..d1c7f97 100644
--- a/21_list_import_status.py
+++ b/21_list_import_status.py
@@ -12,6 +12,7 @@ import json
 import opening_hours_parser
 graticule_report = __import__("5_generate_graticule_reports")
 config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")
 import wikidata
 import nominatim
 import url_checker
@@ -34,7 +35,7 @@ def skipped_osm_cases():
 
 def count_unique_website_links(atp_code):
     website_links = set()
-    source_atp_filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+    source_atp_filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
     with open(source_atp_filename) as file:
         try:
             atp_data = json.load(file)
diff --git a/2_obtain_atp_data.py b/2_obtain_atp_data.py
index 8211d9b..8ccd6d8 100644
--- a/2_obtain_atp_data.py
+++ b/2_obtain_atp_data.py
@@ -3,35 +3,64 @@ import osm_bot_abstraction_layer.util_download_file
 import json
 import os
 import requests
+import simple_cache
+
 config = __import__("0_config")
+print(simple_cache.read_cache(config.atp_cache_folder() + 'atp_metadata_website_latest.cache'))
 
 def main():
-    download_entire_atp_dataset()
+    download_latest_atp_dataset()
 
-def download_entire_atp_dataset():
-    FULL_ATP_FOLDER = config.atp_cache_folder()
-    if os.path.isdir(FULL_ATP_FOLDER) == False:
-        os.makedirs(FULL_ATP_FOLDER)
-    if os.path.isdir(config.atp_unpacked_folder()) == False:
-        response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
-        run_id = response.json()['run_id']
-        print(run_id)
-        download_specific_atp_dataset(run_id)
+def caching_time():
+    day_in_seconds = 60 * 60 * 24
+    return 1 * day_in_seconds
+
+@simple_cache.cache_it(filename=config.atp_cache_folder() + "atp_metadata_website_history.cache", ttl=caching_time())
+def get_atp_history():
+    response = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10)
+    return response.json()
+
+@simple_cache.cache_it(filename=config.atp_cache_folder() + "atp_metadata_website_latest.cache", ttl=caching_time())
+def get_atp_latest():
+    response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
+    return response.json()
+
+def latest_atp_unpacked_folder():
+    # not using get_atp_latest()['run_id'] here, as it may point at a run that
+    # was just published but is not downloaded yet, and that would mix new ATP data
+    # with processed data based on an older run
+    # so use the newest run that is actually present on disk
+    for entry in get_atp_history()[::-1]:
+        run_id = entry['run_id']
+        candidate = atp_unpacked_folder(run_id)
+        if os.path.isdir(candidate) == True:
+            return candidate
+    raise Exception("no downloaded ATP run found, run 2_obtain_atp_data.py first")
+
+def atp_unpacked_folder(run_id):
+    return config.atp_cache_folder() + run_id + "/"
+
+def download_latest_atp_dataset():
+    response = get_atp_latest()
+    run_id = response['run_id']
+    download_specific_atp_run(run_id)
+
+def download_specific_atp_run(run_id):
+    folder_path = atp_unpacked_folder(run_id)
+    success_marker = folder_path + "atp_download_completed.success"
+
+    if os.path.isfile(success_marker) == False:
+        if os.path.isdir(folder_path) == True:
+            raise Exception(folder_path + " is in inconsistent state")
+
+    if os.path.isfile(success_marker) == True:
+        # this run was already downloaded and unpacked
+        return
+
+    if os.path.isdir(folder_path) == False:
+        os.makedirs(folder_path)
 
-
-def download_specific_atp_dataset(run_id):
     download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
     filename = "entire_atp.zip"
-    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
-    os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')
+    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, folder_path, filename)
+    os.system('unzip "' + folder_path + filename + '" -d "' + folder_path + '"')
+    # mark the run as fully downloaded and unpacked, so interrupted downloads can be detected
+    with open(success_marker, "w") as marker_file:
+        marker_file.write("")
 
-
-def download(code, run_id):
-    directory_path = config.cache_folder()
-    download_url = 'https://alltheplaces-data.openaddresses.io/runs/' + run_id + '/output/' + code + '.geojson'
-    filename = code + ".atp.geojson"
-    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, directory_path, filename)
-
-
 if __name__ == "__main__":
     main()
diff --git a/81_generate_atp_issue_tracker_report.py b/81_generate_atp_issue_tracker_report.py
index 565f936..bfb09c0 100644
--- a/81_generate_atp_issue_tracker_report.py
+++ b/81_generate_atp_issue_tracker_report.py
@@ -9,6 +9,7 @@ import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
 import data_iterator
 qa = __import__("qa")
 config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")
 
 # TODO_ATP
 # list closed objects detected by 20_detect_unhandled_closed_poi.py
@@ -77,7 +78,7 @@ def show_reports(reports):
 
 
 def process_atp(atp_code, reports):
-    filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+    filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
     with open(filename) as file:
         try:
             data = json.load(file)
diff --git a/data_iterator.py b/data_iterator.py
index 7196ac2..7c72746 100644
--- a/data_iterator.py
+++ b/data_iterator.py
@@ -4,6 +4,7 @@ import os
 import random
 import json
 config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")
 
 def iterate_over_all_matches_for_specific_spider(area, atp_code):
     for lat_anchor in range(area['min_lat'], area['max_lat']):
@@ -34,7 +35,7 @@ def spider_codes_and_filepaths_iterator_including_broken_data_ones():
     """
     this one is not parsing .geojson files so will be faster
     """
-    directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
+    directory_path_with_unpacked_spider_data = obtain_atp_data.latest_atp_unpacked_folder()
     # TODO: there is no full match between spider codes and their filenames
     # see https://github.com/alltheplaces/alltheplaces/issues/9687
     file_list = []
diff --git a/downloader_older_atp_data.py b/downloader_older_atp_data.py
deleted file mode 100644
index 4821d49..0000000
--- a/downloader_older_atp_data.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import rich
-import osm_bot_abstraction_layer.util_download_file
-import os
-import requests
-config = __import__("0_config")
-
-def main():
-    # TODO: cache that
-    response = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10)
-    rich.print(response.json())
-    for entry in response.json()[::-1]:
-        print(entry['run_id'])
-
-# https://data.alltheplaces.xyz/runs/history.json
-# https://alltheplaces-data.openaddresses.io/runs/2025-01-11-13-32-30/output/zabka_pl.geojson
-# zabka is unstable - count past entries
-# {"type":"FeatureCollection","dataset_attributes":{"@spider":"zabka_pl","spider:collection_time":"2025-01-13T01:49:37.648114","spider:robots_txt":"ignored"}
-# filter to ones in Kraków
-
-#osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
-
-# TODO: copied existing code, modify it
-def download_entire_atp_dataset():
-    FULL_ATP_FOLDER = config.atp_cache_folder()
-    if os.path.isdir(FULL_ATP_FOLDER) == False:
-        os.makedirs(FULL_ATP_FOLDER)
-    if os.path.isdir(config.atp_unpacked_folder()) == False:
-        response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
-        run_id = response.json()['run_id']
-        print(run_id)
-        download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
-        filename = "entire_atp.zip"
-        osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
-        os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/matcher.py b/matcher.py
index 55b77c4..d572527 100644
--- a/matcher.py
+++ b/matcher.py
@@ -9,7 +9,7 @@ import datetime
 import qa
 config = __import__("0_config")
-
+obtain_atp_data = __import__("2_obtain_atp_data")
 
 
 def filter_osm_data_with_dict(current_osm, osm_data_tag_filter):
     returned = []
@@ -229,7 +229,7 @@ def load_and_clean_atp(atp_code):
 
 
 def open_atp_file(atp_code):
-    filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+    filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
     if os.path.isfile(filename) == False:
         print("there is no such file as", filename, "for spider", atp_code)
         return []
diff --git a/requirements.txt b/requirements.txt
index 46e056d..a51443c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ matplotlib
 python-dotenv
 libtorrent
 regex
+simple_cache
diff --git a/view_data_across_atp_datasets.py b/view_data_across_atp_datasets.py
new file mode 100644
index 0000000..329b364
--- /dev/null
+++ b/view_data_across_atp_datasets.py
@@ -0,0 +1,13 @@
+import rich
+import osm_bot_abstraction_layer.util_download_file
+import os
+config = __import__("0_config")
+
+# https://data.alltheplaces.xyz/runs/history.json
+# https://alltheplaces-data.openaddresses.io/runs/2025-01-11-13-32-30/output/zabka_pl.geojson
+# zabka is unstable - count past entries
+# {"type":"FeatureCollection","dataset_attributes":{"@spider":"zabka_pl","spider:collection_time":"2025-01-13T01:49:37.648114","spider:robots_txt":"ignored"}
+# filter to ones in Kraków
+
+#osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
+
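Below is a rough sketch, not part of the diff above, of how the plan in view_data_across_atp_datasets.py might be fleshed out: it reuses get_atp_history() from 2_obtain_atp_data.py and the per-run geojson URL pattern that appeared in the deleted download() helper to fetch zabka_pl.geojson from each published run and count its entries inside Kraków. The download_spider_output_from_run and count_entries_in_krakow helpers and the bounding box values are illustrative assumptions, not existing code.

import json
import osm_bot_abstraction_layer.util_download_file
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")

def download_spider_output_from_run(atp_code, run_id):
    # same per-run URL pattern as in the removed download() helper
    download_url = 'https://alltheplaces-data.openaddresses.io/runs/' + run_id + '/output/' + atp_code + '.geojson'
    filename = run_id + "_" + atp_code + ".atp.geojson"
    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
    return config.atp_cache_folder() + filename

def count_entries_in_krakow(filepath):
    # hypothetical helper: counts features inside a rough Kraków bounding box
    with open(filepath) as file:
        data = json.load(file)
    count = 0
    for entry in data['features']:
        if entry.get('geometry') is None:
            continue
        lon, lat = entry['geometry']['coordinates']
        if 19.78 < lon < 20.23 and 49.96 < lat < 50.15:
            count += 1
    return count

def main():
    # zabka_pl is unstable across runs, so compare how many Kraków entries each run has
    for entry in obtain_atp_data.get_atp_history():
        run_id = entry['run_id']
        filepath = download_spider_output_from_run('zabka_pl', run_id)
        print(run_id, count_entries_in_krakow(filepath))

if __name__ == "__main__":
    main()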