
prepare to fetch old ATP datasets

Mateusz Konieczny 2025-03-20 15:38:55 +01:00
parent add8a753f9
commit 6103e6f398
11 changed files with 79 additions and 70 deletions

View file

@@ -552,10 +552,6 @@ def atp_cache_folder():
return cache_folder() + "entire_atp/"
def atp_unpacked_folder():
return atp_cache_folder() + "output/"
def build_storage_folder():
return cache_folder() + "build_temporary_files/"
@@ -6946,7 +6942,10 @@ def show_info_about_spider_to_debug_it(atp_code):
def return_info_about_spider_to_debug_it(atp_code):
returned = get_github_link_to_spider(atp_code)
returned += "\n"
returned += atp_unpacked_folder() + atp_code + '.geojson'
# TODO: how to get data from 2_obtain_atp_data.py without looping dependencies
# maybe just drop line below?
# move atp accessing into separate file? with 2_obtain_atp_data becoming tiny wrapper?
returned += atp_cache_folder() + "????" + atp_code + '.geojson'
return returned
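The TODO above asks how to reach the run-specific ATP folder from here without a circular import between 0_config and 2_obtain_atp_data. A minimal sketch of the "move ATP accessing into a separate file" option, using a hypothetical shared module name (atp_paths.py) that both scripts could import; none of these names exist in the repository yet:

    # atp_paths.py - hypothetical shared helper; 0_config and 2_obtain_atp_data
    # could both import it without depending on each other
    import os

    def atp_run_folder(cache_folder, run_id):
        # each downloaded ATP run gets its own subfolder of the cache
        return cache_folder + run_id + "/"

    def latest_downloaded_run_folder(cache_folder, run_ids_newest_first):
        # return the newest run that is actually present on disk, or None
        for run_id in run_ids_newest_first:
            candidate = atp_run_folder(cache_folder, run_id)
            if os.path.isdir(candidate):
                return candidate
        return None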

View file

@@ -1,7 +1,7 @@
import rich
import json
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
reported = {}
@@ -34,7 +34,7 @@ def main():
for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
continue
filename = config.atp_unpacked_folder() + atp_code + '.geojson'
filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
with open(filename) as file:
try:
atp_data = json.load(file)

View file

@@ -2,6 +2,7 @@ import rich
import json
import qa
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
import data_iterator
def log_if_unhandled_closing_found(tags):
@@ -38,7 +39,7 @@ def main():
for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
continue
filename = config.atp_unpacked_folder() + atp_code + '.geojson'
filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
with open(filename) as file:
try:
atp_data = json.load(file)

View file

@@ -12,6 +12,7 @@ import json
import opening_hours_parser
graticule_report = __import__("5_generate_graticule_reports")
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
import wikidata
import nominatim
import url_checker
@@ -34,7 +35,7 @@ def skipped_osm_cases():
def count_unique_website_links(atp_code):
website_links = set()
source_atp_filename = config.atp_unpacked_folder() + atp_code + '.geojson'
source_atp_filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
with open(source_atp_filename) as file:
try:
atp_data = json.load(file)

View file

@@ -3,35 +3,64 @@ import osm_bot_abstraction_layer.util_download_file
import json
import os
import requests
import simple_cache
config = __import__("0_config")
print(simple_cache.read_cache(config.atp_cache_folder() + 'atp_metadata_website_latest.cache'))
def main():
download_entire_atp_dataset()
download_latest_atp_dataset()
def download_entire_atp_dataset():
FULL_ATP_FOLDER = config.atp_cache_folder()
if os.path.isdir(FULL_ATP_FOLDER) == False:
os.makedirs(FULL_ATP_FOLDER)
if os.path.isdir(config.atp_unpacked_folder()) == False:
response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
run_id = response.json()['run_id']
print(run_id)
download_specific_atp_dataset(run_id)
def caching_time():
day_in_seconds = 60 * 60 * 24
return 1 * day_in_seconds
@simple_cache.cache_it(filename=config.atp_cache_folder() + "atp_metadata_website_history.cache", ttl=caching_time())
def get_atp_history():
response = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10)
return response.json()
@simple_cache.cache_it(filename=config.atp_cache_folder() + "atp_metadata_website_latest.cache", ttl=caching_time())
def get_atp_latest():
response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
return response.json()
def latest_atp_unpacked_folder():
# not using get_atp_latest()['run_id']
# as it would result in situation where new ATP data gets published
# it is not fetched yet but script starts using it
# or data is different and code is mixing new ATP data with old processed data based on older ATP
while True:
for entry in get_atp_history()[::-1]:
run_id = entry['run_id']
candidate = atp_unpacked_folder(run_id)
if os.path.isdir(candidate) == True:
return candidate
def atp_unpacked_folder(run_id):
return config.atp_cache_folder() + run_id + "/"
def download_latest_atp_dataset():
response = get_atp_latest()
run_id = response['run_id']
download_specific_atp_run(run_id)
def download_specific_atp_run(run_id):
folder_path = atp_unpacked_folder(run_id)
success_marker = folder_path + "atp_download_completed.success"
if os.path.isfile(success_marker) == False:
if os.path.isdir(folder_path) == True:
raise Exception(folder_path + " is in inconsistent state")
if os.path.isdir(folder_path) == False:
os.makedirs(folder_path)
def download_specific_atp_dataset(run_id):
download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
filename = "entire_atp.zip"
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, folder_path, filename)
os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')
def download(code, run_id):
directory_path = config.cache_folder()
download_url = 'https://alltheplaces-data.openaddresses.io/runs/' + run_id + '/output/' + code + '.geojson'
filename = code + ".atp.geojson"
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, directory_path, filename)
if __name__ == "__main__":
main()
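download_specific_atp_run guards each per-run folder with a success marker so an interrupted download can be told apart from a completed one. A minimal sketch of how that marker protocol could look once the download step is wired in; the helper names here are hypothetical, not the real module's API:

    import os

    def ensure_run_downloaded(folder_path, fetch_and_unpack):
        # the marker file is created only after fetch_and_unpack finishes,
        # so a folder that exists without it is an interrupted download
        success_marker = os.path.join(folder_path, "atp_download_completed.success")
        if os.path.isfile(success_marker):
            return folder_path  # already downloaded and unpacked
        if os.path.isdir(folder_path):
            raise Exception(folder_path + " is in inconsistent state")
        os.makedirs(folder_path)
        fetch_and_unpack(folder_path)  # caller-supplied download + unzip step
        open(success_marker, "w").close()
        return folder_path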

View file

@@ -9,6 +9,7 @@ import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
import data_iterator
qa = __import__("qa")
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
# TODO_ATP
# list closed objects detected by 20_detect_unhandled_closed_poi.py
@@ -77,7 +78,7 @@ def show_reports(reports):
def process_atp(atp_code, reports):
filename = config.atp_unpacked_folder() + atp_code + '.geojson'
filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
with open(filename) as file:
try:
data = json.load(file)

View file

@@ -4,6 +4,7 @@ import os
import random
import json
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
def iterate_over_all_matches_for_specific_spider(area, atp_code):
for lat_anchor in range(area['min_lat'], area['max_lat']):
@@ -34,7 +35,7 @@ def spider_codes_and_filepaths_iterator_including_broken_data_ones():
"""
this one is not parsing .geojson files so will be faster
"""
directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
directory_path_with_unpacked_spider_data = obtain_atp_data.latest_atp_unpacked_folder()
# TODO: there is no full match between spider codes and their filenames
# see https://github.com/alltheplaces/alltheplaces/issues/9687
file_list = []

View file

@@ -1,37 +0,0 @@
import rich
import osm_bot_abstraction_layer.util_download_file
import os
import requests
config = __import__("0_config")
def main():
# TODO: cache that
response = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10)
rich.print(response.json())
for entry in response.json()[::-1]:
print(entry['run_id'])
# https://data.alltheplaces.xyz/runs/history.json
# https://alltheplaces-data.openaddresses.io/runs/2025-01-11-13-32-30/output/zabka_pl.geojson
# zabka is unstable - count past entries
# {"type":"FeatureCollection","dataset_attributes":{"@spider":"zabka_pl","spider:collection_time":"2025-01-13T01:49:37.648114","spider:robots_txt":"ignored"}
# filter to ones in Kraków
#osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
# TODO: copied existing code, modify it
def download_entire_atp_dataset():
FULL_ATP_FOLDER = config.atp_cache_folder()
if os.path.isdir(FULL_ATP_FOLDER) == False:
os.makedirs(FULL_ATP_FOLDER)
if os.path.isdir(config.atp_unpacked_folder()) == False:
response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
run_id = response.json()['run_id']
print(run_id)
download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
filename = "entire_atp.zip"
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')
if __name__ == "__main__":
main()

View file

@@ -9,7 +9,7 @@ import datetime
import qa
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
def filter_osm_data_with_dict(current_osm, osm_data_tag_filter):
returned = []
@@ -229,7 +229,7 @@ def load_and_clean_atp(atp_code):
def open_atp_file(atp_code):
filename = config.atp_unpacked_folder() + atp_code + '.geojson'
filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
if os.path.isfile(filename) == False:
print("there is no such file as", filename, "for spider", atp_code)
return []

View file

@@ -10,3 +10,4 @@ matplotlib
python-dotenv
libtorrent
regex
simple_cache

View file

@@ -0,0 +1,13 @@
import rich
import osm_bot_abstraction_layer.util_download_file
import os
config = __import__("0_config")
# https://data.alltheplaces.xyz/runs/history.json
# https://alltheplaces-data.openaddresses.io/runs/2025-01-11-13-32-30/output/zabka_pl.geojson
# zabka is unstable - count past entries
# {"type":"FeatureCollection","dataset_attributes":{"@spider":"zabka_pl","spider:collection_time":"2025-01-13T01:49:37.648114","spider:robots_txt":"ignored"}
# filter to ones in Kraków
#osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
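The comments above outline the plan for old datasets: walk history.json, fetch one spider's .geojson from each past run, and read its dataset_attributes (for example spider:collection_time for zabka_pl). A minimal sketch of that per-run fetch, assuming the URLs from the comments stay valid; the Kraków filtering is left out and the function names are placeholders:

    import requests

    def fetch_spider_from_run(run_id, spider_code):
        # one spider's output from a specific historical ATP run
        url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output/" + spider_code + ".geojson"
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # the spider may be absent from some runs
        return response.json()

    def collection_times(spider_code):
        # iterate past runs from history.json (reversed, as in the code above)
        # and record when each run scraped the given spider
        history = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10).json()
        times = []
        for entry in history[::-1]:
            data = fetch_spider_from_run(entry['run_id'], spider_code)
            times.append(data['dataset_attributes'].get('spider:collection_time'))
        return times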