From 6103e6f398068e2b48bbe2f128989b134b60bd5f Mon Sep 17 00:00:00 2001
From: Mateusz Konieczny <matkoniecz@gmail.com>
Date: Thu, 20 Mar 2025 15:38:55 +0100
Subject: [PATCH] prepare to fetch old ATP datasets
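
Store each ATP run in its own cache subfolder named after its run_id,
cache the runs/latest.json and runs/history.json metadata fetches via
simple_cache, and pick the newest fully downloaded run through
latest_atp_unpacked_folder() instead of relying on a single shared
output/ folder.

Resulting cache layout, as a sketch (the run_id is an example value):

    <atp_cache_folder>/
        atp_metadata_website_latest.cache
        atp_metadata_website_history.cache
        2025-03-20-00-00-00/
            entire_atp.zip
            output/<spider>.geojson
            atp_download_completed.success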

---
 0_config.py                                   |  9 ++-
 ...ascading_values_for_canonical_poi_types.py |  4 +-
 20_detect_unhandled_closed_poi.py             |  3 +-
 21_list_import_status.py                      |  3 +-
 2_obtain_atp_data.py                          | 79 ++++++++++++++-----
 81_generate_atp_issue_tracker_report.py       |  3 +-
 data_iterator.py                              |  3 +-
 downloader_older_atp_data.py                  | 37 ----------
 matcher.py                                    |  4 +-
 requirements.txt                              |  1 +
 view_data_across_atp_datasets.py              | 50 ++++++++++++
 11 files changed, 125 insertions(+), 71 deletions(-)
 delete mode 100644 downloader_older_atp_data.py
 create mode 100644 view_data_across_atp_datasets.py

diff --git a/0_config.py b/0_config.py
index 251a106..377e0fd 100644
--- a/0_config.py
+++ b/0_config.py
@@ -552,10 +552,6 @@ def atp_cache_folder():
     return cache_folder() + "entire_atp/"
 
 
-def atp_unpacked_folder():
-    return atp_cache_folder() + "output/"
-
-
 def build_storage_folder():
     return cache_folder() + "build_temporary_files/"
 
@@ -6946,7 +6942,10 @@ def show_info_about_spider_to_debug_it(atp_code):
 def return_info_about_spider_to_debug_it(atp_code):
     returned = get_github_link_to_spider(atp_code)
     returned += "\n"
-    returned += atp_unpacked_folder() + atp_code + '.geojson'
+    # TODO: how to get this path from 2_obtain_atp_data.py without a circular dependency?
+    # maybe just drop the line below?
+    # or move ATP data access into a separate file, with 2_obtain_atp_data becoming a tiny wrapper?
+    returned += atp_cache_folder() + "????" + atp_code + '.geojson'
     return returned
 
 
diff --git a/19_detect_unhandled_cascading_values_for_canonical_poi_types.py b/19_detect_unhandled_cascading_values_for_canonical_poi_types.py
index 6a916ac..6ddfee4 100644
--- a/19_detect_unhandled_cascading_values_for_canonical_poi_types.py
+++ b/19_detect_unhandled_cascading_values_for_canonical_poi_types.py
@@ -1,7 +1,7 @@
 import rich
 import json
 config = __import__("0_config")
-
+obtain_atp_data = __import__("2_obtain_atp_data")
 reported = {}
 
 
@@ -34,7 +34,7 @@ def main():
     for atp_code in data_iterator.all_spider_codes_iterator():
         if atp_code in config.ignored_atp_codes():
             continue
-        filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+        filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
         with open(filename) as file:
             try:
                 atp_data = json.load(file)
diff --git a/20_detect_unhandled_closed_poi.py b/20_detect_unhandled_closed_poi.py
index 6241dd5..cfe8443 100644
--- a/20_detect_unhandled_closed_poi.py
+++ b/20_detect_unhandled_closed_poi.py
@@ -2,6 +2,7 @@ import rich
 import json
 import qa
 config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")
 import data_iterator
 
 def log_if_unhandled_closing_found(tags):
@@ -38,7 +39,7 @@ def main():
     for atp_code in data_iterator.all_spider_codes_iterator():
         if atp_code in config.ignored_atp_codes():
             continue
-        filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+        filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
         with open(filename) as file:
             try:
                 atp_data = json.load(file)
diff --git a/21_list_import_status.py b/21_list_import_status.py
index db5ec71..d1c7f97 100644
--- a/21_list_import_status.py
+++ b/21_list_import_status.py
@@ -12,6 +12,7 @@ import json
 import opening_hours_parser
 graticule_report = __import__("5_generate_graticule_reports")
 config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")
 import wikidata
 import nominatim
 import url_checker
@@ -34,7 +35,7 @@ def skipped_osm_cases():
 
 def count_unique_website_links(atp_code):
     website_links = set()
-    source_atp_filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+    source_atp_filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
     with open(source_atp_filename) as file:
         try:
             atp_data = json.load(file)
diff --git a/2_obtain_atp_data.py b/2_obtain_atp_data.py
index 8211d9b..8ccd6d8 100644
--- a/2_obtain_atp_data.py
+++ b/2_obtain_atp_data.py
@@ -3,35 +3,72 @@ import osm_bot_abstraction_layer.util_download_file
 import json
 import os
 import requests
+import simple_cache
+
 config = __import__("0_config")
 
 
 def main():
-    download_entire_atp_dataset()
+    download_latest_atp_dataset()
 
-def download_entire_atp_dataset():
-    FULL_ATP_FOLDER = config.atp_cache_folder()
-    if os.path.isdir(FULL_ATP_FOLDER) == False:
-        os.makedirs(FULL_ATP_FOLDER)
-    if os.path.isdir(config.atp_unpacked_folder()) == False:
-        response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
-        run_id = response.json()['run_id']
-        print(run_id)
-        download_specific_atp_dataset(run_id)
+def caching_time():
+    day_in_seconds = 60 * 60 * 24
+    return 1 * day_in_seconds
+
+@simple_cache.cache_it(filename=config.atp_cache_folder() + "atp_metadata_website_history.cache", ttl=caching_time())
+def get_atp_history():
+    response = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10)
+    return response.json()
+
+@simple_cache.cache_it(filename=config.atp_cache_folder() + "atp_metadata_website_latest.cache", ttl=caching_time())
+def get_atp_latest():
+    response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
+    return response.json()
+
+def latest_atp_unpacked_folder():
+    # not using get_atp_latest()['run_id'] directly, as the newest run may be
+    # published but not downloaded yet - using it would crash on missing files
+    # or mix fresh ATP data with processed results based on an older run
+    for entry in get_atp_history()[::-1]:
+        run_id = entry['run_id']
+        candidate = atp_unpacked_folder(run_id)
+        if os.path.isdir(candidate):
+            return candidate
+    raise Exception("no downloaded ATP dataset found, run 2_obtain_atp_data.py first")
+
+def atp_run_folder(run_id):
+    return config.atp_cache_folder() + run_id + "/"
+
+def atp_unpacked_folder(run_id):
+    # output.zip unpacks into an output/ subdirectory of the run folder
+    return atp_run_folder(run_id) + "output/"
+
+def download_latest_atp_dataset():
+    response = get_atp_latest()
+    run_id = response['run_id']
+    download_specific_atp_run(run_id)
+
+def download_specific_atp_run(run_id):
+    folder_path = atp_run_folder(run_id)
+    success_marker = folder_path + "atp_download_completed.success"
+
+    if os.path.isfile(success_marker):
+        return  # this run is already downloaded and unpacked
+
+    if os.path.isdir(folder_path):
+        # folder exists without a success marker - an earlier download was
+        # interrupted, refuse to guess which files survived
+        raise Exception(folder_path + " is in inconsistent state, delete it and retry")
+
+    os.makedirs(folder_path)
 
-def download_specific_atp_dataset(run_id):
     download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
     filename = "entire_atp.zip"
-    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
+    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, folder_path, filename)
-    os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')
+    if os.system('unzip "' + folder_path + filename + '" -d "' + folder_path + '"') != 0:
+        raise Exception("unzip failed in " + folder_path)
+    # mark this run as fully downloaded and unpacked so it can be trusted later
+    open(success_marker, 'w').close()
 
-
-def download(code, run_id):
-    directory_path = config.cache_folder()
-    download_url = 'https://alltheplaces-data.openaddresses.io/runs/' + run_id + '/output/' + code + '.geojson'
-    filename = code + ".atp.geojson"
-    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, directory_path, filename)
-
-
 if __name__ == "__main__":
     main()
diff --git a/81_generate_atp_issue_tracker_report.py b/81_generate_atp_issue_tracker_report.py
index 565f936..bfb09c0 100644
--- a/81_generate_atp_issue_tracker_report.py
+++ b/81_generate_atp_issue_tracker_report.py
@@ -9,6 +9,7 @@ import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
 import data_iterator
 qa = __import__("qa")
 config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")
 
 # TODO_ATP
 # list closed objects detected by 20_detect_unhandled_closed_poi.py
@@ -77,7 +78,7 @@ def show_reports(reports):
 
 
 def process_atp(atp_code, reports):
-    filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+    filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
     with open(filename) as file:
         try:
             data = json.load(file)
diff --git a/data_iterator.py b/data_iterator.py
index 7196ac2..7c72746 100644
--- a/data_iterator.py
+++ b/data_iterator.py
@@ -4,6 +4,7 @@ import os
 import random
 import json
 config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")
 
 def iterate_over_all_matches_for_specific_spider(area, atp_code):
     for lat_anchor in range(area['min_lat'], area['max_lat']):
@@ -34,7 +35,7 @@ def spider_codes_and_filepaths_iterator_including_broken_data_ones():
     """
     this one is not parsing .geojson files so will be faster
     """
-    directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
+    directory_path_with_unpacked_spider_data = obtain_atp_data.latest_atp_unpacked_folder()
     # TODO: there is no full match between spider codes and their filenames
     # see https://github.com/alltheplaces/alltheplaces/issues/9687
     file_list = []
diff --git a/downloader_older_atp_data.py b/downloader_older_atp_data.py
deleted file mode 100644
index 4821d49..0000000
--- a/downloader_older_atp_data.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import rich
-import osm_bot_abstraction_layer.util_download_file
-import os
-import requests
-config = __import__("0_config")
-
-def main():
-    # TODO: cache that
-    response = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10)
-    rich.print(response.json())
-    for entry in response.json()[::-1]:
-        print(entry['run_id'])
-
-# https://data.alltheplaces.xyz/runs/history.json
-# https://alltheplaces-data.openaddresses.io/runs/2025-01-11-13-32-30/output/zabka_pl.geojson
-# zabka is unstable - count past entries
-# {"type":"FeatureCollection","dataset_attributes":{"@spider":"zabka_pl","spider:collection_time":"2025-01-13T01:49:37.648114","spider:robots_txt":"ignored"}
-# filter to ones in Kraków
-
-#osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
-
-# TODO: copied existing code, modify it
-def download_entire_atp_dataset():
-    FULL_ATP_FOLDER = config.atp_cache_folder()
-    if os.path.isdir(FULL_ATP_FOLDER) == False:
-        os.makedirs(FULL_ATP_FOLDER)
-    if os.path.isdir(config.atp_unpacked_folder()) == False:
-        response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
-        run_id = response.json()['run_id']
-        print(run_id)
-        download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
-        filename = "entire_atp.zip"
-        osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
-        os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/matcher.py b/matcher.py
index 55b77c4..d572527 100644
--- a/matcher.py
+++ b/matcher.py
@@ -9,7 +9,7 @@ import datetime
 import qa
 
 config = __import__("0_config")
-
+obtain_atp_data = __import__("2_obtain_atp_data")
 
 def filter_osm_data_with_dict(current_osm, osm_data_tag_filter):
     returned = []
@@ -229,7 +229,7 @@ def load_and_clean_atp(atp_code):
 
 
 def open_atp_file(atp_code):
-    filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+    filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
     if os.path.isfile(filename) == False:
         print("there is no such file as", filename, "for spider", atp_code)
         return []
diff --git a/requirements.txt b/requirements.txt
index 46e056d..a51443c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ matplotlib
 python-dotenv
 libtorrent
 regex
+simple_cache
diff --git a/view_data_across_atp_datasets.py b/view_data_across_atp_datasets.py
new file mode 100644
index 0000000..329b364
--- /dev/null
+++ b/view_data_across_atp_datasets.py
@@ -0,0 +1,50 @@
+import rich
+import osm_bot_abstraction_layer.util_download_file
+import os
+config = __import__("0_config")
+
+# https://data.alltheplaces.xyz/runs/history.json
+# https://alltheplaces-data.openaddresses.io/runs/2025-01-11-13-32-30/output/zabka_pl.geojson
+# zabka is unstable - count past entries
+# {"type":"FeatureCollection","dataset_attributes":{"@spider":"zabka_pl","spider:collection_time":"2025-01-13T01:49:37.648114","spider:robots_txt":"ignored"}
+# filter to ones in Kraków
+
+#osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
+
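+
+# below: a rough sketch of the planned tool, based on the notes above;
+# the helper name, the per-spider cache location and the Kraków bounding
+# box are assumptions rather than settled design
+import json
+obtain_atp_data = __import__("2_obtain_atp_data")
+
+KRAKOW_AREA = {'min_lat': 49.95, 'max_lat': 50.15, 'min_lon': 19.78, 'max_lon': 20.22}  # approximate
+
+def download_spider_from_run(run_id, atp_code):
+    download_url = 'https://alltheplaces-data.openaddresses.io/runs/' + run_id + '/output/' + atp_code + '.geojson'
+    filename = run_id + "." + atp_code + ".atp.geojson"
+    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
+    return config.atp_cache_folder() + filename
+
+def count_entries_in_krakow(filepath):
+    with open(filepath) as file:
+        data = json.load(file)
+    count = 0
+    for poi in data['features']:
+        if poi['geometry'] is None:
+            continue
+        lon, lat = poi['geometry']['coordinates']
+        if KRAKOW_AREA['min_lat'] <= lat <= KRAKOW_AREA['max_lat']:
+            if KRAKOW_AREA['min_lon'] <= lon <= KRAKOW_AREA['max_lon']:
+                count += 1
+    return count
+
+def main():
+    # zabka_pl is unstable across runs, so compare its entry counts run by run
+    for entry in obtain_atp_data.get_atp_history():
+        run_id = entry['run_id']
+        filepath = download_spider_from_run(run_id, "zabka_pl")
+        rich.print(run_id, count_entries_in_krakow(filepath))
+
+if __name__ == "__main__":
+    main()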