1
0
Fork 0

First step in downloading past ATP data.

This commit is contained in:
Mateusz Konieczny 2025-03-07 05:01:24 +01:00
parent 05e8565893
commit a489b32546

View file

@ -0,0 +1,37 @@
import os
import subprocess

import requests
import rich

import osm_bot_abstraction_layer.util_download_file
config = __import__("0_config")
def main():
    """List past All The Places run ids, newest-first source reversed to oldest-first.

    Fetches the ATP run history JSON and prints each run id. Exploration
    step toward downloading historical ATP data.
    """
    # TODO: cache that
    response = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10)
    # parse the JSON payload once instead of re-parsing it for every use
    history = response.json()
    rich.print(history)
    for entry in history[::-1]:
        print(entry['run_id'])
    # https://data.alltheplaces.xyz/runs/history.json
    # https://alltheplaces-data.openaddresses.io/runs/2025-01-11-13-32-30/output/zabka_pl.geojson
    # zabka is unstable - count past entries
    # {"type":"FeatureCollection","dataset_attributes":{"@spider":"zabka_pl","spider:collection_time":"2025-01-13T01:49:37.648114","spider:robots_txt":"ignored"}
    # filter to ones in Kraków
    #osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
# TODO: copied existing code, modify it
def download_entire_atp_dataset():
    """Download and unpack the latest full All The Places dataset.

    Skips all work if the unpacked folder already exists. Otherwise fetches
    the latest run id, downloads its output.zip into the cache folder (if not
    already present) and unpacks it there.
    """
    full_atp_folder = config.atp_cache_folder()
    # exist_ok avoids the racy isdir()-then-makedirs() check
    os.makedirs(full_atp_folder, exist_ok=True)
    if not os.path.isdir(config.atp_unpacked_folder()):
        response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
        run_id = response.json()['run_id']
        print(run_id)
        download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
        filename = "entire_atp.zip"
        osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, full_atp_folder, filename)
        # subprocess with an argument list avoids shell quoting/injection issues
        # of os.system, and os.path.join works whether or not the cache folder
        # path ends with a separator (plain "+" concatenation did not)
        subprocess.run(
            ["unzip", os.path.join(full_atp_folder, filename), "-d", full_atp_folder],
            check=True,
        )
# Script entry point: run the exploration step when executed directly.
if __name__ == "__main__":
    main()