
prepare to fetch old ATP datasets

Mateusz Konieczny 2025-03-20 15:38:55 +01:00
parent add8a753f9
commit 6103e6f398
11 changed files with 79 additions and 70 deletions

View file

@@ -552,10 +552,6 @@ def atp_cache_folder():
return cache_folder() + "entire_atp/"
def atp_unpacked_folder():
return atp_cache_folder() + "output/"
def build_storage_folder():
return cache_folder() + "build_temporary_files/"
@@ -6946,7 +6942,10 @@ def show_info_about_spider_to_debug_it(atp_code):
def return_info_about_spider_to_debug_it(atp_code):
returned = get_github_link_to_spider(atp_code)
returned += "\n"
returned += atp_unpacked_folder() + atp_code + '.geojson'
# TODO: how to get data from 2_obtain_atp_data.py without looping dependencies
# maybe just drop line below?
# move atp accessing into separate file? with 2_obtain_atp_data becoming tiny wrapper?
returned += atp_cache_folder() + "????" + atp_code + '.geojson'
return returned
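The TODO above asks how to reach the run-specific ATP folder from here without a circular import between 0_config and 2_obtain_atp_data. A minimal sketch of the "move ATP accessing into a separate file" option, using a hypothetical shared module name (atp_paths.py) that both scripts could import; none of these names exist in the repository yet:

    # atp_paths.py - hypothetical shared helper; 0_config and 2_obtain_atp_data
    # could both import it without depending on each other
    import os

    def atp_run_folder(cache_folder, run_id):
        # each downloaded ATP run gets its own subfolder of the cache
        return cache_folder + run_id + "/"

    def latest_downloaded_run_folder(cache_folder, run_ids_newest_first):
        # return the newest run that is actually present on disk, or None
        for run_id in run_ids_newest_first:
            candidate = atp_run_folder(cache_folder, run_id)
            if os.path.isdir(candidate):
                return candidate
        return None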

View file

@@ -1,7 +1,7 @@
import rich
import json
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
reported = {}
@@ -34,7 +34,7 @@ def main():
for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
continue
filename = config.atp_unpacked_folder() + atp_code + '.geojson'
filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
with open(filename) as file:
try:
atp_data = json.load(file)

View file

@@ -2,6 +2,7 @@ import rich
import json
import qa
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
import data_iterator
def log_if_unhandled_closing_found(tags):
@@ -38,7 +39,7 @@ def main():
for atp_code in data_iterator.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
continue
filename = config.atp_unpacked_folder() + atp_code + '.geojson'
filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
with open(filename) as file:
try:
atp_data = json.load(file)

View file

@@ -12,6 +12,7 @@ import json
import opening_hours_parser
graticule_report = __import__("5_generate_graticule_reports")
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
import wikidata
import nominatim
import url_checker
@@ -34,7 +35,7 @@ def skipped_osm_cases():
def count_unique_website_links(atp_code):
website_links = set()
source_atp_filename = config.atp_unpacked_folder() + atp_code + '.geojson'
source_atp_filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
with open(source_atp_filename) as file:
try:
atp_data = json.load(file)

View file

@@ -3,35 +3,64 @@ import osm_bot_abstraction_layer.util_download_file
import json
import os
import requests
import simple_cache
config = __import__("0_config")
print(simple_cache.read_cache(config.atp_cache_folder() + 'atp_metadata_website_latest.cache'))
def main():
download_entire_atp_dataset()
download_latest_atp_dataset()
def download_entire_atp_dataset():
FULL_ATP_FOLDER = config.atp_cache_folder()
if os.path.isdir(FULL_ATP_FOLDER) == False:
os.makedirs(FULL_ATP_FOLDER)
if os.path.isdir(config.atp_unpacked_folder()) == False:
response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
run_id = response.json()['run_id']
print(run_id)
download_specific_atp_dataset(run_id)
def caching_time():
day_in_seconds = 60 * 60 * 24
return 1 * day_in_seconds
@simple_cache.cache_it(filename=config.atp_cache_folder() + "atp_metadata_website_history.cache", ttl=caching_time())
def get_atp_history():
response = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10)
return response.json()
@simple_cache.cache_it(filename=config.atp_cache_folder() + "atp_metadata_website_latest.cache", ttl=caching_time())
def get_atp_latest():
response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
return response.json()
def latest_atp_unpacked_folder():
# not using get_atp_latest()['run_id']
# as it would result in situation where new ATP data gets published
# it is not fetched yet but script starts using it
# or data is different and code is mixing new ATP data with old processed data based on older ATP
while True:
for entry in get_atp_history()[::-1]:
run_id = entry['run_id']
candidate = atp_unpacked_folder(run_id)
if os.path.isdir(candidate) == True:
return candidate
def atp_unpacked_folder(run_id):
return config.atp_cache_folder() + run_id + "/"
def download_latest_atp_dataset():
response = get_atp_latest()
run_id = response['run_id']
download_specific_atp_run(run_id)
def download_specific_atp_run(run_id):
folder_path = atp_unpacked_folder(run_id)
success_marker = folder_path + "atp_download_completed.success"
if os.path.isfile(success_marker) == False:
if os.path.isdir(folder_path) == True:
raise Exception(folder_path + " is in inconsistent state")
if os.path.isdir(folder_path) == False:
os.makedirs(folder_path)
def download_specific_atp_dataset(run_id):
download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
filename = "entire_atp.zip"
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, folder_path, filename)
os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')
def download(code, run_id):
directory_path = config.cache_folder()
download_url = 'https://alltheplaces-data.openaddresses.io/runs/' + run_id + '/output/' + code + '.geojson'
filename = code + ".atp.geojson"
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, directory_path, filename)
if __name__ == "__main__":
main()
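download_specific_atp_run guards each per-run folder with a success marker so an interrupted download can be told apart from a completed one. A minimal sketch of how that marker protocol could look once the download step is wired in; the helper names here are hypothetical, not the real module's API:

    import os

    def ensure_run_downloaded(folder_path, fetch_and_unpack):
        # the marker file is created only after fetch_and_unpack finishes,
        # so a folder that exists without it is an interrupted download
        success_marker = os.path.join(folder_path, "atp_download_completed.success")
        if os.path.isfile(success_marker):
            return folder_path  # already downloaded and unpacked
        if os.path.isdir(folder_path):
            raise Exception(folder_path + " is in inconsistent state")
        os.makedirs(folder_path)
        fetch_and_unpack(folder_path)  # caller-supplied download + unzip step
        open(success_marker, "w").close()
        return folder_path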

View file

@@ -9,6 +9,7 @@ import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
import data_iterator
qa = __import__("qa")
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
# TODO_ATP
# list closed objects detected by 20_detect_unhandled_closed_poi.py
@@ -77,7 +78,7 @@ def show_reports(reports):
def process_atp(atp_code, reports):
filename = config.atp_unpacked_folder() + atp_code + '.geojson'
filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
with open(filename) as file:
try:
data = json.load(file)

View file

@@ -4,6 +4,7 @@ import os
import random
import json
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
def iterate_over_all_matches_for_specific_spider(area, atp_code):
for lat_anchor in range(area['min_lat'], area['max_lat']):
@@ -34,7 +35,7 @@ def spider_codes_and_filepaths_iterator_including_broken_data_ones():
"""
this one is not parsing .geojson files so will be faster
"""
directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
directory_path_with_unpacked_spider_data = obtain_atp_data.latest_atp_unpacked_folder()
# TODO: there is no full match between spider codes and their filenames
# see https://github.com/alltheplaces/alltheplaces/issues/9687
file_list = []

View file

@@ -1,37 +0,0 @@
import rich
import osm_bot_abstraction_layer.util_download_file
import os
import requests
config = __import__("0_config")
def main():
# TODO: cache that
response = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10)
rich.print(response.json())
for entry in response.json()[::-1]:
print(entry['run_id'])
# https://data.alltheplaces.xyz/runs/history.json
# https://alltheplaces-data.openaddresses.io/runs/2025-01-11-13-32-30/output/zabka_pl.geojson
# zabka is unstable - count past entries
# {"type":"FeatureCollection","dataset_attributes":{"@spider":"zabka_pl","spider:collection_time":"2025-01-13T01:49:37.648114","spider:robots_txt":"ignored"}
# filter to ones in Kraków
#osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
# TODO: copied existing code, modify it
def download_entire_atp_dataset():
FULL_ATP_FOLDER = config.atp_cache_folder()
if os.path.isdir(FULL_ATP_FOLDER) == False:
os.makedirs(FULL_ATP_FOLDER)
if os.path.isdir(config.atp_unpacked_folder()) == False:
response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
run_id = response.json()['run_id']
print(run_id)
download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
filename = "entire_atp.zip"
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')
if __name__ == "__main__":
main()

View file

@@ -9,7 +9,7 @@ import datetime
import qa
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
def filter_osm_data_with_dict(current_osm, osm_data_tag_filter):
returned = []
@@ -229,7 +229,7 @@ def load_and_clean_atp(atp_code):
def open_atp_file(atp_code):
filename = config.atp_unpacked_folder() + atp_code + '.geojson'
filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
if os.path.isfile(filename) == False:
print("there is no such file as", filename, "for spider", atp_code)
return []

View file

@@ -10,3 +10,4 @@ matplotlib
python-dotenv
libtorrent
regex
simple_cache

View file

@@ -0,0 +1,13 @@
import rich
import osm_bot_abstraction_layer.util_download_file
import os
config = __import__("0_config")
# https://data.alltheplaces.xyz/runs/history.json
# https://alltheplaces-data.openaddresses.io/runs/2025-01-11-13-32-30/output/zabka_pl.geojson
# zabka is unstable - count past entries
# {"type":"FeatureCollection","dataset_attributes":{"@spider":"zabka_pl","spider:collection_time":"2025-01-13T01:49:37.648114","spider:robots_txt":"ignored"}
# filter to ones in Kraków
#osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
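The comments above outline the plan for old datasets: walk history.json, fetch one spider's .geojson from each past run, and read its dataset_attributes (for example spider:collection_time for zabka_pl). A minimal sketch of that per-run fetch, assuming the URLs from the comments stay valid; the Kraków filtering is left out and the function names are placeholders:

    import requests

    def fetch_spider_from_run(run_id, spider_code):
        # one spider's output from a specific historical ATP run
        url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output/" + spider_code + ".geojson"
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # the spider may be absent from some runs
        return response.json()

    def collection_times(spider_code):
        # iterate past runs from history.json (reversed, as in the code above)
        # and record when each run scraped the given spider
        history = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10).json()
        times = []
        for entry in history[::-1]:
            data = fetch_spider_from_run(entry['run_id'], spider_code)
            times.append(data['dataset_attributes'].get('spider:collection_time'))
        return times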