mirror of https://codeberg.org/matkoniecz/list_how_openstreetmap_can_be_improved_with_alltheplaces_data.git (synced 2025-04-11 10:09:29 +02:00)
prepare for fetching old ATP datasets
This commit is contained in:
parent add8a753f9
commit 6103e6f398
11 changed files with 79 additions and 70 deletions
@@ -552,10 +552,6 @@ def atp_cache_folder():
    return cache_folder() + "entire_atp/"


-def atp_unpacked_folder():
-    return atp_cache_folder() + "output/"
-
-
def build_storage_folder():
    return cache_folder() + "build_temporary_files/"

@@ -6946,7 +6942,10 @@ def show_info_about_spider_to_debug_it(atp_code):
def return_info_about_spider_to_debug_it(atp_code):
    returned = get_github_link_to_spider(atp_code)
    returned += "\n"
-    returned += atp_unpacked_folder() + atp_code + '.geojson'
+    # TODO: how to get data from 2_obtain_atp_data.py without looping dependencies
+    # maybe just drop the line below?
+    # move ATP access into a separate file? with 2_obtain_atp_data becoming a tiny wrapper?
+    returned += atp_cache_folder() + "????" + atp_code + '.geojson'
    return returned

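The TODO above asks how to reach data from 2_obtain_atp_data.py without creating an import loop. A minimal sketch of the refactor the comment itself suggests, with a hypothetical module name that is not part of this commit: keep the ATP path helpers in one tiny module that both the config module and 2_obtain_atp_data can import.

# atp_paths.py - hypothetical module name and layout, an assumption rather than part of this commit.
# Only path construction lives here, so importing it from the config module and
# from 2_obtain_atp_data does not create a circular dependency.
import os


def atp_cache_folder(cache_folder):
    # mirrors the existing layout: <cache>/entire_atp/
    return os.path.join(cache_folder, "entire_atp") + "/"


def atp_unpacked_folder(cache_folder, run_id):
    # one subfolder per ATP run, matching atp_unpacked_folder(run_id) introduced later in this commit
    return atp_cache_folder(cache_folder) + run_id + "/"
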
@@ -1,7 +1,7 @@
import rich
import json
config = __import__("0_config")
-
+obtain_atp_data = __import__("2_obtain_atp_data")
reported = {}


@@ -34,7 +34,7 @@ def main():
    for atp_code in data_iterator.all_spider_codes_iterator():
        if atp_code in config.ignored_atp_codes():
            continue
-        filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+        filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
        with open(filename) as file:
            try:
                atp_data = json.load(file)

@@ -2,6 +2,7 @@ import rich
import json
import qa
config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")
import data_iterator

def log_if_unhandled_closing_found(tags):

@@ -38,7 +39,7 @@ def main():
    for atp_code in data_iterator.all_spider_codes_iterator():
        if atp_code in config.ignored_atp_codes():
            continue
-        filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+        filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
        with open(filename) as file:
            try:
                atp_data = json.load(file)

@@ -12,6 +12,7 @@ import json
import opening_hours_parser
graticule_report = __import__("5_generate_graticule_reports")
config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")
import wikidata
import nominatim
import url_checker

@@ -34,7 +35,7 @@ def skipped_osm_cases():

def count_unique_website_links(atp_code):
    website_links = set()
-    source_atp_filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+    source_atp_filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
    with open(source_atp_filename) as file:
        try:
            atp_data = json.load(file)

@@ -3,35 +3,64 @@ import osm_bot_abstraction_layer.util_download_file
import json
import os
import requests
+import simple_cache

config = __import__("0_config")

+print(simple_cache.read_cache(config.atp_cache_folder() + 'atp_metadata_website_latest.cache'))

def main():
-    download_entire_atp_dataset()
+    download_latest_atp_dataset()

def download_entire_atp_dataset():
    FULL_ATP_FOLDER = config.atp_cache_folder()
    if os.path.isdir(FULL_ATP_FOLDER) == False:
        os.makedirs(FULL_ATP_FOLDER)
    if os.path.isdir(config.atp_unpacked_folder()) == False:
        response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
        run_id = response.json()['run_id']
        print(run_id)
        download_specific_atp_dataset(run_id)
+def caching_time():
+    day_in_seconds = 60 * 60 * 24
+    return 1 * day_in_seconds
+
+@simple_cache.cache_it(filename=config.atp_cache_folder() + "atp_metadata_website_history.cache", ttl=caching_time())
+def get_atp_history():
+    response = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10)
+    return response.json()
+
+@simple_cache.cache_it(filename=config.atp_cache_folder() + "atp_metadata_website_latest.cache", ttl=caching_time())
+def get_atp_latest():
+    response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
+    return response.json()
+
+def latest_atp_unpacked_folder():
+    # not using get_atp_latest()['run_id']
+    # as it would result in a situation where new ATP data gets published,
+    # is not fetched yet, but the script already starts using it,
+    # or where the code mixes new ATP data with processed data based on an older ATP run
+    while True:
+        for entry in get_atp_history()[::-1]:
+            run_id = entry['run_id']
+            candidate = atp_unpacked_folder(run_id)
+            if os.path.isdir(candidate) == True:
+                return candidate
+
+def atp_unpacked_folder(run_id):
+    return config.atp_cache_folder() + run_id + "/"
+
+def download_latest_atp_dataset():
+    response = get_atp_latest()
+    run_id = response['run_id']
+    download_specific_atp_run(run_id)
+
+def download_specific_atp_run(run_id):
+    folder_path = config.atp_cache_folder() + "/" + run_id + "/"
+    success_marker = folder_path + "atp_download_completed.success"
+
+    if os.path.isfile(success_marker) == False:
+        if os.path.isdir(folder_path) == True:
+            raise Exception(folder_path + " is in inconsistent state")
+
+        if os.path.isdir(folder_path) == False:
+            os.makedirs(folder_path)

def download_specific_atp_dataset(run_id):
    download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
    filename = "entire_atp.zip"
-    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
+    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, folder_path, filename)
    os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')


def download(code, run_id):
    directory_path = config.cache_folder()
    download_url = 'https://alltheplaces-data.openaddresses.io/runs/' + run_id + '/output/' + code + '.geojson'
    filename = code + ".atp.geojson"
    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, directory_path, filename)


if __name__ == "__main__":
    main()

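Note on the hunk above: download_specific_atp_run() so far only checks the success marker and creates the run folder, and the changed download_file_if_not_present_already() call refers to a folder_path variable that download_specific_atp_dataset() never defines, so the per-run download is clearly unfinished groundwork. A minimal sketch of where this could be heading, assuming one zip per run folder guarded by the success marker; the function name is hypothetical and this is not the author's final design:

import os
import osm_bot_abstraction_layer.util_download_file

config = __import__("0_config")


def fetch_atp_run(run_id):
    # hypothetical completed flow: fetch one ATP run into its own folder,
    # unpack it, and record completion with a marker file
    folder_path = config.atp_cache_folder() + run_id + "/"
    success_marker = folder_path + "atp_download_completed.success"
    if os.path.isfile(success_marker):
        return folder_path  # this run was already fetched and unpacked
    if os.path.isdir(folder_path):
        raise Exception(folder_path + " is in inconsistent state")
    os.makedirs(folder_path)
    download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
    filename = "entire_atp.zip"
    osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, folder_path, filename)
    if os.system('unzip "' + folder_path + filename + '" -d "' + folder_path + '"') != 0:
        raise Exception("unzip failed for " + folder_path + filename)
    open(success_marker, "w").close()  # mark the run as complete only after unpacking
    return folder_path
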
@@ -9,6 +9,7 @@ import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
import data_iterator
qa = __import__("qa")
config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")

# TODO_ATP
# list closed objects detected by 20_detect_unhandled_closed_poi.py

@@ -77,7 +78,7 @@ def show_reports(reports):


def process_atp(atp_code, reports):
-    filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+    filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
    with open(filename) as file:
        try:
            data = json.load(file)

@@ -4,6 +4,7 @@ import os
import random
import json
config = __import__("0_config")
+obtain_atp_data = __import__("2_obtain_atp_data")

def iterate_over_all_matches_for_specific_spider(area, atp_code):
    for lat_anchor in range(area['min_lat'], area['max_lat']):

@@ -34,7 +35,7 @@ def spider_codes_and_filepaths_iterator_including_broken_data_ones():
    """
    this one is not parsing .geojson files so will be faster
    """
-    directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
+    directory_path_with_unpacked_spider_data = obtain_atp_data.latest_atp_unpacked_folder()
    # TODO: there is no full match between spider codes and their filenames
    # see https://github.com/alltheplaces/alltheplaces/issues/9687
    file_list = []

@@ -1,37 +0,0 @@
-import rich
-import osm_bot_abstraction_layer.util_download_file
-import os
-import requests
-config = __import__("0_config")
-
-def main():
-    # TODO: cache that
-    response = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10)
-    rich.print(response.json())
-    for entry in response.json()[::-1]:
-        print(entry['run_id'])
-
-# https://data.alltheplaces.xyz/runs/history.json
-# https://alltheplaces-data.openaddresses.io/runs/2025-01-11-13-32-30/output/zabka_pl.geojson
-# zabka is unstable - count past entries
-# {"type":"FeatureCollection","dataset_attributes":{"@spider":"zabka_pl","spider:collection_time":"2025-01-13T01:49:37.648114","spider:robots_txt":"ignored"}
-# filter to ones in Kraków
-
-#osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
-
-# TODO: copied existing code, modify it
-def download_entire_atp_dataset():
-    FULL_ATP_FOLDER = config.atp_cache_folder()
-    if os.path.isdir(FULL_ATP_FOLDER) == False:
-        os.makedirs(FULL_ATP_FOLDER)
-    if os.path.isdir(config.atp_unpacked_folder()) == False:
-        response = requests.get("https://data.alltheplaces.xyz/runs/latest.json", timeout=10)
-        run_id = response.json()['run_id']
-        print(run_id)
-        download_url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output.zip"
-        filename = "entire_atp.zip"
-        osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
-        os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')
-
-if __name__ == "__main__":
-    main()

@@ -9,7 +9,7 @@ import datetime
import qa

config = __import__("0_config")
-
+obtain_atp_data = __import__("2_obtain_atp_data")

def filter_osm_data_with_dict(current_osm, osm_data_tag_filter):
    returned = []

@@ -229,7 +229,7 @@ def load_and_clean_atp(atp_code):


def open_atp_file(atp_code):
-    filename = config.atp_unpacked_folder() + atp_code + '.geojson'
+    filename = obtain_atp_data.latest_atp_unpacked_folder() + atp_code + '.geojson'
    if os.path.isfile(filename) == False:
        print("there is no such file as", filename, "for spider", atp_code)
        return []

@@ -10,3 +10,4 @@ matplotlib
python-dotenv
libtorrent
regex
+simple_cache

view_data_across_atp_datasets.py (new normal file, 13 lines)
@@ -0,0 +1,13 @@
+import rich
+import osm_bot_abstraction_layer.util_download_file
+import os
+config = __import__("0_config")
+
+# https://data.alltheplaces.xyz/runs/history.json
+# https://alltheplaces-data.openaddresses.io/runs/2025-01-11-13-32-30/output/zabka_pl.geojson
+# zabka is unstable - count past entries
+# {"type":"FeatureCollection","dataset_attributes":{"@spider":"zabka_pl","spider:collection_time":"2025-01-13T01:49:37.648114","spider:robots_txt":"ignored"}
+# filter to ones in Kraków
+
+#osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
+
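The comments in this new file sketch a plan: look at zabka_pl across past ATP runs (the spider is unstable), count entries per run, and narrow them down to Kraków. A minimal sketch of that idea, using the URLs from the comments above; the bounding box and helper names are illustrative assumptions, not part of this commit:

import requests

# rough bounding box around Kraków, an assumption for illustration only
KRAKOW_BBOX = {"min_lat": 49.97, "max_lat": 50.13, "min_lon": 19.79, "max_lon": 20.22}


def is_in_krakow(feature):
    lon, lat = feature["geometry"]["coordinates"][:2]
    return (KRAKOW_BBOX["min_lat"] <= lat <= KRAKOW_BBOX["max_lat"]
            and KRAKOW_BBOX["min_lon"] <= lon <= KRAKOW_BBOX["max_lon"])


def count_zabka_entries_per_run():
    history = requests.get("https://data.alltheplaces.xyz/runs/history.json", timeout=10).json()
    for entry in history[::-1]:
        run_id = entry["run_id"]
        url = "https://alltheplaces-data.openaddresses.io/runs/" + run_id + "/output/zabka_pl.geojson"
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            continue  # this spider may be missing from some runs
        features = response.json()["features"]
        in_krakow = [f for f in features if f.get("geometry") and is_in_krakow(f)]
        print(run_id, len(features), "entries total,", len(in_krakow), "in Kraków")


if __name__ == "__main__":
    count_zabka_entries_per_run()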