Mirror of https://codeberg.org/matkoniecz/list_how_openstreetmap_can_be_improved_with_alltheplaces_data.git (synced 2025-05-13 05:03:09 +02:00)
rewrite import-preparing script
Commit 2ba7ee61ce, parent 79571c0a5e
1 changed file with 56 additions and 41 deletions
@@ -71,15 +71,15 @@ def main():
     overwiev_output_location = config.output_folder() + overview_output_filename
     area = graticule_report.global_graticule_coverage()
     print(area)
-    checked_keys_per_atp, known_listings = get_import_listing_configuration_for_atp_spiders()
-    generate_overview(known_listings, overwiev_output_location) # to reset potentially existing one to prevent confusion
+    checked_keys_per_atp, known_unavailable_listings = get_import_listing_configuration_for_atp_spiders()
+    generate_overview(checked_keys_per_atp, known_unavailable_listings, overwiev_output_location) # to reset potentially existing one to prevent confusion
     for atp_code in checked_keys_per_atp.keys(): # note, data is fetched from matches which may be based on a different version of ATP, with different spiders being broken or some missing/not yet present. But looking at files directly seems to be an overkill for such diagnosis tool.
         print(atp_code)
         process_single_dataset(checked_keys_per_atp[atp_code], atp_code, area)
+        for entry in checked_keys_per_atp[atp_code]:
+            del entry['not_yet_generated']
-    generate_overview(known_listings, overwiev_output_location)
-    collected_files_in_output_folder = list_of_output_files(known_listings, overview_output_filename)
+    generate_overview(checked_keys_per_atp, known_unavailable_listings, overwiev_output_location)
+    collected_files_in_output_folder = list_of_output_files(checked_keys_per_atp, overview_output_filename)
     publish_data_on_internet(collected_files_in_output_folder) # TODO enable

 def get_import_listing_configuration_for_atp_spiders():
@@ -89,12 +89,13 @@ def get_import_listing_configuration_for_atp_spiders():
     {
         'key': # string containing, well, key of processed OSM tag
         'extract_function': # function pointer - that accepts Match, returns import judgment
-        'output_filename': filename, # filename where info will be saved
+        'output_filename_for_html': filename, # filename where info will be saved
+        'output_filename_for_geojson': filename, # filename where info will be saved
+        'not_yet_generated': True
     }
     """
     checked_keys_per_atp = {}
-    known_listings_start = {'opening_hours': {}, 'website': {}}
+    known_unavailable_listings = {'opening_hours': {}, 'website': {}}

     for atp_code in obtain_atp_data.all_spider_codes_iterator():
         if allowed_spider(atp_code) == False:
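As a reading aid, not part of the commit: a minimal sketch of one per-key configuration entry in the shape the updated docstring describes. The spider code 'example_brand' and the stub extract function are invented for illustration; only the field names come from the diff.

# Minimal, self-contained sketch (not part of the commit): one configuration
# entry per checked OSM key, using the field names introduced in this diff.
def extract_website_import_info_stub(match):
    # stands in for the real extract function, which inspects a Match object
    # and returns an import judgment
    return {'status': 'importable'}

example_entry = {
    'key': 'website',  # OSM key being checked
    'extract_function': extract_website_import_info_stub,  # callable taking a Match
    'output_filename_for_html': 'import_possibilities_website_tag_example_brand.html',
    'output_filename_for_geojson': 'import_possibilities_website_tag_example_brand.geojson',
    'not_yet_generated': True,  # main() deletes this flag once the files are produced
}

# callers dispatch on the entry, roughly as process_single_dataset does
judgment = example_entry['extract_function'](match=None)
print(example_entry['key'], judgment['status'])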
@@ -103,45 +104,45 @@ def get_import_listing_configuration_for_atp_spiders():
         if atp_code in config.spiders_with_known_bad_website_tag():
             # this will throw out entries added there recently
             # after dataset being used was generated
-            known_listings_start['website'][atp_code] = {'status': 'problems in ATP data'}
+            known_unavailable_listings['website'][atp_code] = {'status': 'problems in ATP data'}
         else:
-            filename = "import_possibilities_website_tag_" + atp_code + ".html"
-            known_listings_start['website'][atp_code] = {'filename': filename, 'status': 'available'}
+            filename_html = "import_possibilities_website_tag_" + atp_code + ".html"
+            filename_geojson = "import_possibilities_website_tag_" + atp_code + ".geojson"
             checked_keys.append({
                 'key': 'website',
                 'extract_function': extract_website_import_info,
-                'output_filename': filename,
+                'output_filename_for_html': filename_html,
+                'output_filename_for_geojson': filename_geojson,
+                'not_yet_generated': True
             })
         if atp_code in config.spiders_with_known_bad_opening_hours() or atp_code in config.spiders_with_known_bad_website_tag():
             # this will throw out entries added there recently
             # after dataset being used was generated
-            known_listings_start['opening_hours'][atp_code] = {'status': 'problems in ATP data'}
+            known_unavailable_listings['opening_hours'][atp_code] = {'status': 'problems in ATP data'}
         else:
-            filename = "import_possibilities_opening_hours_tag_" + atp_code + ".html"
-            known_listings_start['opening_hours'][atp_code] = {'filename': filename, 'status': 'available'}
+            filename_html = "import_possibilities_opening_hours_tag_" + atp_code + ".html"
+            filename_geojson = "import_possibilities_opening_hours_tag_" + atp_code + ".geojson"
             checked_keys.append({
                 'key': "opening_hours",
                 'extract_function': extract_opening_hours_import_info,
-                'output_filename': filename,
+                'output_filename_for_html': filename_html,
+                'output_filename_for_geojson': filename_geojson,
+                'not_yet_generated': True
             })
         checked_keys_per_atp[atp_code] = checked_keys
-    return checked_keys_per_atp, known_listings_start
+    return checked_keys_per_atp, known_unavailable_listings


-def list_of_output_files(known_listings, overview_output_filename):
+def list_of_output_files(checked_keys_per_atp, overview_output_filename):
     collected_files_in_output_folder = []
     collected_files_in_output_folder.append(overview_output_filename)
-    for data in known_listings.values():
-        for entry in data.values():
-            if entry.get('status') == 'problems in ATP data':
-                continue
-            elif entry.get('status') == 'available':
-                collected_files_in_output_folder.append(entry['filename'])
-            else:
-                raise Exception("unexpected status in", entry)
+    for atp_code, data_group in checked_keys_per_atp.items():
+        if 'not_yet_generated' not in data_group:
+            # if there are problems in ATP data it should not be generated
+            for data in data_group:
+                collected_files_in_output_folder.append(data['output_filename_for_html'])
+                collected_files_in_output_folder.append(data['output_filename_for_geojson'])
     return collected_files_in_output_folder


 def publish_data_on_internet(collected_files_in_output_folder):
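Again as an illustration rather than project code: the rough shape of the two values the rewritten get_import_listing_configuration_for_atp_spiders() returns, for one hypothetical spider with usable data and one with known-bad data. It assumes checked_keys starts out as an empty list per spider, which is not visible in this hunk.

# Illustration only (not part of the commit): rough shape of the two values
# returned by get_import_listing_configuration_for_atp_spiders().
# Spider codes 'good_brand' and 'bad_brand' are hypothetical.
checked_keys_per_atp = {
    'good_brand': [
        {
            'key': 'website',
            'extract_function': None,  # the real code stores extract_website_import_info here
            'output_filename_for_html': 'import_possibilities_website_tag_good_brand.html',
            'output_filename_for_geojson': 'import_possibilities_website_tag_good_brand.geojson',
            'not_yet_generated': True,
        },
        # ...plus an analogous 'opening_hours' entry
    ],
    'bad_brand': [],  # nothing to check when the ATP data is known to be bad
}

known_unavailable_listings = {
    'website': {'bad_brand': {'status': 'problems in ATP data'}},
    'opening_hours': {'bad_brand': {'status': 'problems in ATP data'}},
}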
@@ -150,21 +151,18 @@ def publish_data_on_internet(collected_files_in_output_folder):
         os.system('cd "' + config.published_output_folder() + '" && git add ' + file)
     os.system('cd "' + config.published_output_folder() + '" && git commit -m "automatic update (import possibilities listing)"')

-def generate_overview(known_listings, location):
-    def format_relevant_listing_into_field(relevant_listing):
-        if atp_code in relevant_listing and 'not_yet_generated' not in relevant_listing:
-            status = relevant_listing[atp_code]['status']
-            if status == 'available':
-                filename = relevant_listing[atp_code]['filename']
-                return {'type': 'link', 'value': {'text': 'list', 'url': filename}}
-            elif status == 'problems in ATP data':
-                return {'type': 'text', 'value': 'problems in ATP data'}
-            else:
-                raise Exception('unexpected status ' + status)
-        return {'type': 'text', 'value': '????'}
+def generate_overview(checked_keys_per_atp, known_unavailable_listings, location):
+    def format_relevant_listing_into_field(atp_code, checked_key_info, unavailability_if_any):
+        if unavailability_if_any != None:
+            return {'type': 'text', 'value': unavailability_if_any['status']}
+        if 'not_yet_generated' in checked_key_info:
+            return {'type': 'text', 'value': '????'}
+        filename = checked_key_info['output_filename_for_html']
+        return {'type': 'link', 'value': {'text': 'list', 'url': filename}}

     relevant_atp_codes = set()
-    for listing in known_listings:
-        for atp_code in listing.keys():
+    for atp_code, data in checked_keys_per_atp.items():
+        if 'not_yet_generated' not in data:
             relevant_atp_codes.add(atp_code)
     with open(location, 'w') as outfile:
         output = prose.universal_html_prefix("Import possibilities")
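A side note on the publishing step shown as context at the top of this hunk: the os.system calls build a shell string by concatenating the output folder and file names. Purely as a sketch of an alternative, with placeholder names and not how the project does it, the same git add/commit sequence can be expressed with subprocess.run and a working directory:

# Sketch only: files are passed as separate arguments and cwd= replaces the
# 'cd ... &&' prefix, which sidesteps shell-quoting of folder and file names.
import subprocess

def publish_files(published_output_folder, files):
    for file in files:
        subprocess.run(['git', 'add', file], cwd=published_output_folder, check=True)
    subprocess.run(
        ['git', 'commit', '-m', 'automatic update (import possibilities listing)'],
        cwd=published_output_folder,
        check=True,
    )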
@@ -173,11 +171,20 @@ def generate_overview(known_listings, location):
         output += "<p>List below </p>"
         output += prose.quality_disclaimer()
         spider_data = []
-        sorted_importable_keys = sorted(known_listings.keys())
+        sorted_importable_keys = sorted(known_unavailable_listings.keys())
         for atp_code in relevant_atp_codes:
             columns = [{'type': 'text', 'value': atp_code}]
             for key in sorted_importable_keys:
-                columns.append(format_relevant_listing_into_field(known_listings[key]))
+                unavailability_if_any = known_unavailable_listings[key].get(atp_code)
+                checked_key_info = None
+                for potential_checked_key_info in checked_keys_per_atp[atp_code]:
+                    if potential_checked_key_info['key'] == key:
+                        checked_key_info = potential_checked_key_info
+                if checked_key_info == None and unavailability_if_any == None:
+                    rich.print(checked_keys_per_atp[atp_code])
+                    rich.print(unavailability_if_any)
+                    raise
+                columns.append(format_relevant_listing_into_field(atp_code, checked_key_info, unavailability_if_any))
             spider_data.append({
                 'columns': columns,
                 'display_type': 'normal',
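For orientation, a sketch with invented values (not from the commit) of one row appended to spider_data by the loop above, for a hypothetical spider whose opening_hours data is known to be bad and whose website listing is available:

# Invented example of one spider_data row built by the loop above, for a
# hypothetical spider 'example_brand'. Column order follows
# sorted(known_unavailable_listings.keys()): opening_hours, then website.
example_row = {
    'columns': [
        {'type': 'text', 'value': 'example_brand'},
        {'type': 'text', 'value': 'problems in ATP data'},  # opening_hours: known-bad data
        {'type': 'link', 'value': {'text': 'list', 'url': 'import_possibilities_website_tag_example_brand.html'}},  # website: listing available
    ],
    'display_type': 'normal',
}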
@@ -200,6 +207,8 @@ def process_single_dataset(checked_data_sources, atp_code, area):
     for entry in entries:
         links_per_osm_object[entry.osm_link] += 1

+    apparently_importable = defaultdict(dict)
+
     for entry in entries:
         if entry.osm_link in skipped_osm_cases():
             continue
@@ -223,9 +232,10 @@ def process_single_dataset(checked_data_sources, atp_code, area):
             # so it is preferable to do it sparingly
             import_judgment = adjust_judgment_for_address_location_mismatch_checked_by_nominatim(entry, import_judgment)
             extracted['data'].append(entry_to_presentation_object(extracted['key'], entry, import_judgment))
+            apparently_importable[extracted['key']].append(entry)

     for extracted in checked_data_sources:
-        filename = extracted['output_filename']
+        filename = extracted['output_filename_for_html']
         location = config.output_folder() + filename
         with open(location, 'w') as outfile:
             output = prose.universal_html_prefix(atp_code + " " + extracted['key'] + " import candidates")
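The apparently_importable container introduced in this and the previous hunk groups import candidates per OSM key before they are exported further down. A self-contained sketch of that accumulate-per-key pattern, written here with collections.defaultdict(list) since the entries are gathered with .append():

# Self-contained sketch of the accumulate-per-key pattern (not project code):
# candidates are grouped by the OSM key they would improve, then handled per key.
from collections import defaultdict

apparently_importable = defaultdict(list)  # OSM key -> list of candidate entries

candidates = [
    ('website', 'node/1'),
    ('opening_hours', 'node/2'),
    ('website', 'node/3'),
]
for key, osm_object in candidates:
    apparently_importable[key].append(osm_object)

for key, entries in apparently_importable.items():
    print(key, len(entries), "import candidates")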
@@ -239,6 +249,11 @@ def process_single_dataset(checked_data_sources, atp_code, area):
             outfile.write(output)
         print(f"wrote file to {location}")
+
+        geojson_data = serializing.generate_geojson_structure(apparently_importable[extracted['key']])
+        with open(config.output_folder() + extracted['output_filename_for_geojson'], 'w') as f:
+            json.dump(geojson_data, f)
+        print(f"wrote geojson file to {extracted['output_filename_for_geojson']}")

 def adjust_judgment_for_address_location_mismatch_checked_by_nominatim(entry, judgment):
     if judgment['status'] in ['it_is_not_matching', 'dubious_match', 'no_import_for_this_key']:
         # no import, no need to check Nominatim
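The new lines above serialize each key's candidates to GeoJSON via serializing.generate_geojson_structure, whose output format is defined elsewhere in the project. Purely for orientation, a GeoJSON FeatureCollection as written with json.dump generally has this shape; coordinates and properties here are invented:

# Rough shape of a GeoJSON file as written with json.dump; values are made up,
# the real feature layout comes from serializing.generate_geojson_structure.
import json

geojson_data = {
    'type': 'FeatureCollection',
    'features': [
        {
            'type': 'Feature',
            'geometry': {'type': 'Point', 'coordinates': [19.9450, 50.0647]},  # lon, lat
            'properties': {'atp_code': 'example_brand', 'key': 'website', 'value': 'https://example.com'},
        },
    ],
}

with open('import_possibilities_website_tag_example_brand.geojson', 'w') as f:
    json.dump(geojson_data, f)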