
rewrite import-preparing script

Mateusz Konieczny 2025-02-18 19:39:04 +01:00
parent 79571c0a5e
commit 2ba7ee61ce


@@ -71,15 +71,15 @@ def main():
overwiev_output_location = config.output_folder() + overview_output_filename
area = graticule_report.global_graticule_coverage()
print(area)
checked_keys_per_atp, known_listings = get_import_listing_configuration_for_atp_spiders()
generate_overview(known_listings, overwiev_output_location) # to reset potentially existing one to prevent confusion
checked_keys_per_atp, known_unavailable_listings = get_import_listing_configuration_for_atp_spiders()
generate_overview(checked_keys_per_atp, known_unavailable_listings, overwiev_output_location) # to reset potentially existing one to prevent confusion
for atp_code in checked_keys_per_atp.keys(): # note: data is fetched from matches which may be based on a different version of ATP, with different spiders being broken or some missing/not yet present. But looking at files directly seems to be overkill for such a diagnostic tool.
print(atp_code)
process_single_dataset(checked_keys_per_atp[atp_code], atp_code, area)
for entry in checked_keys_per_atp[atp_code]:
del entry['not_yet_generated']
generate_overview(known_listings, overwiev_output_location)
collected_files_in_output_folder = list_of_output_files(known_listings, overview_output_filename)
generate_overview(checked_keys_per_atp, known_unavailable_listings, overwiev_output_location)
collected_files_in_output_folder = list_of_output_files(checked_keys_per_atp, overview_output_filename)
publish_data_on_internet(collected_files_in_output_folder) # TODO enable
def get_import_listing_configuration_for_atp_spiders():
@@ -89,12 +89,13 @@ def get_import_listing_configuration_for_atp_spiders():
{
'key': # string containing, well, key of processed OSM tag
'extract_function': # function pointer - that accepts Match, returns import judgment
'output_filename': filename, # filename where info will be saved
'output_filename_for_html': filename, # filename where the HTML listing will be saved
'output_filename_for_geojson': filename, # filename where the GeoJSON output will be saved
'not_yet_generated': True
}
"""
checked_keys_per_atp = {}
known_listings_start = {'opening_hours': {}, 'website': {}}
known_unavailable_listings = {'opening_hours': {}, 'website': {}}
for atp_code in obtain_atp_data.all_spider_codes_iterator():
if allowed_spider(atp_code) == False:
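
For illustration, a minimal sketch of one entry in the per-spider list described by the docstring above (the spider code in the filenames is hypothetical; extract_website_import_info is the extraction function used later in this diff):

# hypothetical entry for a spider with code "some_shop", following the documented structure
{
    'key': 'website',
    'extract_function': extract_website_import_info,
    'output_filename_for_html': 'import_possibilities_website_tag_some_shop.html',
    'output_filename_for_geojson': 'import_possibilities_website_tag_some_shop.geojson',
    'not_yet_generated': True,  # deleted in main() once the listing has been generated
}
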
@@ -103,45 +104,45 @@ def get_import_listing_configuration_for_atp_spiders():
if atp_code in config.spiders_with_known_bad_website_tag():
# this will throw out entries added there recently
# after dataset being used was generated
known_listings_start['website'][atp_code] = {'status': 'problems in ATP data'}
known_unavailable_listings['website'][atp_code] = {'status': 'problems in ATP data'}
else:
filename = "import_possibilities_website_tag_" + atp_code + ".html"
known_listings_start['website'][atp_code] = {'filename': filename, 'status': 'available'}
filename_html = "import_possibilities_website_tag_" + atp_code + ".html"
filename_geojson = "import_possibilities_website_tag_" + atp_code + ".geojson"
checked_keys.append({
'key': 'website',
'extract_function': extract_website_import_info,
'output_filename': filename,
'output_filename_for_html': filename_html,
'output_filename_for_geojson': filename_geojson,
'not_yet_generated': True
})
if atp_code in config.spiders_with_known_bad_opening_hours() or atp_code in config.spiders_with_known_bad_website_tag():
# this will throw out entries added there recently
# after dataset being used was generated
known_listings_start['opening_hours'][atp_code] = {'status': 'problems in ATP data'}
known_unavailable_listings['opening_hours'][atp_code] = {'status': 'problems in ATP data'}
else:
filename = "import_possibilities_opening_hours_tag_" + atp_code + ".html"
known_listings_start['opening_hours'][atp_code] = {'filename': filename, 'status': 'available'}
filename_html = "import_possibilities_opening_hours_tag_" + atp_code + ".html"
filename_geojson = "import_possibilities_opening_hours_tag_" + atp_code + ".geojson"
checked_keys.append({
'key': "opening_hours",
'extract_function': extract_opening_hours_import_info,
'output_filename': filename,
'output_filename_for_html': filename_html,
'output_filename_for_geojson': filename_geojson,
'not_yet_generated': True
})
checked_keys_per_atp[atp_code] = checked_keys
return checked_keys_per_atp, known_listings_start
return checked_keys_per_atp, known_unavailable_listings
def list_of_output_files(known_listings, overview_output_filename):
def list_of_output_files(checked_keys_per_atp, overview_output_filename):
collected_files_in_output_folder = []
collected_files_in_output_folder.append(overview_output_filename)
for data in known_listings.values():
for entry in data.values():
if entry.get('status') == 'problems in ATP data':
continue
elif entry.get('status') == 'available':
collected_files_in_output_folder.append(entry['filename'])
else:
raise Exception("unexpected status in", entry)
for atp_code, data_group in checked_keys_per_atp.items():
if 'not_yet_generated' not in data_group:
# if there are problems in ATP data it should not be generated
for data in data_group:
collected_files_in_output_folder.append(data['output_filename_for_html'])
collected_files_in_output_folder.append(data['output_filename_for_geojson'])
return collected_files_in_output_folder
def publish_data_on_internet(collected_files_in_output_folder):
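
A sketch of the list that list_of_output_files is then expected to return for a single fully processed spider (the spider code and overview filename are illustrative):

# hypothetical result: the overview filename comes from the caller,
# the per-key HTML and GeoJSON names follow the patterns built above
[
    'some_overview.html',
    'import_possibilities_website_tag_some_shop.html',
    'import_possibilities_website_tag_some_shop.geojson',
    'import_possibilities_opening_hours_tag_some_shop.html',
    'import_possibilities_opening_hours_tag_some_shop.geojson',
]
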
@@ -150,21 +151,18 @@ def publish_data_on_internet(collected_files_in_output_folder):
os.system('cd "' + config.published_output_folder() + '" && git add ' + file)
os.system('cd "' + config.published_output_folder() + '" && git commit -m "automatic update (import possibilities listing)"')
def generate_overview(known_listings, location):
def format_relevant_listing_into_field(relevant_listing):
if atp_code in relevant_listing and 'not_yet_generated' not in relevant_listing:
status = relevant_listing[atp_code]['status']
if status == 'available':
filename = relevant_listing[atp_code]['filename']
return {'type': 'link', 'value': {'text': 'list', 'url': filename}}
elif status == 'problems in ATP data':
return {'type': 'text', 'value': 'problems in ATP data'}
else:
raise Exception('unexpected status ' + status)
return {'type': 'text', 'value': '????'}
def generate_overview(checked_keys_per_atp, known_unavailable_listings, location):
def format_relevant_listing_into_field(atp_code, checked_key_info, unavailability_if_any):
if unavailability_if_any != None:
return {'type': 'text', 'value': unavailability_if_any['status']}
if 'not_yet_generated' in checked_key_info:
return {'type': 'text', 'value': '????'}
filename = checked_key_info['output_filename_for_html']
return {'type': 'link', 'value': {'text': 'list', 'url': filename}}
relevant_atp_codes = set()
for listing in known_listings:
for atp_code in listing.keys():
for atp_code, data in checked_keys_per_atp.items():
if 'not_yet_generated' not in data:
relevant_atp_codes.add(atp_code)
with open(location, 'w') as outfile:
output = prose.universal_html_prefix("Import possibilities")
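
The rewritten format_relevant_listing_into_field helper above produces one of three cell descriptors per spider and key; a sketch of the possible return values (the linked filename is illustrative):

{'type': 'text', 'value': 'problems in ATP data'}  # listing known to be unavailable
{'type': 'text', 'value': '????'}  # entry still flagged as not_yet_generated
{'type': 'link', 'value': {'text': 'list', 'url': 'import_possibilities_website_tag_some_shop.html'}}  # generated listing
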
@@ -173,11 +171,20 @@ def generate_overview(known_listings, location):
output += "<p>List below </p>"
output += prose.quality_disclaimer()
spider_data = []
sorted_importable_keys = sorted(known_listings.keys())
sorted_importable_keys = sorted(known_unavailable_listings.keys())
for atp_code in relevant_atp_codes:
columns = [{'type': 'text', 'value': atp_code}]
for key in sorted_importable_keys:
columns.append(format_relevant_listing_into_field(known_listings[key]))
unavailability_if_any = known_unavailable_listings[key].get(atp_code)
checked_key_info = None
for potential_checked_key_info in checked_keys_per_atp[atp_code]:
if potential_checked_key_info['key'] == key:
checked_key_info = potential_checked_key_info
if checked_key_info == None and unavailability_if_any == None:
rich.print(checked_keys_per_atp[atp_code])
rich.print(unavailability_if_any)
raise
columns.append(format_relevant_listing_into_field(atp_code, checked_key_info, unavailability_if_any))
spider_data.append({
'columns': columns,
'display_type': 'normal',
@@ -200,6 +207,8 @@ def process_single_dataset(checked_data_sources, atp_code, area):
for entry in entries:
links_per_osm_object[entry.osm_link] += 1
apparently_importable = defaultdict(list)
for entry in entries:
if entry.osm_link in skipped_osm_cases():
continue
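
The new apparently_importable container groups importable entries under the checked key so they can later be dumped into per-key GeoJSON files; a minimal standalone sketch of the pattern (the entry values here are illustrative strings):

from collections import defaultdict

apparently_importable = defaultdict(list)  # missing keys start as an empty list
apparently_importable['website'].append('entry_1')
apparently_importable['website'].append('entry_2')
apparently_importable['opening_hours'].append('entry_3')
# apparently_importable now holds {'website': ['entry_1', 'entry_2'], 'opening_hours': ['entry_3']}
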
@@ -223,9 +232,10 @@ def process_single_dataset(checked_data_sources, atp_code, area):
# so it is preferable to do it sparingly
import_judgment = adjust_judgment_for_address_location_mismatch_checked_by_nominatim(entry, import_judgment)
extracted['data'].append(entry_to_presentation_object(extracted['key'], entry, import_judgment))
apparently_importable[extracted['key']].append(entry)
for extracted in checked_data_sources:
filename = extracted['output_filename']
filename = extracted['output_filename_for_html']
location = config.output_folder() + filename
with open(location, 'w') as outfile:
output = prose.universal_html_prefix(atp_code + " " + extracted['key'] + " import candidates")
@@ -239,6 +249,11 @@ def process_single_dataset(checked_data_sources, atp_code, area):
outfile.write(output)
print(f"wrote file to {location}")
geojson_data = serializing.generate_geojson_structure(apparently_importable[extracted['key']])
with open(config.output_folder() + extracted['output_filename_for_geojson'], 'w') as f:
json.dump(geojson_data, f)
print(f"wrote geojson file to {extracted['output_filename_for_geojson']}")
def adjust_judgment_for_address_location_mismatch_checked_by_nominatim(entry, judgment):
if judgment['status'] in ['it_is_not_matching', 'dubious_match', 'no_import_for_this_key']:
# no import, no need to check Nominatim
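
The GeoJSON files added above are produced by serializing.generate_geojson_structure from this codebase; as rough orientation for readers unfamiliar with the format, a minimal sketch of the kind of FeatureCollection such a file could hold (the fields shown are illustrative, not this project's exact schema):

import json

# illustrative only - the real structure comes from serializing.generate_geojson_structure
geojson_data = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "geometry": {"type": "Point", "coordinates": [19.94, 50.06]},  # longitude, latitude
            "properties": {"key": "website", "value": "https://example.com"},
        }
    ],
}
with open("import_possibilities_website_tag_some_shop.geojson", "w") as f:
    json.dump(geojson_data, f)
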