
rewrite import-preparing script

Mateusz Konieczny 2025-02-18 19:39:04 +01:00
parent 79571c0a5e
commit 2ba7ee61ce


@@ -71,15 +71,15 @@ def main():
overwiev_output_location = config.output_folder() + overview_output_filename
area = graticule_report.global_graticule_coverage()
print(area)
checked_keys_per_atp, known_listings = get_import_listing_configuration_for_atp_spiders()
generate_overview(known_listings, overwiev_output_location) # to reset potentially existing one to prevent confusion
checked_keys_per_atp, known_unavailable_listings = get_import_listing_configuration_for_atp_spiders()
generate_overview(checked_keys_per_atp, known_unavailable_listings, overwiev_output_location) # to reset potentially existing one to prevent confusion
for atp_code in checked_keys_per_atp.keys(): # note: data is fetched from matches which may be based on a different version of ATP, with different spiders being broken or some missing/not yet present. But looking at files directly seems to be overkill for such a diagnostic tool.
print(atp_code)
process_single_dataset(checked_keys_per_atp[atp_code], atp_code, area)
for entry in checked_keys_per_atp[atp_code]:
del entry['not_yet_generated']
generate_overview(known_listings, overwiev_output_location)
collected_files_in_output_folder = list_of_output_files(known_listings, overview_output_filename)
generate_overview(checked_keys_per_atp, known_unavailable_listings, overwiev_output_location)
collected_files_in_output_folder = list_of_output_files(checked_keys_per_atp, overview_output_filename)
publish_data_on_internet(collected_files_in_output_folder) # TODO enable
def get_import_listing_configuration_for_atp_spiders():
@@ -89,12 +89,13 @@ def get_import_listing_configuration_for_atp_spiders():
{
'key': # string containing, well, key of processed OSM tag
'extract_function': # function pointer - that accepts Match, returns import judgment
'output_filename': filename, # filename where info will be saved
'output_filename_for_html': filename, # filename where the HTML listing will be saved
'output_filename_for_geojson': filename, # filename where the GeoJSON output will be saved
'not_yet_generated': True
}
"""
checked_keys_per_atp = {}
known_listings_start = {'opening_hours': {}, 'website': {}}
known_unavailable_listings = {'opening_hours': {}, 'website': {}}
for atp_code in obtain_atp_data.all_spider_codes_iterator():
if allowed_spider(atp_code) == False:
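
For illustration, a minimal sketch of one entry in the per-spider list described by the docstring above (the spider code in the filenames is hypothetical; extract_website_import_info is the extraction function used later in this diff):

# hypothetical entry for a spider with code "some_shop", following the documented structure
{
    'key': 'website',
    'extract_function': extract_website_import_info,
    'output_filename_for_html': 'import_possibilities_website_tag_some_shop.html',
    'output_filename_for_geojson': 'import_possibilities_website_tag_some_shop.geojson',
    'not_yet_generated': True,  # deleted in main() once the listing has been generated
}
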
@@ -103,45 +104,45 @@ def get_import_listing_configuration_for_atp_spiders():
if atp_code in config.spiders_with_known_bad_website_tag():
# this will throw out entries added there recently
# after dataset being used was generated
known_listings_start['website'][atp_code] = {'status': 'problems in ATP data'}
known_unavailable_listings['website'][atp_code] = {'status': 'problems in ATP data'}
else:
filename = "import_possibilities_website_tag_" + atp_code + ".html"
known_listings_start['website'][atp_code] = {'filename': filename, 'status': 'available'}
filename_html = "import_possibilities_website_tag_" + atp_code + ".html"
filename_geojson = "import_possibilities_website_tag_" + atp_code + ".geojson"
checked_keys.append({
'key': 'website',
'extract_function': extract_website_import_info,
'output_filename': filename,
'output_filename_for_html': filename_html,
'output_filename_for_geojson': filename_geojson,
'not_yet_generated': True
})
if atp_code in config.spiders_with_known_bad_opening_hours() or atp_code in config.spiders_with_known_bad_website_tag():
# this will throw out entries added there recently
# after dataset being used was generated
known_listings_start['opening_hours'][atp_code] = {'status': 'problems in ATP data'}
known_unavailable_listings['opening_hours'][atp_code] = {'status': 'problems in ATP data'}
else:
filename = "import_possibilities_opening_hours_tag_" + atp_code + ".html"
known_listings_start['opening_hours'][atp_code] = {'filename': filename, 'status': 'available'}
filename_html = "import_possibilities_opening_hours_tag_" + atp_code + ".html"
filename_geojson = "import_possibilities_opening_hours_tag_" + atp_code + ".geojson"
checked_keys.append({
'key': "opening_hours",
'extract_function': extract_opening_hours_import_info,
'output_filename': filename,
'output_filename_for_html': filename_html,
'output_filename_for_geojson': filename_geojson,
'not_yet_generated': True
})
checked_keys_per_atp[atp_code] = checked_keys
return checked_keys_per_atp, known_listings_start
return checked_keys_per_atp, known_unavailable_listings
def list_of_output_files(known_listings, overview_output_filename):
def list_of_output_files(checked_keys_per_atp, overview_output_filename):
collected_files_in_output_folder = []
collected_files_in_output_folder.append(overview_output_filename)
for data in known_listings.values():
for entry in data.values():
if entry.get('status') == 'problems in ATP data':
continue
elif entry.get('status') == 'available':
collected_files_in_output_folder.append(entry['filename'])
else:
raise Exception("unexpected status in", entry)
for atp_code, data_group in checked_keys_per_atp.items():
if 'not_yet_generated' not in data_group:
# if there are problems in ATP data it should not be generated
for data in data_group:
collected_files_in_output_folder.append(data['output_filename_for_html'])
collected_files_in_output_folder.append(data['output_filename_for_geojson'])
return collected_files_in_output_folder
def publish_data_on_internet(collected_files_in_output_folder):
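
A sketch of the list that list_of_output_files is then expected to return for a single fully processed spider (the spider code and overview filename are illustrative):

# hypothetical result: the overview filename comes from the caller,
# the per-key HTML and GeoJSON names follow the patterns built above
[
    'some_overview.html',
    'import_possibilities_website_tag_some_shop.html',
    'import_possibilities_website_tag_some_shop.geojson',
    'import_possibilities_opening_hours_tag_some_shop.html',
    'import_possibilities_opening_hours_tag_some_shop.geojson',
]
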
@@ -150,21 +151,18 @@ def publish_data_on_internet(collected_files_in_output_folder):
os.system('cd "' + config.published_output_folder() + '" && git add ' + file)
os.system('cd "' + config.published_output_folder() + '" && git commit -m "automatic update (import possibilities listing)"')
def generate_overview(known_listings, location):
def format_relevant_listing_into_field(relevant_listing):
if atp_code in relevant_listing and 'not_yet_generated' not in relevant_listing:
status = relevant_listing[atp_code]['status']
if status == 'available':
filename = relevant_listing[atp_code]['filename']
return {'type': 'link', 'value': {'text': 'list', 'url': filename}}
elif status == 'problems in ATP data':
return {'type': 'text', 'value': 'problems in ATP data'}
else:
raise Exception('unexpected status ' + status)
return {'type': 'text', 'value': '????'}
def generate_overview(checked_keys_per_atp, known_unavailable_listings, location):
def format_relevant_listing_into_field(atp_code, checked_key_info, unavailability_if_any):
if unavailability_if_any != None:
return {'type': 'text', 'value': unavailability_if_any['status']}
if 'not_yet_generated' in checked_key_info:
return {'type': 'text', 'value': '????'}
filename = checked_key_info['output_filename_for_html']
return {'type': 'link', 'value': {'text': 'list', 'url': filename}}
relevant_atp_codes = set()
for listing in known_listings:
for atp_code in listing.keys():
for atp_code, data in checked_keys_per_atp.items():
if 'not_yet_generated' not in data:
relevant_atp_codes.add(atp_code)
with open(location, 'w') as outfile:
output = prose.universal_html_prefix("Import possibilities")
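
The rewritten format_relevant_listing_into_field helper above produces one of three cell descriptors per spider and key; a sketch of the possible return values (the linked filename is illustrative):

{'type': 'text', 'value': 'problems in ATP data'}  # listing known to be unavailable
{'type': 'text', 'value': '????'}  # entry still flagged as not_yet_generated
{'type': 'link', 'value': {'text': 'list', 'url': 'import_possibilities_website_tag_some_shop.html'}}  # generated listing
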
@@ -173,11 +171,20 @@ def generate_overview(known_listings, location):
output += "<p>List below </p>"
output += prose.quality_disclaimer()
spider_data = []
sorted_importable_keys = sorted(known_listings.keys())
sorted_importable_keys = sorted(known_unavailable_listings.keys())
for atp_code in relevant_atp_codes:
columns = [{'type': 'text', 'value': atp_code}]
for key in sorted_importable_keys:
columns.append(format_relevant_listing_into_field(known_listings[key]))
unavailability_if_any = known_unavailable_listings[key].get(atp_code)
checked_key_info = None
for potential_checked_key_info in checked_keys_per_atp[atp_code]:
if potential_checked_key_info['key'] == key:
checked_key_info = potential_checked_key_info
if checked_key_info == None and unavailability_if_any == None:
rich.print(checked_keys_per_atp[atp_code])
rich.print(unavailability_if_any)
raise
columns.append(format_relevant_listing_into_field(atp_code, checked_key_info, unavailability_if_any))
spider_data.append({
'columns': columns,
'display_type': 'normal',
@@ -200,6 +207,8 @@ def process_single_dataset(checked_data_sources, atp_code, area):
for entry in entries:
links_per_osm_object[entry.osm_link] += 1
apparently_importable = defaultdict(list)
for entry in entries:
if entry.osm_link in skipped_osm_cases():
continue
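
The new apparently_importable container groups importable entries under the checked key so they can later be dumped into per-key GeoJSON files; a minimal standalone sketch of the pattern (the entry values here are illustrative strings):

from collections import defaultdict

apparently_importable = defaultdict(list)  # missing keys start as an empty list
apparently_importable['website'].append('entry_1')
apparently_importable['website'].append('entry_2')
apparently_importable['opening_hours'].append('entry_3')
# apparently_importable now holds {'website': ['entry_1', 'entry_2'], 'opening_hours': ['entry_3']}
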
@@ -223,9 +232,10 @@ def process_single_dataset(checked_data_sources, atp_code, area):
# so it is preferable to do it sparingly
import_judgment = adjust_judgment_for_address_location_mismatch_checked_by_nominatim(entry, import_judgment)
extracted['data'].append(entry_to_presentation_object(extracted['key'], entry, import_judgment))
apparently_importable[extracted['key']].append(entry)
for extracted in checked_data_sources:
filename = extracted['output_filename']
filename = extracted['output_filename_for_html']
location = config.output_folder() + filename
with open(location, 'w') as outfile:
output = prose.universal_html_prefix(atp_code + " " + extracted['key'] + " import candidates")
@@ -239,6 +249,11 @@ def process_single_dataset(checked_data_sources, atp_code, area):
outfile.write(output)
print(f"wrote file to {location}")
geojson_data = serializing.generate_geojson_structure(apparently_importable[extracted['key']])
with open(config.output_folder() + extracted['output_filename_for_geojson'], 'w') as f:
json.dump(geojson_data, f)
print(f"wrote geojson file to {extracted['output_filename_for_geojson']}")
def adjust_judgment_for_address_location_mismatch_checked_by_nominatim(entry, judgment):
if judgment['status'] in ['it_is_not_matching', 'dubious_match', 'no_import_for_this_key']:
# no import, no need to check Nominatim
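
The GeoJSON files added above are produced by serializing.generate_geojson_structure from this codebase; as rough orientation for readers unfamiliar with the format, a minimal sketch of the kind of FeatureCollection such a file could hold (the fields shown are illustrative, not this project's exact schema):

import json

# illustrative only - the real structure comes from serializing.generate_geojson_structure
geojson_data = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "geometry": {"type": "Point", "coordinates": [19.94, 50.06]},  # longitude, latitude
            "properties": {"key": "website", "value": "https://example.com"},
        }
    ],
}
with open("import_possibilities_website_tag_some_shop.geojson", "w") as f:
    json.dump(geojson_data, f)
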