From 05e856589394ebf1f101120737ed3bfa13764e24 Mon Sep 17 00:00:00 2001
From: Mateusz Konieczny <matkoniecz@gmail.com>
Date: Thu, 6 Mar 2025 10:24:21 +0100
Subject: [PATCH] one more diagnostic script

---
 ...ind_where_multiple_atp_match_to_one_osm.py | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 22_find_where_multiple_atp_match_to_one_osm.py

diff --git a/22_find_where_multiple_atp_match_to_one_osm.py b/22_find_where_multiple_atp_match_to_one_osm.py
new file mode 100644
index 0000000..4149251
--- /dev/null
+++ b/22_find_where_multiple_atp_match_to_one_osm.py
@@ -0,0 +1,79 @@
+import rich
+import data_iterator
+from collections import defaultdict
+obtain_atp_data = __import__("2_obtain_atp_data")
+graticule_report = __import__("5_generate_graticule_reports")
+
+def main():
+    crossspider_multimatch_check()
+    #regular_multimatch_check() # this is normal and happens very often
+
+def crossspider_multimatch_check(): # TODO - near duplicate of regular_multimatch_check - delete regular_multimatch_check? deduplicate?
+    matched_osm_entries = set()
+    multimatches = set()
+    limited_cache_of_full_matchings = defaultdict(list)
+    area = graticule_report.global_graticule_coverage()
+    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+        matched_osm_entries_for_this_spider = set()
+        for entry in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
+            if len(limited_cache_of_full_matchings) > 1_000_000: #500_000:
+                # to avoid busting memory
+                # maybe we can keep more?
+                # TODO
+                limited_cache_of_full_matchings = defaultdict(list)
+            if entry.osm_link != None:
+                if entry.osm_link in matched_osm_entries:
+                    print("matched multiple times, for example on")
+                    log_entry(entry)
+                    multimatches.add(entry.osm_link)
+                    if entry.osm_link in limited_cache_of_full_matchings:
+                        print("also in limited_cache_of_full_matchings")
+                        for from_limited_cache in limited_cache_of_full_matchings[entry.osm_link]:
+                            log_entry(from_limited_cache)
+                    print()
+                    print()
+                    print()
+                    print()
+                    matched_osm_entries.add(entry.osm_link)
+                matched_osm_entries_for_this_spider.add(entry.osm_link)
+                limited_cache_of_full_matchings[entry.osm_link].append(entry)
+        matched_osm_entries = matched_osm_entries | matched_osm_entries_for_this_spider
+
+def regular_multimatch_check():
+    matched_osm_entries = set()
+    multimatches = set()
+    limited_cache_of_full_matchings = defaultdict(list)
+    area = graticule_report.global_graticule_coverage()
+    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+        for entry in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
+            if len(limited_cache_of_full_matchings) > 100_000:
+                # to avoid busting memory
+                # maybe we can keep more?
+                # TODO
+                limited_cache_of_full_matchings = defaultdict(list)
+            if entry.osm_link != None:
+                if entry.osm_link in matched_osm_entries:
+                    print("matched multiple times, for example on")
+                    log_entry(entry)
+                    multimatches.add(entry.osm_link)
+                    if entry.osm_link in limited_cache_of_full_matchings:
+                        print("also in limited_cache_of_full_matchings")
+                        for from_limited_cache in limited_cache_of_full_matchings[entry.osm_link]:
+                            log_entry(from_limited_cache)
+                    print()
+                    print()
+                    print()
+                    print()
+                else:
+                    matched_osm_entries.add(entry.osm_link)
+                limited_cache_of_full_matchings[entry.osm_link].append(entry)
+    rich.print(multimatches)
+
+def log_entry(entry):
+    print(entry.osm_link)
+    print(entry.link_to_point_in_atp())
+    print(entry.link_to_point_in_osm())
+    rich.print(entry.atp_tags)
+
+if __name__ == '__main__':
+    main()  # run the diagnostic only when executed as a script, not when imported