From 05e856589394ebf1f101120737ed3bfa13764e24 Mon Sep 17 00:00:00 2001
From: Mateusz Konieczny <matkoniecz@gmail.com>
Date: Thu, 6 Mar 2025 10:24:21 +0100
Subject: [PATCH] one more diagnostic script

---
 ...ind_where_multiple_atp_match_to_one_osm.py | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 22_find_where_multiple_atp_match_to_one_osm.py

diff --git a/22_find_where_multiple_atp_match_to_one_osm.py b/22_find_where_multiple_atp_match_to_one_osm.py
new file mode 100644
index 0000000..4149251
--- /dev/null
+++ b/22_find_where_multiple_atp_match_to_one_osm.py
@@ -0,0 +1,95 @@
+"""Report OSM objects that more than one ATP entry was matched to.
+
+Diagnostic script - it only prints what it finds.
+"""
+import rich
+import data_iterator
+from collections import defaultdict
+# module names starting with a digit cannot be used in a regular import statement
+obtain_atp_data = __import__("2_obtain_atp_data")
+graticule_report = __import__("5_generate_graticule_reports")
+
+def main():
+    """Run the cross-spider check; the regular check stays disabled (see below)."""
+    crossspider_multimatch_check()
+    #regular_multimatch_check() # this is normal and happens very often
+
+def crossspider_multimatch_check(): # TODO - near duplicate of regular_multimatch_check - delete regular_multimatch_check? deduplicate?
+    """Print OSM objects matched by entries of more than one spider.
+
+    Matches repeated within a single spider are deliberately not reported:
+    links seen in the current spider are merged into the global set only
+    after that spider has been fully processed.
+    """
+    matched_osm_entries = set() # links matched by spiders processed so far
+    multimatches = set()
+    limited_cache_of_full_matchings = defaultdict(list)
+    area = graticule_report.global_graticule_coverage()
+    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+        matched_osm_entries_for_this_spider = set()
+        for entry in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
+            if len(limited_cache_of_full_matchings) > 1_000_000: #500_000:
+                # to avoid busting memory
+                # maybe we can keep more?
+                # TODO
+                limited_cache_of_full_matchings = defaultdict(list)
+            if entry.osm_link is not None:
+                if entry.osm_link in matched_osm_entries:
+                    print("matched multiple times, for example on")
+                    log_entry(entry)
+                    multimatches.add(entry.osm_link)
+                    if entry.osm_link in limited_cache_of_full_matchings:
+                        print("also in limited_cache_of_full_matchings")
+                        for from_limited_cache in limited_cache_of_full_matchings[entry.osm_link]:
+                            log_entry(from_limited_cache)
+                    print()
+                    print()
+                    print()
+                    print()
+                # not added to matched_osm_entries right away - that would
+                # also report repeated matches within the very same spider
+                matched_osm_entries_for_this_spider.add(entry.osm_link)
+                limited_cache_of_full_matchings[entry.osm_link].append(entry)
+        matched_osm_entries |= matched_osm_entries_for_this_spider
+    rich.print(multimatches)
+
+def regular_multimatch_check():
+    """Print OSM objects matched more than once, by the same or another spider."""
+    matched_osm_entries = set()
+    multimatches = set()
+    limited_cache_of_full_matchings = defaultdict(list)
+    area = graticule_report.global_graticule_coverage()
+    for atp_code in obtain_atp_data.all_spider_codes_iterator():
+        for entry in data_iterator.iterate_over_all_matches_for_specific_spider(area, atp_code):
+            if len(limited_cache_of_full_matchings) > 100_000:
+                # to avoid busting memory
+                # maybe we can keep more?
+                # TODO
+                limited_cache_of_full_matchings = defaultdict(list)
+            if entry.osm_link is not None:
+                if entry.osm_link in matched_osm_entries:
+                    print("matched multiple times, for example on")
+                    log_entry(entry)
+                    multimatches.add(entry.osm_link)
+                    if entry.osm_link in limited_cache_of_full_matchings:
+                        print("also in limited_cache_of_full_matchings")
+                        for from_limited_cache in limited_cache_of_full_matchings[entry.osm_link]:
+                            log_entry(from_limited_cache)
+                    print()
+                    print()
+                    print()
+                    print()
+                else:
+                    matched_osm_entries.add(entry.osm_link)
+                limited_cache_of_full_matchings[entry.osm_link].append(entry)
+    rich.print(multimatches)
+
+def log_entry(entry):
+    """Print the OSM link, the links to the point in ATP and in OSM, and the ATP tags."""
+    print(entry.osm_link)
+    print(entry.link_to_point_in_atp())
+    print(entry.link_to_point_in_osm())
+    rich.print(entry.atp_tags)
+
+if __name__ == '__main__':
+    main()