mirror of
https://codeberg.org/matkoniecz/list_how_openstreetmap_can_be_improved_with_alltheplaces_data.git
synced 2025-04-11 10:09:29 +02:00
autopep8 --in-place --max-line-length=420 --recursive .
This commit is contained in:
parent
9e9f5af2b0
commit
6e5070861f
24 changed files with 284 additions and 163 deletions
0_config.py1_obtain_osm_data.py2_obtain_atp_data.py3_matcher.py4_show_data.py5_generate_organic_map_bookmarks.py6_experimental_graticule_splitter.py7_experimental_taginfo_tag_lister.pydistance_distribution.pylink_scan_worker.pynominatim.pynominatim_worker.pyqa.pyrun.pyserializing.pyshared.pyspatial_index.pytest_display_website.pytest_general_smoke_test.pytest_matching_logic.pytest_processing.pytest_spatial_index.pyurl_checker.pyurl_checker_test.py
126
0_config.py
126
0_config.py
File diff suppressed because one or more lines are too long
|
@ -5,6 +5,7 @@ import shops
|
|||
import time
|
||||
config = __import__("0_config")
|
||||
|
||||
|
||||
def main():
|
||||
directory_path = config.cache_folder()
|
||||
|
||||
|
@ -22,10 +23,11 @@ def main():
|
|||
print(region, "- list_shops - started")
|
||||
start = time.time()
|
||||
for entry in shops.osm.list_shops(region, directory_path):
|
||||
pass # needed to trigger processing code
|
||||
pass # needed to trigger processing code
|
||||
print((time.time() - start) / 60, "minutes")
|
||||
print(region, "- list_shops - completed")
|
||||
processed.append(region)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -1,18 +1,20 @@
|
|||
import rich
|
||||
import osm_bot_abstraction_layer.util_download_file
|
||||
import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
config = __import__("0_config")
|
||||
import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
|
||||
import osm_bot_abstraction_layer.util_download_file
|
||||
import rich
|
||||
config = __import__("0_config")
|
||||
|
||||
|
||||
def processed_atp_codes():
    """Yield every ATP spider code listed under 'accepted' in the processing plan.

    Iterates over the areas returned by ``config.processing_plan()`` in plan
    order. Areas without an ``'accepted'`` key contribute nothing. Codes are
    yielded as stored, without deduplication.
    """
    for plan_entry in config.processing_plan().values():
        if 'accepted' not in plan_entry:
            continue
        yield from plan_entry['accepted']
|
||||
|
||||
|
||||
def main():
|
||||
response = requests.get("https://data.alltheplaces.xyz/runs/latest.json")
|
||||
todos = json.loads(response.text)
|
||||
|
@ -21,14 +23,15 @@ def main():
|
|||
download_entire_atp_dataset(run_id)
|
||||
look_through_entire_atp_dataset()
|
||||
|
||||
|
||||
def do_not_remind_that_this_tagging_may_be_worth_supporting():
|
||||
notified_about_tag = {
|
||||
# should be fixed in ATP, if possible
|
||||
# TODO: raise isses at https://github.com/alltheplaces/alltheplaces/issues
|
||||
'tourism': ['yes', 'attraction'],
|
||||
'healthcare': [
|
||||
'laboratory', # https://github.com/alltheplaces/alltheplaces/issues/8637
|
||||
'centre', # not reported yet TODO
|
||||
'laboratory', # https://github.com/alltheplaces/alltheplaces/issues/8637
|
||||
'centre', # not reported yet TODO
|
||||
],
|
||||
|
||||
# TODO maybe start including them?
|
||||
|
@ -48,11 +51,11 @@ def do_not_remind_that_this_tagging_may_be_worth_supporting():
|
|||
if key not in notified_about_tag:
|
||||
notified_about_tag[key] = []
|
||||
|
||||
#missing shoplike
|
||||
# missing shoplike
|
||||
notified_about_tag['office'].append('yes')
|
||||
notified_about_tag['amenity'].append('canteen')
|
||||
|
||||
#kind also shoplike? I want to support them
|
||||
# kind also shoplike? I want to support them
|
||||
notified_about_tag['man_made'].append('charge_point')
|
||||
notified_about_tag['amenity'].append('music_venue')
|
||||
notified_about_tag['amenity'].append('prep_school')
|
||||
|
@ -117,9 +120,10 @@ def do_not_remind_that_this_tagging_may_be_worth_supporting():
|
|||
|
||||
# seem hard to confirm by survey
|
||||
notified_about_tag['craft'].append('brewery')
|
||||
notified_about_tag['amenity'].append('post_depot') # internal facility, not post_office
|
||||
notified_about_tag['amenity'].append('post_depot') # internal facility, not post_office
|
||||
return notified_about_tag
|
||||
|
||||
|
||||
def warn_about_broken_spider(atp_code, message):
|
||||
print()
|
||||
print()
|
||||
|
@ -130,24 +134,27 @@ def warn_about_broken_spider(atp_code, message):
|
|||
print(url_log)
|
||||
print()
|
||||
|
||||
|
||||
def maybe_warn_about_spider_with_empty_file(atp_code):
    """Warn that the spider ``atp_code`` produced empty output, if config wants that.

    The decision is delegated to
    ``config.is_empty_file_for_spider_worth_mentioning``; when it answers
    falsy, nothing is printed.
    """
    if not config.is_empty_file_for_spider_worth_mentioning(atp_code):
        return
    warn_about_broken_spider(atp_code, "empty output")
|
||||
|
||||
|
||||
def maybe_warn_about_spider_with_broken_file(atp_code):
    """Warn that the spider ``atp_code`` produced a broken file, if config wants that.

    The decision is delegated to
    ``config.is_broken_file_for_spider_worth_mentioning``; when it answers
    falsy, nothing is printed.
    """
    if not config.is_broken_file_for_spider_worth_mentioning(atp_code):
        return
    warn_about_broken_spider(atp_code, "broken output file")
|
||||
|
||||
|
||||
def warn_about_new_tags_that_are_neither_shoplike_nor_ignored(entry, item_path):
|
||||
notified_about_tag = do_not_remind_that_this_tagging_may_be_worth_supporting()
|
||||
for key in tag_knowledge.typical_main_keys():
|
||||
if key in entry['properties']:
|
||||
if ";" in entry['properties'][key]:
|
||||
continue # TODO - is it safe to consider it as being unfinished?
|
||||
continue # TODO - is it safe to consider it as being unfinished?
|
||||
if key == 'healthcare' and entry['properties'].get("amenity") == entry['properties'].get("healthcare"):
|
||||
continue
|
||||
if tag_knowledge.is_shoplike({key: entry['properties'][key]}) == True:
|
||||
break # handles cases where healthcare is extra tag in addition to proper amenity
|
||||
break # handles cases where healthcare is extra tag in addition to proper amenity
|
||||
if tag_knowledge.is_shoplike({key: entry['properties'][key]}) == False:
|
||||
if key not in notified_about_tag:
|
||||
notified_about_tag[key] = []
|
||||
|
@ -159,6 +166,7 @@ def warn_about_new_tags_that_are_neither_shoplike_nor_ignored(entry, item_path):
|
|||
print()
|
||||
notified_about_tag[key].append(entry['properties'][key])
|
||||
|
||||
|
||||
def all_spider_codes_iterator():
|
||||
directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
|
||||
# TODO: Is there match between spider codes and their filenames?
|
||||
|
@ -180,6 +188,7 @@ def download_entire_atp_dataset(run_id):
|
|||
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
|
||||
os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')
|
||||
|
||||
|
||||
def look_through_entire_atp_dataset():
|
||||
candidates = {}
|
||||
for _area_name, area_data in config.processing_plan().items():
|
||||
|
@ -200,10 +209,10 @@ def look_through_entire_atp_dataset():
|
|||
continue
|
||||
warn_about_new_tags_that_are_neither_shoplike_nor_ignored(entry, item_path)
|
||||
if atp_code in [
|
||||
'hyatt', # https://github.com/alltheplaces/alltheplaces/issues/9399
|
||||
'maserati', # has many actually empty entries
|
||||
'skoda', # limitations of source data, unfixable by ATP
|
||||
'general_logistics_systems_de', # missing data to provide POI data, see https://github.com/alltheplaces/alltheplaces/commit/89f5511bacf24f2d6d0a1c2a183130c9148f772a
|
||||
'hyatt', # https://github.com/alltheplaces/alltheplaces/issues/9399
|
||||
'maserati', # has many actually empty entries
|
||||
'skoda', # limitations of source data, unfixable by ATP
|
||||
'general_logistics_systems_de', # missing data to provide POI data, see https://github.com/alltheplaces/alltheplaces/commit/89f5511bacf24f2d6d0a1c2a183130c9148f772a
|
||||
]:
|
||||
break
|
||||
if config.canonical_feature(entry['properties']) == "?":
|
||||
|
@ -226,11 +235,12 @@ def look_through_entire_atp_dataset():
|
|||
print()
|
||||
print("candidate")
|
||||
print(item_path)
|
||||
if atp_code not in candidates[area_data['country_code']]: # applies when one spider has many main keys
|
||||
if atp_code not in candidates[area_data['country_code']]: # applies when one spider has many main keys
|
||||
candidates[area_data['country_code']].append(atp_code)
|
||||
print(key, "=", entry['properties'][key])
|
||||
print(candidates)
|
||||
|
||||
|
||||
def download(code, run_id):
|
||||
script_location = os.path.abspath(__file__)
|
||||
directory_path = config.cache_folder()
|
||||
|
@ -238,5 +248,6 @@ def download(code, run_id):
|
|||
filename = code + ".atp.geojson"
|
||||
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, directory_path, filename)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
13
3_matcher.py
13
3_matcher.py
|
@ -52,6 +52,7 @@ def is_matching_any_name_part_to_osm_tags(name_part_list, osm_tags):
|
|||
return True
|
||||
return False
|
||||
|
||||
|
||||
def matching_name_part(part, namelike_value):
|
||||
part = part.lower()
|
||||
namelike_value = namelike_value.lower()
|
||||
|
@ -63,6 +64,7 @@ def matching_name_part(part, namelike_value):
|
|||
else:
|
||||
return True
|
||||
|
||||
|
||||
def filter_with_fuzzy_name_match(osm_data, name_part_list):
|
||||
returned = []
|
||||
for osm in osm_data:
|
||||
|
@ -70,6 +72,7 @@ def filter_with_fuzzy_name_match(osm_data, name_part_list):
|
|||
returned.append(osm)
|
||||
return returned
|
||||
|
||||
|
||||
def get_filter_names_from_atp_dataset(current_atp):
|
||||
filter_names = []
|
||||
for atp in current_atp:
|
||||
|
@ -79,12 +82,13 @@ def get_filter_names_from_atp_dataset(current_atp):
|
|||
name_sources.append(short_name)
|
||||
for name in name_sources:
|
||||
for part in name.split():
|
||||
if part.lower() in ["kebab", "kebap", "apteka", "cukiernia", "pizzeria", "na"]: # common and shared - automate detection?
|
||||
if part.lower() in ["kebab", "kebap", "apteka", "cukiernia", "pizzeria", "na"]: # common and shared - automate detection?
|
||||
continue
|
||||
if part not in filter_names:
|
||||
filter_names.append(part)
|
||||
return filter_names
|
||||
|
||||
|
||||
def run_match(osm_data, atp_code):
|
||||
output_file = "build_temporary_files/" + atp_code + '.csv'
|
||||
atp_data = load_atp(atp_code)
|
||||
|
@ -93,11 +97,12 @@ def run_match(osm_data, atp_code):
|
|||
matches = get_matches(osm_data, atp_data)
|
||||
serializing.save_list_of_matches_to_csv(output_file, matches)
|
||||
|
||||
|
||||
def get_matches(osm_data, atp_data):
|
||||
match_list = []
|
||||
filter_names = get_filter_names_from_atp_dataset(atp_data)
|
||||
#TODO: get also misspellings
|
||||
#TODO: handle nearby objects with matching feature type or vacant ones
|
||||
# TODO: get also misspellings
|
||||
# TODO: handle nearby objects with matching feature type or vacant ones
|
||||
filtered_osm = filter_with_fuzzy_name_match(osm_data, filter_names)
|
||||
osm_index = spatial_index.SpatialIndex(filtered_osm)
|
||||
print("filtering", len(osm_data), "to", len(filtered_osm), "candidates based on names is done, now checking", len(atp_data), "ATP candidates by distance")
|
||||
|
@ -145,6 +150,7 @@ def get_matches(osm_data, atp_data):
|
|||
raise
|
||||
return match_list
|
||||
|
||||
|
||||
def passed_filter(osm_data_tag_filter, tags):
|
||||
for key in osm_data_tag_filter.keys():
|
||||
if osm_data_tag_filter[key] == None:
|
||||
|
@ -173,6 +179,7 @@ def load_atp(atp_code):
|
|||
# no need to report also here, so lets fail silently
|
||||
return []
|
||||
|
||||
|
||||
def load_atp_from_json(data, atp_code):
|
||||
returned = []
|
||||
for entry in data['features']:
|
||||
|
|
|
@ -18,6 +18,7 @@ import distance_distribution
|
|||
|
||||
config = __import__("0_config")
|
||||
|
||||
|
||||
def get_free_space_in_mb(path):
    """Return the free disk space, in mebibytes, of the filesystem holding ``path``.

    The value comes from ``shutil.disk_usage`` and is returned as a float
    (bytes divided by 1024 twice).
    """
    free_bytes = shutil.disk_usage(path).free
    return free_bytes / 1024 / 1024
|
||||
|
@ -54,6 +55,7 @@ def generate_report(cache_only):
|
|||
copy_data_for_publication(all_atp_codes)
|
||||
publish_data_on_internet()
|
||||
|
||||
|
||||
def generate_bot_edit_list_page():
|
||||
with open("output/bot_edit_plan_add_tags.html", 'w') as outfile:
|
||||
outfile.write(html_bot_edit_prefix())
|
||||
|
@ -75,6 +77,7 @@ def generate_bot_edit_list_page():
|
|||
outfile.write('<li><a target="_blank" href="' + osm_link + '">' + osm_link + '</a>' + " <code>" + escape_html(key) + "=" + value + "</code></li>\n")
|
||||
outfile.write(html_bot_edit_suffix())
|
||||
|
||||
|
||||
def contact_method():
    """Return the HTML snippet telling readers how to report problems.

    Links open in a new tab (``target="_blank"``). The text is returned
    exactly as stored, including its wording.
    """
    # NOTE(review): another contact_method() is defined further down in this
    # SOURCE; if both live in the same module the later definition replaces
    # this one at import time - confirm which text is meant to be used.
    # NOTE(review): the text contains "misued" and "is publish"; left untouched
    # here because it is user-facing output.
    return (
        'Please <a target="_blank" href="https://codeberg.org/matkoniecz/improving_openstreetmap_using_alltheplaces_dataset/issues">create an issue</a>'
        ' or <a target="_blank" href="https://www.openstreetmap.org/message/new/Mateusz%20Konieczny">send me an OSM private message</a>'
        ' if you see a potential for improvements.'
        ' If potential improvements are in All The Places - better to create PR or issue there.'
        ' If unsure, please write to me.'
        ' If you see this data being misued and causing harm (for example, imported without consulting community or ignoring their feedback)'
        ' - please write to me and I will help with cleanups, including reverts and reconsider how this data is publish.'
    )
|
||||
|
||||
|
@ -109,6 +112,7 @@ def html_bot_edit_prefix():
|
|||
<h2>Edit ideas listing</h2>
|
||||
"""
|
||||
|
||||
|
||||
def html_bot_edit_suffix():
|
||||
return """</section>
|
||||
</body>
|
||||
|
@ -121,12 +125,13 @@ def clear_output_files(folder):
|
|||
if os.path.isfile(file_path):
|
||||
os.remove(file_path)
|
||||
|
||||
|
||||
def produce_map_analysis_for_atp_code(atp_code, cache_only, url_checker_instance, report_generators):
|
||||
csv_filepath = "build_temporary_files/" + atp_code + '.csv'
|
||||
if os.path.isfile(csv_filepath) == False:
|
||||
report_generators[atp_code] = {
|
||||
'atp_file_is_broken': True
|
||||
}
|
||||
}
|
||||
return report_generators
|
||||
match_list = serializing.load_list_of_matches_from_csv(csv_filepath)
|
||||
|
||||
|
@ -142,12 +147,13 @@ def produce_map_analysis_for_atp_code(atp_code, cache_only, url_checker_instance
|
|||
processed = qa.remove_bad_data(entry.atp_tags, atp_code)
|
||||
if processed == None:
|
||||
continue
|
||||
entry.atp_tags = processed # TODO is it happening as data was passed to qa function
|
||||
entry.atp_tags = processed # TODO is it happening as data was passed to qa function
|
||||
rebuild_match_list.append(entry)
|
||||
|
||||
report_generators[atp_code] = produce_map_analysis_for_atp_data(atp_code, area_name="", match_list=rebuild_match_list, cache_only=cache_only, url_checker_instance=url_checker_instance)
|
||||
return report_generators
|
||||
|
||||
|
||||
class MismatchingNameReportCreator:
|
||||
def __init__(self, atp_code, area_name):
|
||||
self.atp_code = atp_code
|
||||
|
@ -227,9 +233,9 @@ class MismatchingNameReportCreator:
|
|||
def table_of_contents(self):
    """Return the overview-table entries for this report (a one-element list).

    The single entry carries the column header, a link built by
    ``section_link`` from the number of completely mismatching names, and
    the list of output files written by this report.
    """
    mismatch_link = section_link(
        'name mismatch between OSM and ATP',
        len(self.completely_mismatching_names),
        self.report_filename(),
    )
    entry = {
        'header': "Name mismatch",
        'section_link': mismatch_link,
        'output_files': [self.report_filename()],
    }
    return [entry]
|
||||
|
||||
|
@ -247,6 +253,7 @@ class MismatchingNameReportCreator:
|
|||
#outfile.write(leafleter.generator.get_line(missing.atp_center['lat'], missing.atp_center['lon'], missing.osm_match_center['lat'], missing.osm_match_center['lon'], color = 'blue'))
|
||||
outfile.write(leafleter.generator.get_html_page_suffix())
|
||||
|
||||
|
||||
class ATPGivesTagsReportCreator:
|
||||
def __init__(self, url_checker_instance, atp_code, area_name):
|
||||
self.atp_code = atp_code
|
||||
|
@ -281,7 +288,7 @@ class ATPGivesTagsReportCreator:
|
|||
writer = csv.writer(outfile)
|
||||
for key in self.importable_keys:
|
||||
if self.count_of_total_tag_mismatches[key] == 0:
|
||||
for entry in self.shops_with_tags_to_be_added: # already Nominatim-filtered
|
||||
for entry in self.shops_with_tags_to_be_added: # already Nominatim-filtered
|
||||
if key in entry['tags_to_be_added']:
|
||||
value = entry['entry'].atp_tags[key]
|
||||
atp_code = entry['entry'].atp_tags['@spider']
|
||||
|
@ -344,7 +351,7 @@ class ATPGivesTagsReportCreator:
|
|||
return False
|
||||
returned = self.url_checker.is_website_eligible(atp, cache_only)
|
||||
if returned == None:
|
||||
return False # not cached, instructed to use only cache
|
||||
return False # not cached, instructed to use only cache
|
||||
if returned:
|
||||
if atp.atp_tags.get(tested_key) != atp.osm_match_tags.get(tested_key):
|
||||
return True
|
||||
|
@ -356,7 +363,7 @@ class ATPGivesTagsReportCreator:
|
|||
# pointing to the main brand page
|
||||
pass
|
||||
elif self.url_checker.is_difference_limited_to_slash_at_end(atp_value, osm_value):
|
||||
pass # effectively the same anyway, no real mismatch
|
||||
pass # effectively the same anyway, no real mismatch
|
||||
else:
|
||||
self.mismatching_website_tags.append(atp)
|
||||
self.report_mismatch(atp, tested_key)
|
||||
|
@ -402,7 +409,7 @@ class ATPGivesTagsReportCreator:
|
|||
new_tags[key] = entry['entry'].atp_tags[key]
|
||||
message += tag_list_to_html(new_tags)
|
||||
outfile.write(leafleter.generator.get_marker(message, entry['entry'].atp_center['lat'], entry['entry'].atp_center['lon'], color='green'))
|
||||
outfile.write(leafleter.generator.get_line(entry['entry'].atp_center['lat'], entry['entry'].atp_center['lon'], entry['entry'].osm_match_center['lat'], entry['entry'].osm_match_center['lon'], color = 'green'))
|
||||
outfile.write(leafleter.generator.get_line(entry['entry'].atp_center['lat'], entry['entry'].atp_center['lon'], entry['entry'].osm_match_center['lat'], entry['entry'].osm_match_center['lon'], color='green'))
|
||||
outfile.write(leafleter.generator.get_html_page_suffix())
|
||||
|
||||
def generate_mismatching_website_listing(self):
|
||||
|
@ -422,9 +429,10 @@ class ATPGivesTagsReportCreator:
|
|||
summary += 'tag list as suggested by ATP (should not be assumed to be directly usable in OSM):<br><br>'
|
||||
summary += tag_list_to_html(bad.atp_tags)
|
||||
outfile.write(leafleter.generator.get_marker(summary, bad.atp_center['lat'], bad.atp_center['lon'], color='red'))
|
||||
outfile.write(leafleter.generator.get_line(bad.atp_center['lat'], bad.atp_center['lon'], bad.osm_match_center['lat'], bad.osm_match_center['lon'], color = 'red'))
|
||||
outfile.write(leafleter.generator.get_line(bad.atp_center['lat'], bad.atp_center['lon'], bad.osm_match_center['lat'], bad.osm_match_center['lon'], color='red'))
|
||||
outfile.write(leafleter.generator.get_html_page_suffix())
|
||||
|
||||
|
||||
class MissingObjectsReportCreator:
|
||||
def __init__(self, atp_code, area_name):
|
||||
self.area_name = area_name
|
||||
|
@ -488,6 +496,7 @@ class MissingObjectsReportCreator:
|
|||
bad_tags_skipped.append(atp)
|
||||
json.dump(serializing.generate_geojson_structure(bad_tags_skipped), f)
|
||||
|
||||
|
||||
class NominatimMismatchReportCreator:
|
||||
def __init__(self, atp_code, area_name):
|
||||
self.area_name = area_name
|
||||
|
@ -546,9 +555,10 @@ class NominatimMismatchReportCreator:
|
|||
summary += tag_list_to_html(missing.atp_tags)
|
||||
outfile.write(leafleter.generator.get_marker(summary, missing.atp_center['lat'], missing.atp_center['lon'], color='red'))
|
||||
location_from_nominatim = nominatim.location_given_tags(missing.atp_tags, debug_identifier=self.atp_code)[0]
|
||||
outfile.write(leafleter.generator.get_line(missing.atp_center['lat'], missing.atp_center['lon'], location_from_nominatim['lat'], location_from_nominatim['lon'], color = 'red'))
|
||||
outfile.write(leafleter.generator.get_line(missing.atp_center['lat'], missing.atp_center['lon'], location_from_nominatim['lat'], location_from_nominatim['lon'], color='red'))
|
||||
outfile.write(leafleter.generator.get_html_page_suffix())
|
||||
|
||||
|
||||
"""
|
||||
class MismatchingNameReportCreator:
|
||||
def __init__(self, atp_code):
|
||||
|
@ -577,6 +587,8 @@ add to test_display_website
|
|||
"""
|
||||
# TODO: passing atp_code should not be needed
|
||||
# TODO: save files one level higher, here just produce analysis
|
||||
|
||||
|
||||
def produce_map_analysis_for_atp_data(atp_code, area_name, match_list, cache_only, url_checker_instance):
|
||||
missing_objects_report = MissingObjectsReportCreator(atp_code, area_name)
|
||||
mismatching_name_report = MismatchingNameReportCreator(atp_code, area_name)
|
||||
|
@ -613,7 +625,7 @@ def produce_map_analysis_for_atp_data(atp_code, area_name, match_list, cache_onl
|
|||
conflict_between_atp_and_nominatim_report.register_case_using_known_nominatim_status(atp, status)
|
||||
elif atp.match_distance == None or atp.match_distance > config.missing_shop_distance_in_kilometers_for_specific_case(atp.atp_tags):
|
||||
nominatim_match = nominatim.is_location_matching_tags(atp.atp_tags, atp.atp_center, cache_only=cache_only, spider=atp_code)
|
||||
if nominatim_match != False: # both matches, failed geolocation and geolocation not done at all go here
|
||||
if nominatim_match != False: # both matches, failed geolocation and geolocation not done at all go here
|
||||
missing_objects_report.check_case(atp)
|
||||
conflict_between_atp_and_nominatim_report.register_case_using_known_nominatim_status(atp, nominatim_match)
|
||||
else:
|
||||
|
@ -635,7 +647,8 @@ def produce_map_analysis_for_atp_data(atp_code, area_name, match_list, cache_onl
|
|||
'missing_objects_report': missing_objects_report,
|
||||
'conflict_between_atp_and_nominatim_report': conflict_between_atp_and_nominatim_report,
|
||||
'total_atp_entries': len(match_list),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def format_for_geojson_export(dataset):
|
||||
for entry in dataset:
|
||||
|
@ -644,6 +657,7 @@ def format_for_geojson_export(dataset):
|
|||
del atp.atp_tags[key]
|
||||
return dataset
|
||||
|
||||
|
||||
def get_center(dataset):
|
||||
max_lat = -90
|
||||
max_lon = -180
|
||||
|
@ -671,6 +685,7 @@ def sidebar_content(page_specific_info, atp_code):
|
|||
sidebar += '<br><br>\n<a href="https://github.com/alltheplaces/alltheplaces/blob/master/locations/spiders/' + atp_code + '.py" target="_blank">atp source code</a>'
|
||||
return sidebar
|
||||
|
||||
|
||||
def tag_list_to_html(tags):
|
||||
returned = ""
|
||||
normal_tags = ""
|
||||
|
@ -691,21 +706,26 @@ def tag_list_to_html(tags):
|
|||
returned += "<br><br>tags present in ATP, very likely not usable directly in OSM<br>" + dropped_tags
|
||||
return returned
|
||||
|
||||
|
||||
def htmlify_key_value_pair(key, value):
    """Render one tag as ``key = value<br>``; the value is formatted by ``htmlify_value``.

    The key is inserted as-is (it is not escaped here).
    """
    rendered_value = htmlify_value(key, value)
    return "".join([key, " = ", rendered_value, "<br>"])
|
||||
|
||||
|
||||
def htmlify_value(key, value):
    """Return ``value`` escaped for HTML, wrapped in a link where appropriate.

    ``website`` values are always linked; ``image`` values are linked only
    when the escaped value starts with ``http``. Everything else is returned
    as escaped text.
    """
    escaped = escape_html(value)
    is_link = key == "website" or (key == "image" and escaped.startswith("http"))
    if not is_link:
        return escaped
    return '<a href="' + escaped + '">' + escaped + "</a>"
|
||||
|
||||
|
||||
def escape_url(value):
    """Return ``str(value)`` with double and single quotes percent-encoded.

    Only ``"`` (to ``%22``) and ``'`` (to ``%27``) are replaced; no other
    character is touched.
    """
    quote_escapes = str.maketrans({'"': '%22', "'": '%27'})
    return str(value).translate(quote_escapes)
|
||||
|
||||
|
||||
def escape_html(value):
    """Escape ``value`` for HTML and turn line breaks into ``<br>``.

    ``html.escape`` handles ``&``, ``<``, ``>`` and both quote characters;
    afterwards every ``\\r\\n`` and every ``\\n`` becomes ``<br>``. A lone
    ``\\r`` is left unchanged.
    """
    escaped = html.escape(value)
    # Collapse Windows line endings first so each break yields exactly one <br>.
    lines = escaped.replace("\r\n", "\n").split("\n")
    return "<br>".join(lines)
|
||||
|
||||
|
||||
def headers():
|
||||
# TODO: pass it smarter in config (list of main report creators?)
|
||||
# or at least make it static method
|
||||
|
@ -718,6 +738,7 @@ def headers():
|
|||
NominatimMismatchReportCreator('dummy', 'dummy area name').table_of_contents()[0]['header'],
|
||||
]
|
||||
|
||||
|
||||
def generate_website_index_listing_by_country(report_generators, released_codes_by_region, partial=False):
|
||||
with open("output/index.html", 'w') as outfile:
|
||||
outfile.write(html_prefix())
|
||||
|
@ -732,6 +753,7 @@ def generate_website_index_listing_by_country(report_generators, released_codes_
|
|||
outfile.write(table_with_spider_overview(atp_codes, report_generators, partial))
|
||||
outfile.write(html_suffix())
|
||||
|
||||
|
||||
def generate_website_index_for_named_area(report_generators, area_name, partial=False):
|
||||
with open("output/" + area_name + "_index.html", 'w') as outfile:
|
||||
outfile.write(html_prefix())
|
||||
|
@ -739,6 +761,7 @@ def generate_website_index_for_named_area(report_generators, area_name, partial=
|
|||
outfile.write(table_with_spider_overview(report_generators.keys(), report_generators, partial))
|
||||
outfile.write(html_suffix())
|
||||
|
||||
|
||||
def table_with_spider_overview(atp_codes, report_generators, partial):
|
||||
returned = ""
|
||||
returned += '<table class="statistics-summary"><thead><tr><th>' + '</th><th>'.join(headers()) + '</th></tr></thead>\n'
|
||||
|
@ -756,8 +779,9 @@ def table_with_spider_overview(atp_codes, report_generators, partial):
|
|||
returned += "no entries shown in this area\n"
|
||||
return returned
|
||||
|
||||
|
||||
def table_row(atp_code, statistics):
|
||||
if statistics['missing_objects_report'] == None: #TODO test is it working
|
||||
if statistics['missing_objects_report'] == None: # TODO test is it working
|
||||
return '<tr><th></th><td colspan="5">Data missing</td></tr>'
|
||||
|
||||
missing_section = statistics['missing_objects_report'].table_of_contents()[0]['section_link']
|
||||
|
@ -772,11 +796,13 @@ def table_row(atp_code, statistics):
|
|||
|
||||
return '<tr><th>' + atp_code + '</th><td>' + missing_section + '</td><td>' + mismatching_names_section + '</td><td>' + tags_section + '</td><td>' + website_mismatch_section + '</td><td>' + mismatch_section + not_attempted + '</td></tr>'
|
||||
|
||||
|
||||
def section_link(description, count, page):
    """Return the HTML cell content showing ``count`` for one report section.

    A zero count is rendered as a de-emphasised ``<span>`` with no link; any
    other count becomes a link to ``page``. ``description`` is used as the
    tooltip in both cases. Neither ``description`` nor ``page`` is escaped.
    """
    shown_count = str(count)
    tooltip = ' title="' + description + '">'
    if count != 0:
        return '<a href="' + page + '"' + tooltip + shown_count + '</a>'
    return '<span class=less-visible' + tooltip + shown_count + '</span>'
|
||||
|
||||
|
||||
def contact_method():
    """Return the HTML snippet telling readers how to report problems.

    Points at the project issue tracker, the author's OSM message form and
    the All The Places repository.
    """
    return (
        'Please <a href="https://codeberg.org/matkoniecz/list_how_openstreetmap_can_be_improved_with_alltheplaces_data/issues">create an issue</a>'
        ' or <a href="https://www.openstreetmap.org/message/new/Mateusz%20Konieczny">send me an OSM private message</a>'
        ' if you see a potential for improvements.'
        ' If potential improvements are in All The Places - better to create PR or issue'
        ' <a href="https://github.com/alltheplaces/alltheplaces">there</a>.'
        ' If unsure, please write to me.'
    )
|
||||
|
||||
|
@ -819,11 +845,13 @@ def html_prefix():
|
|||
</p>
|
||||
"""
|
||||
|
||||
|
||||
def html_suffix():
    """Return the closing HTML of a generated page, including a generation timestamp.

    The timestamp is rendered as ``YYYY-MM-DD HH:MM:SS+hhmm`` in the local
    timezone. The ``%z`` directive only produces an offset for timezone-aware
    datetimes, so the current time is made aware with ``astimezone()``; with
    a naive ``datetime.now()`` the offset was silently rendered as an empty
    string.
    """
    generated_at = datetime.datetime.now().astimezone()
    return """<hr><br>Published on <a href="https://matkoniecz.codeberg.page/improving_openstreetmap_using_alltheplaces_dataset/">https://matkoniecz.codeberg.page/improving_openstreetmap_using_alltheplaces_dataset/</a> - generated on """ + f'{generated_at:%Y-%m-%d %H:%M:%S%z}' + """ (note that ATP and OSM data used here may be older) </section>
</body>
</html>"""
|
||||
|
||||
|
||||
def iterate_over_output_files(atp_code):
|
||||
reports = [
|
||||
MissingObjectsReportCreator(atp_code, 'dummy area name'),
|
||||
|
@ -836,6 +864,7 @@ def iterate_over_output_files(atp_code):
|
|||
for file in entry['output_files']:
|
||||
yield file
|
||||
|
||||
|
||||
def copy_data_for_publication(all_atp_codes):
|
||||
for atp_code in all_atp_codes:
|
||||
if get_free_space_in_mb('../public_website_with_output') < 400:
|
||||
|
@ -850,8 +879,10 @@ def copy_data_for_publication(all_atp_codes):
|
|||
os.system("cp output/index.html ../public_website_with_output/index.html")
|
||||
# published on https://matkoniecz.codeberg.page/improving_openstreetmap_using_alltheplaces_dataset/
|
||||
|
||||
|
||||
def publish_data_on_internet():
    """Commit and push everything in ``../public_website_with_output``.

    Runs one shell command line chained with ``&&``, so each step runs only
    if the previous one succeeded. The exit status of ``os.system`` is not
    inspected.
    """
    steps = [
        'cd ../public_website_with_output',
        'git add .',
        'git commit -m "automatic update"',
        'git push',
    ]
    os.system(' && '.join(steps))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -10,20 +10,25 @@ import shared
|
|||
obtain_atp_data = __import__("2_obtain_atp_data")
|
||||
config = __import__("0_config")
|
||||
|
||||
|
||||
def is_in_this_area(area, atp):
    """Tell whether ``atp.atp_center`` lies strictly inside the bounding box ``area``.

    ``area`` provides ``min_lat``/``max_lat``/``min_lon``/``max_lon``; points
    exactly on the border count as outside. Longitude is only looked at when
    the latitude already fits.
    """
    latitude = atp.atp_center['lat']
    if not (area['min_lat'] < latitude < area['max_lat']):
        return False
    longitude = atp.atp_center['lon']
    return area['min_lon'] < longitude < area['max_lon']
|
||||
|
||||
|
||||
def areas():
    """Return the named bounding boxes handled by this script.

    Each value is a dict with ``min_lat``, ``min_lon``, ``max_lat`` and
    ``max_lon`` keys, in that order.
    """
    def bounding_box(south, west, north, east):
        return {'min_lat': south, 'min_lon': west, 'max_lat': north, 'max_lon': east}

    return {
        'kraków': bounding_box(50, 19.5, 50.3, 20.5),
        # http://bboxfinder.com/#52.383301,16.885986,52.436182,17.044859
        'poznań': bounding_box(52.383301, 16.885986, 52.436182, 17.044859),
    }
|
||||
|
||||
|
||||
general_statistics = {}
|
||||
|
||||
|
||||
def main():
|
||||
for atp_code, _item_path in obtain_atp_data.all_spider_codes_iterator():
|
||||
print(atp_code)
|
||||
|
@ -55,6 +60,7 @@ def save_files(data, name):
|
|||
with open(name + '_missing.kml', 'w') as f:
|
||||
f.write(serializing.generate_kml_text(data))
|
||||
|
||||
|
||||
def clear_output_files(folder):
|
||||
for filename in os.listdir(folder):
|
||||
file_path = os.path.join(folder, filename)
|
||||
|
@ -71,4 +77,5 @@ def generate_missing_shop_listing(atp_code, apparently_missing_shops):
|
|||
osm_location_link = shared.link_to_point_in_osm(missing.atp_center['lat'], missing.atp_center['lon'])
|
||||
summary = 'here ATP shows object being present, which seems not mapped in OpenStreetMap (<a href="' + osm_location_link + '">location</a>):<br><br>'
|
||||
|
||||
|
||||
main()
|
||||
|
|
|
@ -23,11 +23,13 @@ def graticule_id(lat, lon, lat_span, lon_span, margin_in_kilometers):
|
|||
# filter data for each
|
||||
# filter data for each in constant time (just check is given location within graticule range)
|
||||
|
||||
|
||||
def main():
    """Run the three experimental graticule steps one after another.

    Order: the margin sanity check, the coverage map generation, then the
    test area run.
    """
    steps = (
        check_is_any_graticule_having_margin_greater_than_entire_graticule,
        generate_test_graticule_coverage_map,
        test_area_run,
    )
    for step in steps:
        step()
|
||||
|
||||
|
||||
def generate_test_graticule_coverage_map():
|
||||
graticule_anchor_coverage = {'min_lat': 49, 'min_lon': 14, 'max_lat': 54, 'max_lon': 24}
|
||||
with open("test_coverage_graticule_display.html", 'w') as outfile:
|
||||
|
@ -37,9 +39,10 @@ def generate_test_graticule_coverage_map():
|
|||
for lat_anchor in range(graticule_anchor_coverage['min_lat'], graticule_anchor_coverage['max_lat'] + 1):
|
||||
for lon_anchor in range(graticule_anchor_coverage['min_lon'], graticule_anchor_coverage['max_lon'] + 1):
|
||||
shape = [[lat_anchor + 1, lon_anchor + 1], [lat_anchor + 1, lon_anchor], [lat_anchor, lon_anchor], [lat_anchor, lon_anchor + 1], [lat_anchor + 1, lon_anchor + 1]]
|
||||
outfile.write(leafleter.generator.get_polygon(shape, color = "green", fill_color = "green", link = "https://pl.wikipedia.org/wiki/Pozna%C5%84"))
|
||||
outfile.write(leafleter.generator.get_polygon(shape, color="green", fill_color="green", link="https://pl.wikipedia.org/wiki/Pozna%C5%84"))
|
||||
outfile.write(leafleter.generator.get_html_page_suffix())
|
||||
|
||||
|
||||
def test_area_run():
|
||||
# http://bboxfinder.com/#52.383301,16.885986,52.436182,17.044859
|
||||
poznań = {'min_lat': 52.383301, 'min_lon': 16.885986, 'max_lat': 52.436182, 'max_lon': 17.044859, 'name': 'Poznań'}
|
||||
|
@ -57,7 +60,7 @@ def test_area_run():
|
|||
outfile.write(leafleter.generator.get_html_page_prefix("website title", (area['max_lat'] + area['min_lat'])/2, (area['max_lon'] + area['min_lon'])/2))
|
||||
#outfile.write(leafleter.generator.get_marker("text", 50.06, 19.93))
|
||||
shape = [[area['max_lat'], area['max_lon']], [area['max_lat'], area['min_lon']], [area['min_lat'], area['min_lon']], [area['min_lat'], area['max_lon']], [area['max_lat'], area['max_lon']]]
|
||||
outfile.write(leafleter.generator.get_polygon(shape, color = "green", fill_color = "green", link = "https://pl.wikipedia.org/wiki/Pozna%C5%84"))
|
||||
outfile.write(leafleter.generator.get_polygon(shape, color="green", fill_color="green", link="https://pl.wikipedia.org/wiki/Pozna%C5%84"))
|
||||
outfile.write(leafleter.generator.get_html_page_suffix())
|
||||
|
||||
atp_data_by_spider = {}
|
||||
|
@ -72,13 +75,12 @@ def test_area_run():
|
|||
if len(gathered) > 0:
|
||||
atp_data_by_spider[atp_code] = gathered
|
||||
|
||||
|
||||
general_area = "europe/poland"
|
||||
osm_data = []
|
||||
for entry in matcher.load_geofabrik(general_area, config.cache_folder()):
|
||||
if entry['center']['lat'] > area['min_lat'] and entry['center']['lat'] < area['max_lat']:
|
||||
if entry['center']['lon'] > area['min_lon'] and entry['center']['lon'] < area['max_lon']:
|
||||
osm_data.append(entry)
|
||||
if entry['center']['lat'] > area['min_lat'] and entry['center']['lat'] < area['max_lat']:
|
||||
if entry['center']['lon'] > area['min_lon'] and entry['center']['lon'] < area['max_lon']:
|
||||
osm_data.append(entry)
|
||||
|
||||
print(len(atp_data_by_spider))
|
||||
print(len(osm_data))
|
||||
|
@ -100,6 +102,7 @@ def test_area_run():
|
|||
|
||||
print(output_file)
|
||||
|
||||
|
||||
def check_is_any_graticule_having_margin_greater_than_entire_graticule():
|
||||
for lat in range(-89, 89):
|
||||
for lon in range(-180, 180):
|
||||
|
@ -110,17 +113,17 @@ def check_is_any_graticule_having_margin_greater_than_entire_graticule():
|
|||
distance_for_lat_degree_alt = shared.calculate_distance(
|
||||
{'lat': tested_location['lat'] + 1, 'lon': tested_location['lon'] + 1},
|
||||
{'lat': tested_location['lat'] + 2, 'lon': tested_location['lon'] + 1}
|
||||
)
|
||||
)
|
||||
distance_for_lat_degree_alt_alt = shared.calculate_distance(
|
||||
{'lat': tested_location['lat'] + 1, 'lon': tested_location['lon']},
|
||||
{'lat': tested_location['lat'] + 2, 'lon': tested_location['lon']}
|
||||
)
|
||||
)
|
||||
print("expected zero, maybe espilon changes", distance_for_lat_degree_alt - distance_for_lat_degree)
|
||||
print("expected zero, maybe espilon changes", distance_for_lat_degree_alt_alt - distance_for_lat_degree)
|
||||
distance_for_lon_degree_alt = shared.calculate_distance(
|
||||
{'lat': tested_location['lat'] + 1, 'lon': tested_location['lon'] + 1},
|
||||
{'lat': tested_location['lat'] + 1, 'lon': tested_location['lon'] + 1}
|
||||
)
|
||||
)
|
||||
print("expected meaningful changes", distance_for_lon_degree_alt - distance_for_lon_degree)
|
||||
|
||||
margin_in_kilometers = config.maximum_missing_shop_distance_in_kilometers()
|
||||
|
@ -133,4 +136,5 @@ def check_is_any_graticule_having_margin_greater_than_entire_graticule():
|
|||
raise
|
||||
break
|
||||
|
||||
|
||||
main()
|
||||
|
|
|
@ -33,7 +33,7 @@ def main():
|
|||
except FileNotFoundError as e:
|
||||
print(e)
|
||||
pass
|
||||
#TODO skip freeform/valid ones
|
||||
# TODO skip freeform/valid ones
|
||||
for key, values in used_tags.items():
|
||||
if tag_knowledge.is_freeform_key(key):
|
||||
print(key, "=", "*")
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import matplotlib.pyplot as plt
|
||||
import os
|
||||
|
||||
|
||||
class MatchDistanceDestributionReportCreator:
|
||||
def __init__(self, identifier, area_name):
|
||||
self.identifier = identifier
|
||||
|
@ -29,7 +30,7 @@ class MatchDistanceDestributionReportCreator:
|
|||
plt.rcParams["figure.figsize"] = [10, 10]
|
||||
# https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
|
||||
# see 02 file for more investigation
|
||||
plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice
|
||||
plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice
|
||||
plt.grid(True)
|
||||
plt.clf()
|
||||
plt.xlim(0, 1200)
|
||||
|
@ -39,7 +40,7 @@ class MatchDistanceDestributionReportCreator:
|
|||
plt.rcParams["figure.figsize"] = [10, 10]
|
||||
# https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
|
||||
# see 02 file for more investigation
|
||||
plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice
|
||||
plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice
|
||||
plt.grid(True)
|
||||
plt.clf()
|
||||
plt.xlim(0, 300)
|
||||
|
|
|
@ -4,6 +4,7 @@ import datetime
|
|||
import time
|
||||
config = __import__("0_config")
|
||||
|
||||
|
||||
def scan_eligible(grab_bag, scanner):
|
||||
while True:
|
||||
any_scanned = False
|
||||
|
@ -18,6 +19,7 @@ def scan_eligible(grab_bag, scanner):
|
|||
if any_scanned == False:
|
||||
return
|
||||
|
||||
|
||||
def main():
|
||||
wait_between_the_same_domain_minutes = 5
|
||||
grab_bag = {}
|
||||
|
@ -38,5 +40,6 @@ def main():
|
|||
scan_eligible(grab_bag, scanner)
|
||||
time.sleep(10)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
35
nominatim.py
35
nominatim.py
|
@ -6,24 +6,27 @@ import re
|
|||
import shutil
|
||||
config = __import__("0_config")
|
||||
|
||||
|
||||
def cache_path():
|
||||
return 'nominatim_cache'
|
||||
|
||||
|
||||
# Initialize disk cache
|
||||
nominatim_cache = diskcache.Cache(cache_path())
|
||||
|
||||
|
||||
def drop_extra_detail_blocking_nominatim(value):
|
||||
# patch nominatim bug where inclusion of apartment code breaks search
|
||||
# https://github.com/osm-search/Nominatim/issues/145#issuecomment-2143549199
|
||||
# see https://pythex.org/ for testing
|
||||
value = re.sub(r'/\d+([a-zA-Z])?', '', value) # turns 178/12 into 178
|
||||
value = re.sub(r'(,|, |)lok\..*', '', value, flags=re.IGNORECASE) # "lokal" is Polish for "unit"
|
||||
value = re.sub(r'(,|, |)LOK .*', '', value) # "lokal" is Polish for "unit"
|
||||
value = re.sub(r'(,|, |)lokal .*', '', value) # "lokal" is Polish for "unit"
|
||||
value = re.sub(r'(,|, |)lok .*', '', value) # "lokal" is Polish for "unit"
|
||||
value = re.sub(r'(,|, |)lok.*', '', value) # "lokal" is Polish for "unit"
|
||||
value = re.sub(r'(,|, |)LU.*', '', value) # "lokal użytkowy" is Polish legalese for "unit"
|
||||
value = re.sub(r'(,|, |)Lu.*', '', value) # "lokal użytkowy" is Polish legalese for "unit"
|
||||
value = re.sub(r'/\d+([a-zA-Z])?', '', value) # turns 178/12 into 178
|
||||
value = re.sub(r'(,|, |)lok\..*', '', value, flags=re.IGNORECASE) # "lokal" is Polish for "unit"
|
||||
value = re.sub(r'(,|, |)LOK .*', '', value) # "lokal" is Polish for "unit"
|
||||
value = re.sub(r'(,|, |)lokal .*', '', value) # "lokal" is Polish for "unit"
|
||||
value = re.sub(r'(,|, |)lok .*', '', value) # "lokal" is Polish for "unit"
|
||||
value = re.sub(r'(,|, |)lok.*', '', value) # "lokal" is Polish for "unit"
|
||||
value = re.sub(r'(,|, |)LU.*', '', value) # "lokal użytkowy" is Polish legalese for "unit"
|
||||
value = re.sub(r'(,|, |)Lu.*', '', value) # "lokal użytkowy" is Polish legalese for "unit"
|
||||
|
||||
value = re.sub(r'(,|, |)suite .*', '', value, flags=re.IGNORECASE)
|
||||
|
||||
|
@ -34,6 +37,7 @@ def drop_extra_detail_blocking_nominatim(value):
|
|||
value = re.sub(r'(,|, |)unit .*', '', value, flags=re.IGNORECASE)
|
||||
return value
|
||||
|
||||
|
||||
def nominatim_queries(tags, debug=False):
|
||||
address_tag_groups = [
|
||||
['addr:country', 'addr:city', 'addr:street', 'addr:housenumber'],
|
||||
|
@ -67,7 +71,7 @@ def nominatim_queries(tags, debug=False):
|
|||
if key in ["addr:street_address", 'addr:street', 'addr:full']:
|
||||
# see https://github.com/osm-search/Nominatim/issues/87
|
||||
value = re.sub(r'ul\. ?', '', value, flags=re.IGNORECASE)
|
||||
value = re.sub(r'( |$)ul ', ' ', value, flags=re.IGNORECASE) # "ul Żabia"
|
||||
value = re.sub(r'( |$)ul ', ' ', value, flags=re.IGNORECASE) # "ul Żabia"
|
||||
if key in ["addr:street_address", 'addr:full']:
|
||||
value = drop_extra_detail_blocking_nominatim(value)
|
||||
query += value
|
||||
|
@ -76,6 +80,7 @@ def nominatim_queries(tags, debug=False):
|
|||
print(group)
|
||||
yield query
|
||||
|
||||
|
||||
def location_given_tags_cache_only(tags):
|
||||
"""
|
||||
True: matches
|
||||
|
@ -87,21 +92,22 @@ def location_given_tags_cache_only(tags):
|
|||
if query not in nominatim_cache:
|
||||
with open(config.nominatim_requests_missing_from_cache(), 'a') as outfile:
|
||||
outfile.write(query+"\n")
|
||||
return -1 # maybe transformed query would give better result?
|
||||
# should not check further ones
|
||||
return -1 # maybe transformed query would give better result?
|
||||
# should not check further ones
|
||||
else:
|
||||
response = nominatim_cache[query]
|
||||
if len(response) >= 1:
|
||||
return response
|
||||
return None
|
||||
|
||||
|
||||
def location_given_tags(tags, debug_identifier):
|
||||
for query in nominatim_queries(tags):
|
||||
response = query_nominatim(query)
|
||||
if len(response) >= 1:
|
||||
return response
|
||||
|
||||
atp_code = debug_identifier # TODO handle this
|
||||
atp_code = debug_identifier # TODO handle this
|
||||
if config.is_failed_geocoding_worth_mentioning(atp_code):
|
||||
print()
|
||||
print()
|
||||
|
@ -119,6 +125,7 @@ def location_given_tags(tags, debug_identifier):
|
|||
print()
|
||||
return None
|
||||
|
||||
|
||||
def is_location_matching_tags(tags, center, spider, cache_only=False):
|
||||
"""
|
||||
True: matches
|
||||
|
@ -135,6 +142,7 @@ def is_location_matching_tags(tags, center, spider, cache_only=False):
|
|||
return response
|
||||
return are_locations_matching(tags, response[0], center)
|
||||
|
||||
|
||||
def are_locations_matching(tags, location, center):
|
||||
distance = shared.calculate_distance(center, location)
|
||||
if distance > config.missing_shop_distance_in_kilometers_for_specific_case(tags):
|
||||
|
@ -142,10 +150,12 @@ def are_locations_matching(tags, location, center):
|
|||
else:
|
||||
return True
|
||||
|
||||
|
||||
def get_free_space_in_mb(path):
|
||||
total, used, free = shutil.disk_usage(path)
|
||||
return free / 1024 / 1024
|
||||
|
||||
|
||||
def query_nominatim(query):
|
||||
# Check if the response is in the cache
|
||||
if query in nominatim_cache:
|
||||
|
@ -221,6 +231,7 @@ def query_nominatim(query):
|
|||
else:
|
||||
response.raise_for_status()
|
||||
|
||||
|
||||
# Example usage
|
||||
# gptchat generated
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
import nominatim
|
||||
config = __import__("0_config")
|
||||
|
||||
|
||||
def main():
|
||||
with open(config.nominatim_requests_missing_from_cache()) as fp:
|
||||
for query in fp:
|
||||
nominatim.query_nominatim(query.strip())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
36
qa.py
36
qa.py
|
@ -1,8 +1,9 @@
|
|||
config = __import__("0_config")
|
||||
import shops
|
||||
import rich
|
||||
import phonenumbers
|
||||
import datetime
|
||||
import phonenumbers
|
||||
import rich
|
||||
import shops
|
||||
config = __import__("0_config")
|
||||
|
||||
|
||||
def remove_bad_data(data, atp_code):
|
||||
"""
|
||||
|
@ -94,6 +95,7 @@ def remove_bad_data(data, atp_code):
|
|||
del data[key]
|
||||
return data
|
||||
|
||||
|
||||
def is_empty_value(key, value, atp_code):
|
||||
if value.lower() in ["undefined", "b/n", "---", "none", "n/a"]:
|
||||
if config.is_null_specified_as_text_worth_mentioning(atp_code):
|
||||
|
@ -114,11 +116,12 @@ def is_empty_value(key, value, atp_code):
|
|||
return True
|
||||
return False
|
||||
|
||||
|
||||
def handle_ref_tag(data, atp_code):
|
||||
if atp_code in ['paczkomat_inpost_pl', 'allegro_one_box_pl']:
|
||||
return data # actual ref
|
||||
return data # actual ref
|
||||
elif atp_code in ['credit_agricole_pl']:
|
||||
del data["ref"] # synthethic ref created by ATP
|
||||
del data["ref"] # synthethic ref created by ATP
|
||||
elif "ref" in data:
|
||||
# https://github.com/alltheplaces/alltheplaces/blob/master/DATA_FORMAT.md describe `ref` and I am a bit confused
|
||||
# > A unique identifier for this feature inside this spider. The code that generates the output will remove duplicates based on the value of this key.
|
||||
|
@ -137,6 +140,7 @@ def handle_ref_tag(data, atp_code):
|
|||
del data["ref"]
|
||||
return data
|
||||
|
||||
|
||||
def remove_bad_phone_data(data, atp_code):
|
||||
if 'phone' in data:
|
||||
if data['phone'].replace(" ", "").startswith("+443"):
|
||||
|
@ -157,6 +161,7 @@ def remove_bad_phone_data(data, atp_code):
|
|||
del data['phone']
|
||||
return data
|
||||
|
||||
|
||||
def is_valid_phone_tag(phone_tag):
|
||||
if ";" not in phone_tag:
|
||||
return is_valid_phone_number(phone_tag)
|
||||
|
@ -166,13 +171,14 @@ def is_valid_phone_tag(phone_tag):
|
|||
return False
|
||||
return True
|
||||
|
||||
|
||||
def is_valid_phone_number(phone):
|
||||
if phone in [
|
||||
'+4800000000', # https://github.com/alltheplaces/alltheplaces/issues/8633
|
||||
]:
|
||||
return False
|
||||
try:
|
||||
parsed = phonenumbers.parse(phone, None)
|
||||
return phonenumbers.is_valid_number(parsed)
|
||||
except phonenumbers.phonenumberutil.NumberParseException:
|
||||
return False
|
||||
if phone in [
|
||||
'+4800000000', # https://github.com/alltheplaces/alltheplaces/issues/8633
|
||||
]:
|
||||
return False
|
||||
try:
|
||||
parsed = phonenumbers.parse(phone, None)
|
||||
return phonenumbers.is_valid_number(parsed)
|
||||
except phonenumbers.phonenumberutil.NumberParseException:
|
||||
return False
|
||||
|
|
2
run.py
2
run.py
|
@ -5,6 +5,7 @@ obtain_atp_data = __import__("2_obtain_atp_data")
|
|||
matcher = __import__("3_matcher")
|
||||
show_data = __import__("4_show_data")
|
||||
|
||||
|
||||
def main():
|
||||
# TODO: test dependencies on fresh OS
|
||||
# see readme for instructions how to install dependencies
|
||||
|
@ -27,6 +28,7 @@ def main():
|
|||
# maps listing various missing data - shops, tags, and of various wrong data (shop in OSM not in ATP and so on)
|
||||
show_data.main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ import base64
|
|||
import json
|
||||
import csv
|
||||
|
||||
|
||||
class Match:
|
||||
def __init__(self, atp_center, atp_tags, osm_match_center, osm_match_tags, osm_link, match_distance, all_very_good_matches):
|
||||
self.atp_center = atp_center
|
||||
|
@ -11,16 +12,18 @@ class Match:
|
|||
self.osm_link = osm_link
|
||||
self.match_distance = match_distance
|
||||
self.all_very_good_matches = all_very_good_matches
|
||||
|
||||
def __str__(self):
|
||||
return "Match(" + str(self.atp_center) + ',' + str(self.atp_tags) + ',' + str(self.osm_match_center) + ',' + str(self.osm_match_tags) + ',' + str(self.osm_link) + ',' + str(self.match_distance) + ',' + str(self.all_very_good_matches) + ")"
|
||||
|
||||
|
||||
def save_list_of_matches_to_csv(filepath, data):
|
||||
with open(filepath, 'w', newline='') as f:
|
||||
writer = csv.writer(f)
|
||||
writer.writerow(['atp_lat', 'atp_lon', 'atp_tags_dict_in_base64', 'osm_lat', 'osm_lon', 'osm_tags_dict_in_base64', 'osm_link', 'match_distance', 'all_very_good_matches'])
|
||||
for entry in data:
|
||||
if entry.match_distance == None:
|
||||
writer.writerow([entry.atp_center['lat'], entry.atp_center['lon'], encode_to_base64_via_json(entry.atp_tags),"","","","","", ""])
|
||||
writer.writerow([entry.atp_center['lat'], entry.atp_center['lon'], encode_to_base64_via_json(entry.atp_tags), "", "", "", "", "", ""])
|
||||
else:
|
||||
writer.writerow([
|
||||
entry.atp_center['lat'],
|
||||
|
@ -32,7 +35,8 @@ def save_list_of_matches_to_csv(filepath, data):
|
|||
entry.osm_link,
|
||||
entry.match_distance,
|
||||
encode_to_base64_via_json(entry.all_very_good_matches)
|
||||
])
|
||||
])
|
||||
|
||||
|
||||
def load_list_of_matches_from_csv(filepath):
|
||||
try:
|
||||
|
@ -52,7 +56,7 @@ def load_list_of_matches_from_csv(filepath):
|
|||
osm_match_center = {'lat': float(row[3]), 'lon': float(row[4])}
|
||||
osm_match_tags = decode_from_base64_via_json(row[5])
|
||||
for key, value in osm_match_tags.items():
|
||||
osm_match_tags[key] = str(value) # TODO - review saving code, this should not be needed
|
||||
osm_match_tags[key] = str(value) # TODO - review saving code, this should not be needed
|
||||
osm_link = row[6]
|
||||
match_distance = float(row[7])
|
||||
all_very_good_matches = decode_from_base64_via_json(row[8])
|
||||
|
@ -62,7 +66,9 @@ def load_list_of_matches_from_csv(filepath):
|
|||
print(filepath)
|
||||
raise
|
||||
|
||||
#gptchat generated
|
||||
# gptchat generated
|
||||
|
||||
|
||||
def encode_to_base64_via_json(input_dict):
|
||||
# Convert the dictionary to a JSON string
|
||||
json_str = json.dumps(input_dict)
|
||||
|
@ -74,7 +80,9 @@ def encode_to_base64_via_json(input_dict):
|
|||
base64_str = base64_bytes.decode('utf-8')
|
||||
return base64_str
|
||||
|
||||
#gptchat generated
|
||||
# gptchat generated
|
||||
|
||||
|
||||
def decode_from_base64_via_json(base64_str):
|
||||
# Decode the Base64 string to bytes
|
||||
base64_bytes = base64_str.encode('utf-8')
|
||||
|
@ -86,18 +94,20 @@ def decode_from_base64_via_json(base64_str):
|
|||
output_dict = json.loads(json_str)
|
||||
return output_dict
|
||||
|
||||
|
||||
def generate_geojson_structure(dataset):
|
||||
geojson_data = {"type": "FeatureCollection","features": []}
|
||||
geojson_data = {"type": "FeatureCollection", "features": []}
|
||||
for atp in dataset:
|
||||
geojson_data['features'].append({"type": "Feature",
|
||||
"geometry": {
|
||||
"type": "Point",
|
||||
"coordinates": [atp.atp_center['lon'], atp.atp_center['lat']]
|
||||
},
|
||||
"properties": atp.atp_tags
|
||||
})
|
||||
"geometry": {
|
||||
"type": "Point",
|
||||
"coordinates": [atp.atp_center['lon'], atp.atp_center['lat']]
|
||||
},
|
||||
"properties": atp.atp_tags
|
||||
})
|
||||
return geojson_data
|
||||
|
||||
|
||||
def generate_kml_text(dataset):
|
||||
geojson_data = generate_geojson_structure(dataset)
|
||||
returned = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
import geopy.distance
|
||||
|
||||
|
||||
def link_to_point_in_osm(lat, lon):
|
||||
return 'https://www.openstreetmap.org/?mlat=' + str(lat) + "&mlon=" + str(lon) + "#map=19/" + str(lat) + '/' + str(lon)
|
||||
|
||||
|
||||
def calculate_distance(point_a, point_b):
|
||||
# https://github.com/geopy/geopy?tab=readme-ov-file#measuring-distance
|
||||
coords_1 = (point_a['lat'], point_a['lon'])
|
||||
|
|
|
@ -14,4 +14,3 @@ class SpatialIndex:
|
|||
|
||||
# sort by longitude
|
||||
# select quickly by longitude, leaving unlimited for latitude
|
||||
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
import distance_distribution
|
||||
import url_checker
|
||||
import leafleter
|
||||
import serializing
|
||||
import unittest
|
||||
show_data = __import__("4_show_data")
|
||||
import serializing
|
||||
import leafleter
|
||||
import url_checker
|
||||
import distance_distribution
|
||||
|
||||
|
||||
class IsCodeCompletelyCrashingSmoketests(unittest.TestCase):
|
||||
def test_rough_code_validity(self):
|
||||
|
@ -49,6 +50,7 @@ class IsCodeCompletelyCrashingSmoketests(unittest.TestCase):
|
|||
for file in show_data.iterate_over_output_files('dummy_atp_code'):
|
||||
pass
|
||||
|
||||
|
||||
class TagListFormattingTests(unittest.TestCase):
|
||||
def test_escaping_newlines(self):
|
||||
self.assertEqual(show_data.escape_html("ajaj\naaaa"), "ajaj<br>aaaa")
|
||||
|
@ -59,6 +61,7 @@ class TagListFormattingTests(unittest.TestCase):
|
|||
def test_tag_list_generation_newline_in_tags_escape(self):
|
||||
self.assertEqual("aaaa<br>bbb" in show_data.tag_list_to_html({"description": "aaaa\nbbb"}), True)
|
||||
|
||||
|
||||
class PhoneSuggestingTests(unittest.TestCase):
|
||||
def test_accept_normal_phone(self):
|
||||
add_tags_from_atp = show_data.ATPGivesTagsReportCreator(url_checker.URLChecker(), 'dummy identifier for tests', 'dummy area name')
|
||||
|
@ -88,6 +91,7 @@ class PhoneSuggestingTests(unittest.TestCase):
|
|||
creator = show_data.ATPGivesTagsReportCreator(url_checker.URLChecker(), 'dummy_atp_code', 'dummy area name')
|
||||
self.assertEqual(creator.is_phone_eligible(match), False)
|
||||
|
||||
|
||||
class WebsiteSuggestingTests(unittest.TestCase):
|
||||
def test_accept_normal_website(self):
|
||||
add_tags_from_atp = show_data.ATPGivesTagsReportCreator(url_checker.URLChecker(), 'dummy identifier for tests', 'dummy area name')
|
||||
|
|
|
@ -3,6 +3,7 @@ import link_scan_worker
|
|||
import run
|
||||
import unittest
|
||||
|
||||
|
||||
class SmokeTest(unittest.TestCase):
|
||||
def test_math(self):
|
||||
self.assertEqual(2+2, 4)
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import serializing
|
||||
import unittest
|
||||
matcher = __import__("3_matcher")
|
||||
config = __import__("0_config")
|
||||
import serializing
|
||||
|
||||
|
||||
class RealityTests(unittest.TestCase):
|
||||
def test_match_on_exact_match(self):
|
||||
|
@ -59,7 +60,6 @@ class RealityTests(unittest.TestCase):
|
|||
matches = matcher.get_matches(osm_data, atp_data)
|
||||
self.assertEqual(matches[0].match_distance, None)
|
||||
|
||||
|
||||
def test_accept_matches_for_ice_cream_synonyms(self):
|
||||
atp_data = [self.package_tags_into_mock({'brand': "Titan", 'amenity': 'ice_cream'})]
|
||||
osm_data = [self.package_tags_into_mock({'brand': "Titan", 'shop': 'ice_cream'})]
|
||||
|
@ -119,4 +119,4 @@ class RealityTests(unittest.TestCase):
|
|||
matches = matcher.get_matches(osm_data, atp_data)
|
||||
self.assertEqual(matches[0].match_distance, 0)
|
||||
|
||||
#TODO: how to handle shop=yes shop=vacant
|
||||
# TODO: how to handle shop=yes shop=vacant
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import unittest
|
||||
import qa
|
||||
|
||||
|
||||
class RealityTests(unittest.TestCase):
|
||||
def test_mathworks(self):
|
||||
self.assertEqual(2 + 1, 3)
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import unittest
|
||||
import spatial_index
|
||||
|
||||
|
||||
class Tests(unittest.TestCase):
|
||||
def test_basic_match_for_single_entry(self):
|
||||
data = [
|
||||
|
@ -96,7 +97,7 @@ class Tests(unittest.TestCase):
|
|||
if entry["tags"] not in matches:
|
||||
matches[entry["tags"]] = 0
|
||||
matches[entry["tags"]] += 1
|
||||
self.assertEqual(matches, {4: 1, 5:1, 6:1, 7:1})
|
||||
self.assertEqual(matches, {4: 1, 5: 1, 6: 1, 7: 1})
|
||||
|
||||
def test_basic_match_for_all_entries_except_first(self):
|
||||
data = [
|
||||
|
@ -120,8 +121,7 @@ class Tests(unittest.TestCase):
|
|||
if entry["tags"] not in matches:
|
||||
matches[entry["tags"]] = 0
|
||||
matches[entry["tags"]] += 1
|
||||
self.assertEqual(matches, {4: 1, 5:1, 6:1, 7:1})
|
||||
|
||||
self.assertEqual(matches, {4: 1, 5: 1, 6: 1, 7: 1})
|
||||
|
||||
def test_basic_match_for_all_entries_except_last(self):
|
||||
data = [
|
||||
|
@ -144,4 +144,4 @@ class Tests(unittest.TestCase):
|
|||
if entry["tags"] not in matches:
|
||||
matches[entry["tags"]] = 0
|
||||
matches[entry["tags"]] += 1
|
||||
self.assertEqual(matches, {4: 1, 5:1, 6:1})
|
||||
self.assertEqual(matches, {4: 1, 5: 1, 6: 1})
|
||||
|
|
|
@ -6,6 +6,7 @@ import shutil
|
|||
import time
|
||||
config = __import__("0_config")
|
||||
|
||||
|
||||
class URLChecker():
|
||||
def __init__(self):
|
||||
"""
|
||||
|
@ -15,7 +16,7 @@ class URLChecker():
|
|||
that later should have been disposed but were not
|
||||
"""
|
||||
self.url_check_cache = diskcache.Cache(self.cache_path())
|
||||
urllib3.disable_warnings() # silences complaints about unverified requests via HTTPS
|
||||
urllib3.disable_warnings() # silences complaints about unverified requests via HTTPS
|
||||
# this is done to ignore complaints about "verify=False" in requests.get
|
||||
# this is not so terrible as I only check is website up
|
||||
# see https://stackoverflow.com/questions/78855740/starfield-ca-not-recoggnised-by-requests-package
|
||||
|
@ -89,7 +90,7 @@ class URLChecker():
|
|||
# https://salony.orange.pl/pl/orange-jastrz%C4%99bie-zdr%C3%B3j-galeria-zdr%C3%B3j-26882
|
||||
pass
|
||||
elif self.is_difference_limited_to_slash_at_end(atp_value, atp_after_redirect):
|
||||
pass # just adding trailing / is not worth raising an alarm... I think?
|
||||
pass # just adding trailing / is not worth raising an alarm... I think?
|
||||
else:
|
||||
self.consider_logging_that_atp_link_redirects(tested_key, atp_value, atp)
|
||||
return False
|
||||
|
@ -110,13 +111,13 @@ class URLChecker():
|
|||
if link_a[-1] == "/":
|
||||
link_a = link_a[:-1]
|
||||
if link_b[-1] == "/":
|
||||
link_b =link_b[:-1]
|
||||
link_b = link_b[:-1]
|
||||
return link_a == link_b
|
||||
|
||||
def consider_logging_that_atp_link_was_rejected(self, tested_key, atp_value, atp):
|
||||
if atp.atp_tags['@spider'] not in [
|
||||
'aldi_sud_de', # https://github.com/alltheplaces/alltheplaces/issues/9415
|
||||
'true_value_us', # see above
|
||||
'aldi_sud_de', # https://github.com/alltheplaces/alltheplaces/issues/9415
|
||||
'true_value_us', # see above
|
||||
]:
|
||||
pass
|
||||
#do not log problems as long as above issues are not fixed
|
||||
|
@ -125,8 +126,8 @@ class URLChecker():
|
|||
|
||||
def consider_logging_that_atp_link_redirects(self, tested_key, atp_value, atp):
|
||||
if atp.atp_tags["@spider"] not in [
|
||||
'agata_meble_pl', # https://github.com/alltheplaces/alltheplaces/issues/9409
|
||||
'bevmo_us', # https://github.com/alltheplaces/alltheplaces/issues/9493
|
||||
'agata_meble_pl', # https://github.com/alltheplaces/alltheplaces/issues/9409
|
||||
'bevmo_us', # https://github.com/alltheplaces/alltheplaces/issues/9493
|
||||
]:
|
||||
pass
|
||||
#do not log problems as long as above issues are not fixed
|
||||
|
@ -290,7 +291,7 @@ class URLChecker():
|
|||
'sobeys.ca',
|
||||
'zambrero.com',
|
||||
'zambrero.com.au'
|
||||
]:
|
||||
]:
|
||||
# handles also broken such as
|
||||
# website = ps://www.biedronka.pl
|
||||
for protocol in ["", "http://", "https://", "ps://"]:
|
||||
|
@ -302,7 +303,7 @@ class URLChecker():
|
|||
'https://www.circlek.pl/wyszukaj-stacje',
|
||||
'http://www.statoil.pl',
|
||||
'Biedronka.PL',
|
||||
'https://www.aldi-sued.de/de/homepage.html', # seems to be added by some ATP?
|
||||
'https://www.aldi-sued.de/de/homepage.html', # seems to be added by some ATP?
|
||||
'https://allegro.pl/kampania/one/znajdz-nas',
|
||||
'https://allegro.pl/kampania/one',
|
||||
'https://www.castorama.pl',
|
||||
|
@ -386,11 +387,10 @@ class URLChecker():
|
|||
if self.get_free_space_in_mb(self.cache_path()) < 400:
|
||||
raise Exception("running out of free space on drive")
|
||||
|
||||
|
||||
print(link, reason)
|
||||
try:
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
|
||||
}
|
||||
# NOTE: SSL verification checks are disabled
|
||||
# to keep https://aviastacjapaliw.pl/stacje/avia-protasy/ working
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import unittest
|
||||
import url_checker
|
||||
|
||||
|
||||
class LinkCheckingTests(unittest.TestCase):
|
||||
def test_link_rejector_rejecting_known_bad(self):
|
||||
test = url_checker.URLChecker()
|
||||
|
|
Loading…
Add table
Reference in a new issue