
autopep8 --in-place --max-line-length=420 --recursive .

This commit is contained in:
Mateusz Konieczny 2024-08-14 10:17:26 +02:00
parent 9e9f5af2b0
commit 6e5070861f
24 changed files with 284 additions and 163 deletions
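The changes below are mechanical PEP 8 cleanups produced by the autopep8 invocation above; the unusually high --max-line-length=420 leaves long lines unwrapped, so the edits are almost entirely whitespace fixes. A rough before/after sketch of the kinds of corrections visible in this diff (the pycodestyle codes are given for orientation only and are not part of the commit):

# before: block comment without a space after '#', one space before an inline comment,
# spaces around '=' in keyword arguments
#missing shoplike
notified_about_tag['amenity'].append('post_depot') # internal facility, not post_office
outfile.write(leafleter.generator.get_polygon(shape, color = "green", fill_color = "green"))

# after: E265/E261 comment spacing, E251 keyword '=' spacing; E302 additionally inserts
# the second blank line before top-level def statements throughout the diff
# missing shoplike
notified_about_tag['amenity'].append('post_depot')  # internal facility, not post_office
outfile.write(leafleter.generator.get_polygon(shape, color="green", fill_color="green"))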

File diff suppressed because one or more lines are too long


@ -5,6 +5,7 @@ import shops
import time
config = __import__("0_config")
def main():
directory_path = config.cache_folder()
@ -22,10 +23,11 @@ def main():
print(region, "- list_shops - started")
start = time.time()
for entry in shops.osm.list_shops(region, directory_path):
pass # needed to trigger processing code
pass # needed to trigger processing code
print((time.time() - start) / 60, "minutes")
print(region, "- list_shops - completed")
processed.append(region)
if __name__ == "__main__":
main()


@ -1,18 +1,20 @@
import rich
import osm_bot_abstraction_layer.util_download_file
import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
import json
import os
import requests
config = __import__("0_config")
import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
import osm_bot_abstraction_layer.util_download_file
import rich
config = __import__("0_config")
def processed_atp_codes():
for area_name, area_data in config.processing_plan().items():
if 'accepted' in area_data:
for atp_code in area_data['accepted']:
yield atp_code
def main():
response = requests.get("https://data.alltheplaces.xyz/runs/latest.json")
todos = json.loads(response.text)
@ -21,14 +23,15 @@ def main():
download_entire_atp_dataset(run_id)
look_through_entire_atp_dataset()
def do_not_remind_that_this_tagging_may_be_worth_supporting():
notified_about_tag = {
# should be fixed in ATP, if possible
# TODO: raise issues at https://github.com/alltheplaces/alltheplaces/issues
'tourism': ['yes', 'attraction'],
'healthcare': [
'laboratory', # https://github.com/alltheplaces/alltheplaces/issues/8637
'centre', # not reported yet TODO
'laboratory', # https://github.com/alltheplaces/alltheplaces/issues/8637
'centre', # not reported yet TODO
],
# TODO maybe start including them?
@ -48,11 +51,11 @@ def do_not_remind_that_this_tagging_may_be_worth_supporting():
if key not in notified_about_tag:
notified_about_tag[key] = []
#missing shoplike
# missing shoplike
notified_about_tag['office'].append('yes')
notified_about_tag['amenity'].append('canteen')
#kind also shoplike? I want to support them
# kind also shoplike? I want to support them
notified_about_tag['man_made'].append('charge_point')
notified_about_tag['amenity'].append('music_venue')
notified_about_tag['amenity'].append('prep_school')
@ -117,9 +120,10 @@ def do_not_remind_that_this_tagging_may_be_worth_supporting():
# seem hard to confirm by survey
notified_about_tag['craft'].append('brewery')
notified_about_tag['amenity'].append('post_depot') # internal facility, not post_office
notified_about_tag['amenity'].append('post_depot') # internal facility, not post_office
return notified_about_tag
def warn_about_broken_spider(atp_code, message):
print()
print()
@ -130,24 +134,27 @@ def warn_about_broken_spider(atp_code, message):
print(url_log)
print()
def maybe_warn_about_spider_with_empty_file(atp_code):
if config.is_empty_file_for_spider_worth_mentioning(atp_code):
warn_about_broken_spider(atp_code, "empty output")
def maybe_warn_about_spider_with_broken_file(atp_code):
if config.is_broken_file_for_spider_worth_mentioning(atp_code):
warn_about_broken_spider(atp_code, "broken output file")
def warn_about_new_tags_that_are_neither_shoplike_nor_ignored(entry, item_path):
notified_about_tag = do_not_remind_that_this_tagging_may_be_worth_supporting()
for key in tag_knowledge.typical_main_keys():
if key in entry['properties']:
if ";" in entry['properties'][key]:
continue # TODO - is it safe to consider it as being unfinished?
continue # TODO - is it safe to consider it as being unfinished?
if key == 'healthcare' and entry['properties'].get("amenity") == entry['properties'].get("healthcare"):
continue
if tag_knowledge.is_shoplike({key: entry['properties'][key]}) == True:
break # handles cases where healthcare is extra tag in addition to proper amenity
break # handles cases where healthcare is extra tag in addition to proper amenity
if tag_knowledge.is_shoplike({key: entry['properties'][key]}) == False:
if key not in notified_about_tag:
notified_about_tag[key] = []
@ -159,6 +166,7 @@ def warn_about_new_tags_that_are_neither_shoplike_nor_ignored(entry, item_path):
print()
notified_about_tag[key].append(entry['properties'][key])
def all_spider_codes_iterator():
directory_path_with_unpacked_spider_data = config.atp_unpacked_folder()
# TODO: Is there a match between spider codes and their filenames?
@ -180,6 +188,7 @@ def download_entire_atp_dataset(run_id):
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename)
os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"')
def look_through_entire_atp_dataset():
candidates = {}
for _area_name, area_data in config.processing_plan().items():
@ -200,10 +209,10 @@ def look_through_entire_atp_dataset():
continue
warn_about_new_tags_that_are_neither_shoplike_nor_ignored(entry, item_path)
if atp_code in [
'hyatt', # https://github.com/alltheplaces/alltheplaces/issues/9399
'maserati', # has many actually empty entries
'skoda', # limitations of source data, unfixable by ATP
'general_logistics_systems_de', # missing data to provide POI data, see https://github.com/alltheplaces/alltheplaces/commit/89f5511bacf24f2d6d0a1c2a183130c9148f772a
'hyatt', # https://github.com/alltheplaces/alltheplaces/issues/9399
'maserati', # has many actually empty entries
'skoda', # limitations of source data, unfixable by ATP
'general_logistics_systems_de', # missing data to provide POI data, see https://github.com/alltheplaces/alltheplaces/commit/89f5511bacf24f2d6d0a1c2a183130c9148f772a
]:
break
if config.canonical_feature(entry['properties']) == "?":
@ -226,11 +235,12 @@ def look_through_entire_atp_dataset():
print()
print("candidate")
print(item_path)
if atp_code not in candidates[area_data['country_code']]: # applies when one spider has many main keys
if atp_code not in candidates[area_data['country_code']]: # applies when one spider has many main keys
candidates[area_data['country_code']].append(atp_code)
print(key, "=", entry['properties'][key])
print(candidates)
def download(code, run_id):
script_location = os.path.abspath(__file__)
directory_path = config.cache_folder()
@ -238,5 +248,6 @@ def download(code, run_id):
filename = code + ".atp.geojson"
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, directory_path, filename)
if __name__ == "__main__":
main()


@ -52,6 +52,7 @@ def is_matching_any_name_part_to_osm_tags(name_part_list, osm_tags):
return True
return False
def matching_name_part(part, namelike_value):
part = part.lower()
namelike_value = namelike_value.lower()
@ -63,6 +64,7 @@ def matching_name_part(part, namelike_value):
else:
return True
def filter_with_fuzzy_name_match(osm_data, name_part_list):
returned = []
for osm in osm_data:
@ -70,6 +72,7 @@ def filter_with_fuzzy_name_match(osm_data, name_part_list):
returned.append(osm)
return returned
def get_filter_names_from_atp_dataset(current_atp):
filter_names = []
for atp in current_atp:
@ -79,12 +82,13 @@ def get_filter_names_from_atp_dataset(current_atp):
name_sources.append(short_name)
for name in name_sources:
for part in name.split():
if part.lower() in ["kebab", "kebap", "apteka", "cukiernia", "pizzeria", "na"]: # common and shared - automate detection?
if part.lower() in ["kebab", "kebap", "apteka", "cukiernia", "pizzeria", "na"]: # common and shared - automate detection?
continue
if part not in filter_names:
filter_names.append(part)
return filter_names
def run_match(osm_data, atp_code):
output_file = "build_temporary_files/" + atp_code + '.csv'
atp_data = load_atp(atp_code)
@ -93,11 +97,12 @@ def run_match(osm_data, atp_code):
matches = get_matches(osm_data, atp_data)
serializing.save_list_of_matches_to_csv(output_file, matches)
def get_matches(osm_data, atp_data):
match_list = []
filter_names = get_filter_names_from_atp_dataset(atp_data)
#TODO: get also misspellings
#TODO: handle nearby objects with matching feature type or vacant ones
# TODO: get also misspellings
# TODO: handle nearby objects with matching feature type or vacant ones
filtered_osm = filter_with_fuzzy_name_match(osm_data, filter_names)
osm_index = spatial_index.SpatialIndex(filtered_osm)
print("filtering", len(osm_data), "to", len(filtered_osm), "candidates based on names is done, now checking", len(atp_data), "ATP candidates by distance")
@ -145,6 +150,7 @@ def get_matches(osm_data, atp_data):
raise
return match_list
def passed_filter(osm_data_tag_filter, tags):
for key in osm_data_tag_filter.keys():
if osm_data_tag_filter[key] == None:
@ -173,6 +179,7 @@ def load_atp(atp_code):
# no need to also report here, so let's fail silently
return []
def load_atp_from_json(data, atp_code):
returned = []
for entry in data['features']:


@ -18,6 +18,7 @@ import distance_distribution
config = __import__("0_config")
def get_free_space_in_mb(path):
total, used, free = shutil.disk_usage(path)
return free / 1024 / 1024
@ -54,6 +55,7 @@ def generate_report(cache_only):
copy_data_for_publication(all_atp_codes)
publish_data_on_internet()
def generate_bot_edit_list_page():
with open("output/bot_edit_plan_add_tags.html", 'w') as outfile:
outfile.write(html_bot_edit_prefix())
@ -75,6 +77,7 @@ def generate_bot_edit_list_page():
outfile.write('<li><a target="_blank" href="' + osm_link + '">' + osm_link + '</a>' + " <code>" + escape_html(key) + "=" + value + "</code></li>\n")
outfile.write(html_bot_edit_suffix())
def contact_method():
return """Please <a target="_blank" href="https://codeberg.org/matkoniecz/improving_openstreetmap_using_alltheplaces_dataset/issues">create an issue</a> or <a target="_blank" href="https://www.openstreetmap.org/message/new/Mateusz%20Konieczny">send me an OSM private message</a> if you see a potential for improvements. If potential improvements are in All The Places - better to create PR or issue there. If unsure, please write to me. If you see this data being misued and causing harm (for example, imported without consulting community or ignoring their feedback) - please write to me and I will help with cleanups, including reverts and reconsider how this data is publish."""
@ -109,6 +112,7 @@ def html_bot_edit_prefix():
<h2>Edit ideas listing</h2>
"""
def html_bot_edit_suffix():
return """</section>
</body>
@ -121,12 +125,13 @@ def clear_output_files(folder):
if os.path.isfile(file_path):
os.remove(file_path)
def produce_map_analysis_for_atp_code(atp_code, cache_only, url_checker_instance, report_generators):
csv_filepath = "build_temporary_files/" + atp_code + '.csv'
if os.path.isfile(csv_filepath) == False:
report_generators[atp_code] = {
'atp_file_is_broken': True
}
}
return report_generators
match_list = serializing.load_list_of_matches_from_csv(csv_filepath)
@ -142,12 +147,13 @@ def produce_map_analysis_for_atp_code(atp_code, cache_only, url_checker_instance
processed = qa.remove_bad_data(entry.atp_tags, atp_code)
if processed == None:
continue
entry.atp_tags = processed # TODO is it happening as data was passed to qa function
entry.atp_tags = processed # TODO is it happening as data was passed to qa function
rebuild_match_list.append(entry)
report_generators[atp_code] = produce_map_analysis_for_atp_data(atp_code, area_name="", match_list=rebuild_match_list, cache_only=cache_only, url_checker_instance=url_checker_instance)
return report_generators
class MismatchingNameReportCreator:
def __init__(self, atp_code, area_name):
self.atp_code = atp_code
@ -227,9 +233,9 @@ class MismatchingNameReportCreator:
def table_of_contents(self):
return [
{
'header': "Name mismatch",
'section_link': section_link('name mismatch between OSM and ATP', len(self.completely_mismatching_names), self.report_filename()),
'output_files': [self.report_filename()]
'header': "Name mismatch",
'section_link': section_link('name mismatch between OSM and ATP', len(self.completely_mismatching_names), self.report_filename()),
'output_files': [self.report_filename()]
},
]
@ -247,6 +253,7 @@ class MismatchingNameReportCreator:
#outfile.write(leafleter.generator.get_line(missing.atp_center['lat'], missing.atp_center['lon'], missing.osm_match_center['lat'], missing.osm_match_center['lon'], color = 'blue'))
outfile.write(leafleter.generator.get_html_page_suffix())
class ATPGivesTagsReportCreator:
def __init__(self, url_checker_instance, atp_code, area_name):
self.atp_code = atp_code
@ -281,7 +288,7 @@ class ATPGivesTagsReportCreator:
writer = csv.writer(outfile)
for key in self.importable_keys:
if self.count_of_total_tag_mismatches[key] == 0:
for entry in self.shops_with_tags_to_be_added: # already Nominatim-filtered
for entry in self.shops_with_tags_to_be_added: # already Nominatim-filtered
if key in entry['tags_to_be_added']:
value = entry['entry'].atp_tags[key]
atp_code = entry['entry'].atp_tags['@spider']
@ -344,7 +351,7 @@ class ATPGivesTagsReportCreator:
return False
returned = self.url_checker.is_website_eligible(atp, cache_only)
if returned == None:
return False # not cached, instructed to use only cache
return False # not cached, instructed to use only cache
if returned:
if atp.atp_tags.get(tested_key) != atp.osm_match_tags.get(tested_key):
return True
@ -356,7 +363,7 @@ class ATPGivesTagsReportCreator:
# pointing to the main brand page
pass
elif self.url_checker.is_difference_limited_to_slash_at_end(atp_value, osm_value):
pass # effectively the same anyway, no real mismatch
pass # effectively the same anyway, no real mismatch
else:
self.mismatching_website_tags.append(atp)
self.report_mismatch(atp, tested_key)
@ -402,7 +409,7 @@ class ATPGivesTagsReportCreator:
new_tags[key] = entry['entry'].atp_tags[key]
message += tag_list_to_html(new_tags)
outfile.write(leafleter.generator.get_marker(message, entry['entry'].atp_center['lat'], entry['entry'].atp_center['lon'], color='green'))
outfile.write(leafleter.generator.get_line(entry['entry'].atp_center['lat'], entry['entry'].atp_center['lon'], entry['entry'].osm_match_center['lat'], entry['entry'].osm_match_center['lon'], color = 'green'))
outfile.write(leafleter.generator.get_line(entry['entry'].atp_center['lat'], entry['entry'].atp_center['lon'], entry['entry'].osm_match_center['lat'], entry['entry'].osm_match_center['lon'], color='green'))
outfile.write(leafleter.generator.get_html_page_suffix())
def generate_mismatching_website_listing(self):
@ -422,9 +429,10 @@ class ATPGivesTagsReportCreator:
summary += 'tag list as suggested by ATP (should not be assumed to be directly usable in OSM):<br><br>'
summary += tag_list_to_html(bad.atp_tags)
outfile.write(leafleter.generator.get_marker(summary, bad.atp_center['lat'], bad.atp_center['lon'], color='red'))
outfile.write(leafleter.generator.get_line(bad.atp_center['lat'], bad.atp_center['lon'], bad.osm_match_center['lat'], bad.osm_match_center['lon'], color = 'red'))
outfile.write(leafleter.generator.get_line(bad.atp_center['lat'], bad.atp_center['lon'], bad.osm_match_center['lat'], bad.osm_match_center['lon'], color='red'))
outfile.write(leafleter.generator.get_html_page_suffix())
class MissingObjectsReportCreator:
def __init__(self, atp_code, area_name):
self.area_name = area_name
@ -488,6 +496,7 @@ class MissingObjectsReportCreator:
bad_tags_skipped.append(atp)
json.dump(serializing.generate_geojson_structure(bad_tags_skipped), f)
class NominatimMismatchReportCreator:
def __init__(self, atp_code, area_name):
self.area_name = area_name
@ -546,9 +555,10 @@ class NominatimMismatchReportCreator:
summary += tag_list_to_html(missing.atp_tags)
outfile.write(leafleter.generator.get_marker(summary, missing.atp_center['lat'], missing.atp_center['lon'], color='red'))
location_from_nominatim = nominatim.location_given_tags(missing.atp_tags, debug_identifier=self.atp_code)[0]
outfile.write(leafleter.generator.get_line(missing.atp_center['lat'], missing.atp_center['lon'], location_from_nominatim['lat'], location_from_nominatim['lon'], color = 'red'))
outfile.write(leafleter.generator.get_line(missing.atp_center['lat'], missing.atp_center['lon'], location_from_nominatim['lat'], location_from_nominatim['lon'], color='red'))
outfile.write(leafleter.generator.get_html_page_suffix())
"""
class MismatchingNameReportCreator:
def __init__(self, atp_code):
@ -577,6 +587,8 @@ add to test_display_website
"""
# TODO: passing atp_code should not be needed
# TODO: save files one level higher, here just produce analysis
def produce_map_analysis_for_atp_data(atp_code, area_name, match_list, cache_only, url_checker_instance):
missing_objects_report = MissingObjectsReportCreator(atp_code, area_name)
mismatching_name_report = MismatchingNameReportCreator(atp_code, area_name)
@ -613,7 +625,7 @@ def produce_map_analysis_for_atp_data(atp_code, area_name, match_list, cache_onl
conflict_between_atp_and_nominatim_report.register_case_using_known_nominatim_status(atp, status)
elif atp.match_distance == None or atp.match_distance > config.missing_shop_distance_in_kilometers_for_specific_case(atp.atp_tags):
nominatim_match = nominatim.is_location_matching_tags(atp.atp_tags, atp.atp_center, cache_only=cache_only, spider=atp_code)
if nominatim_match != False: # both matches, failed geolocation and geolocation not done at all go here
if nominatim_match != False: # both matches, failed geolocation and geolocation not done at all go here
missing_objects_report.check_case(atp)
conflict_between_atp_and_nominatim_report.register_case_using_known_nominatim_status(atp, nominatim_match)
else:
@ -635,7 +647,8 @@ def produce_map_analysis_for_atp_data(atp_code, area_name, match_list, cache_onl
'missing_objects_report': missing_objects_report,
'conflict_between_atp_and_nominatim_report': conflict_between_atp_and_nominatim_report,
'total_atp_entries': len(match_list),
}
}
def format_for_geojson_export(dataset):
for entry in dataset:
@ -644,6 +657,7 @@ def format_for_geojson_export(dataset):
del atp.atp_tags[key]
return dataset
def get_center(dataset):
max_lat = -90
max_lon = -180
@ -671,6 +685,7 @@ def sidebar_content(page_specific_info, atp_code):
sidebar += '<br><br>\n<a href="https://github.com/alltheplaces/alltheplaces/blob/master/locations/spiders/' + atp_code + '.py" target="_blank">atp source code</a>'
return sidebar
def tag_list_to_html(tags):
returned = ""
normal_tags = ""
@ -691,21 +706,26 @@ def tag_list_to_html(tags):
returned += "<br><br>tags present in ATP, very likely not usable directly in OSM<br>" + dropped_tags
return returned
def htmlify_key_value_pair(key, value):
return key + " = " + htmlify_value(key, value) + "<br>"
def htmlify_value(key, value):
value = escape_html(value)
if key == "website" or (key == "image" and value.find("http") == 0):
value = '<a href="' + value + '">' + value + "</a>"
return value
def escape_url(value):
return str(value).replace('"', '%22').replace("'", "%27")
def escape_html(value):
return html.escape(value).replace("\r\n", "<br>").replace("\n", "<br>")
def headers():
# TODO: pass it smarter in config (list of main report creators?)
# or at least make it static method
@ -718,6 +738,7 @@ def headers():
NominatimMismatchReportCreator('dummy', 'dummy area name').table_of_contents()[0]['header'],
]
def generate_website_index_listing_by_country(report_generators, released_codes_by_region, partial=False):
with open("output/index.html", 'w') as outfile:
outfile.write(html_prefix())
@ -732,6 +753,7 @@ def generate_website_index_listing_by_country(report_generators, released_codes_
outfile.write(table_with_spider_overview(atp_codes, report_generators, partial))
outfile.write(html_suffix())
def generate_website_index_for_named_area(report_generators, area_name, partial=False):
with open("output/" + area_name + "_index.html", 'w') as outfile:
outfile.write(html_prefix())
@ -739,6 +761,7 @@ def generate_website_index_for_named_area(report_generators, area_name, partial=
outfile.write(table_with_spider_overview(report_generators.keys(), report_generators, partial))
outfile.write(html_suffix())
def table_with_spider_overview(atp_codes, report_generators, partial):
returned = ""
returned += '<table class="statistics-summary"><thead><tr><th>' + '</th><th>'.join(headers()) + '</th></tr></thead>\n'
@ -756,8 +779,9 @@ def table_with_spider_overview(atp_codes, report_generators, partial):
returned += "no entries shown in this area\n"
return returned
def table_row(atp_code, statistics):
if statistics['missing_objects_report'] == None: #TODO test is it working
if statistics['missing_objects_report'] == None: # TODO test is it working
return '<tr><th></th><td colspan="5">Data missing</td></tr>'
missing_section = statistics['missing_objects_report'].table_of_contents()[0]['section_link']
@ -772,11 +796,13 @@ def table_row(atp_code, statistics):
return '<tr><th>' + atp_code + '</th><td>' + missing_section + '</td><td>' + mismatching_names_section + '</td><td>' + tags_section + '</td><td>' + website_mismatch_section + '</td><td>' + mismatch_section + not_attempted + '</td></tr>'
def section_link(description, count, page):
if count == 0:
return '<span class=less-visible title="' + description + '">' + str(count) + '</span>'
return '<a href="' + page + '" title="' + description + '">' + str(count) + '</a>'
def contact_method():
return """Please <a href="https://codeberg.org/matkoniecz/list_how_openstreetmap_can_be_improved_with_alltheplaces_data/issues">create an issue</a> or <a href="https://www.openstreetmap.org/message/new/Mateusz%20Konieczny">send me an OSM private message</a> if you see a potential for improvements. If potential improvements are in All The Places - better to create PR or issue <a href="https://github.com/alltheplaces/alltheplaces">there</a>. If unsure, please write to me."""
@ -819,11 +845,13 @@ def html_prefix():
</p>
"""
def html_suffix():
return """<hr><br>Published on <a href="https://matkoniecz.codeberg.page/improving_openstreetmap_using_alltheplaces_dataset/">https://matkoniecz.codeberg.page/improving_openstreetmap_using_alltheplaces_dataset/</a> - generated on """ + f'{datetime.datetime.now():%Y-%m-%d %H:%M:%S%z}' + """ (note that ATP and OSM data used here may be older) </section>
</body>
</html>"""
def iterate_over_output_files(atp_code):
reports = [
MissingObjectsReportCreator(atp_code, 'dummy area name'),
@ -836,6 +864,7 @@ def iterate_over_output_files(atp_code):
for file in entry['output_files']:
yield file
def copy_data_for_publication(all_atp_codes):
for atp_code in all_atp_codes:
if get_free_space_in_mb('../public_website_with_output') < 400:
@ -850,8 +879,10 @@ def copy_data_for_publication(all_atp_codes):
os.system("cp output/index.html ../public_website_with_output/index.html")
# published on https://matkoniecz.codeberg.page/improving_openstreetmap_using_alltheplaces_dataset/
def publish_data_on_internet():
os.system('cd ../public_website_with_output && git add . && git commit -m "automatic update" && git push')
if __name__ == "__main__":
main()


@ -10,20 +10,25 @@ import shared
obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
def is_in_this_area(area, atp):
if atp.atp_center['lat'] > area['min_lat'] and atp.atp_center['lat'] < area['max_lat']:
if atp.atp_center['lon'] > area['min_lon'] and atp.atp_center['lon'] < area['max_lon']:
return True
return False
def areas():
return {
'kraków': {'min_lat': 50, 'min_lon': 19.5, 'max_lat': 50.3, 'max_lon': 20.5},
# http://bboxfinder.com/#52.383301,16.885986,52.436182,17.044859
'poznań': {'min_lat': 52.383301, 'min_lon': 16.885986, 'max_lat': 52.436182, 'max_lon': 17.044859},
}
}
general_statistics = {}
def main():
for atp_code, _item_path in obtain_atp_data.all_spider_codes_iterator():
print(atp_code)
@ -55,6 +60,7 @@ def save_files(data, name):
with open(name + '_missing.kml', 'w') as f:
f.write(serializing.generate_kml_text(data))
def clear_output_files(folder):
for filename in os.listdir(folder):
file_path = os.path.join(folder, filename)
@ -71,4 +77,5 @@ def generate_missing_shop_listing(atp_code, apparently_missing_shops):
osm_location_link = shared.link_to_point_in_osm(missing.atp_center['lat'], missing.atp_center['lon'])
summary = 'here ATP shows object being present, which seems not mapped in OpenStreetMap (<a href="' + osm_location_link + '">location</a>):<br><br>'
main()


@ -23,11 +23,13 @@ def graticule_id(lat, lon, lat_span, lon_span, margin_in_kilometers):
# filter data for each
# filter data for each in constant time (just check is given location within graticule range)
def main():
check_is_any_graticule_having_margin_greater_than_entire_graticule()
generate_test_graticule_coverage_map()
test_area_run()
def generate_test_graticule_coverage_map():
graticule_anchor_coverage = {'min_lat': 49, 'min_lon': 14, 'max_lat': 54, 'max_lon': 24}
with open("test_coverage_graticule_display.html", 'w') as outfile:
@ -37,9 +39,10 @@ def generate_test_graticule_coverage_map():
for lat_anchor in range(graticule_anchor_coverage['min_lat'], graticule_anchor_coverage['max_lat'] + 1):
for lon_anchor in range(graticule_anchor_coverage['min_lon'], graticule_anchor_coverage['max_lon'] + 1):
shape = [[lat_anchor + 1, lon_anchor + 1], [lat_anchor + 1, lon_anchor], [lat_anchor, lon_anchor], [lat_anchor, lon_anchor + 1], [lat_anchor + 1, lon_anchor + 1]]
outfile.write(leafleter.generator.get_polygon(shape, color = "green", fill_color = "green", link = "https://pl.wikipedia.org/wiki/Pozna%C5%84"))
outfile.write(leafleter.generator.get_polygon(shape, color="green", fill_color="green", link="https://pl.wikipedia.org/wiki/Pozna%C5%84"))
outfile.write(leafleter.generator.get_html_page_suffix())
def test_area_run():
# http://bboxfinder.com/#52.383301,16.885986,52.436182,17.044859
poznań = {'min_lat': 52.383301, 'min_lon': 16.885986, 'max_lat': 52.436182, 'max_lon': 17.044859, 'name': 'Poznań'}
@ -57,7 +60,7 @@ def test_area_run():
outfile.write(leafleter.generator.get_html_page_prefix("website title", (area['max_lat'] + area['min_lat'])/2, (area['max_lon'] + area['min_lon'])/2))
#outfile.write(leafleter.generator.get_marker("text", 50.06, 19.93))
shape = [[area['max_lat'], area['max_lon']], [area['max_lat'], area['min_lon']], [area['min_lat'], area['min_lon']], [area['min_lat'], area['max_lon']], [area['max_lat'], area['max_lon']]]
outfile.write(leafleter.generator.get_polygon(shape, color = "green", fill_color = "green", link = "https://pl.wikipedia.org/wiki/Pozna%C5%84"))
outfile.write(leafleter.generator.get_polygon(shape, color="green", fill_color="green", link="https://pl.wikipedia.org/wiki/Pozna%C5%84"))
outfile.write(leafleter.generator.get_html_page_suffix())
atp_data_by_spider = {}
@ -72,13 +75,12 @@ def test_area_run():
if len(gathered) > 0:
atp_data_by_spider[atp_code] = gathered
general_area = "europe/poland"
osm_data = []
for entry in matcher.load_geofabrik(general_area, config.cache_folder()):
if entry['center']['lat'] > area['min_lat'] and entry['center']['lat'] < area['max_lat']:
if entry['center']['lon'] > area['min_lon'] and entry['center']['lon'] < area['max_lon']:
osm_data.append(entry)
if entry['center']['lat'] > area['min_lat'] and entry['center']['lat'] < area['max_lat']:
if entry['center']['lon'] > area['min_lon'] and entry['center']['lon'] < area['max_lon']:
osm_data.append(entry)
print(len(atp_data_by_spider))
print(len(osm_data))
@ -100,6 +102,7 @@ def test_area_run():
print(output_file)
def check_is_any_graticule_having_margin_greater_than_entire_graticule():
for lat in range(-89, 89):
for lon in range(-180, 180):
@ -110,17 +113,17 @@ def check_is_any_graticule_having_margin_greater_than_entire_graticule():
distance_for_lat_degree_alt = shared.calculate_distance(
{'lat': tested_location['lat'] + 1, 'lon': tested_location['lon'] + 1},
{'lat': tested_location['lat'] + 2, 'lon': tested_location['lon'] + 1}
)
)
distance_for_lat_degree_alt_alt = shared.calculate_distance(
{'lat': tested_location['lat'] + 1, 'lon': tested_location['lon']},
{'lat': tested_location['lat'] + 2, 'lon': tested_location['lon']}
)
)
print("expected zero, maybe espilon changes", distance_for_lat_degree_alt - distance_for_lat_degree)
print("expected zero, maybe espilon changes", distance_for_lat_degree_alt_alt - distance_for_lat_degree)
distance_for_lon_degree_alt = shared.calculate_distance(
{'lat': tested_location['lat'] + 1, 'lon': tested_location['lon'] + 1},
{'lat': tested_location['lat'] + 1, 'lon': tested_location['lon'] + 1}
)
)
print("expected meaningful changes", distance_for_lon_degree_alt - distance_for_lon_degree)
margin_in_kilometers = config.maximum_missing_shop_distance_in_kilometers()
@ -133,4 +136,5 @@ def check_is_any_graticule_having_margin_greater_than_entire_graticule():
raise
break
main()


@ -33,7 +33,7 @@ def main():
except FileNotFoundError as e:
print(e)
pass
#TODO skip freeform/valid ones
# TODO skip freeform/valid ones
for key, values in used_tags.items():
if tag_knowledge.is_freeform_key(key):
print(key, "=", "*")


@ -1,6 +1,7 @@
import matplotlib.pyplot as plt
import os
class MatchDistanceDestributionReportCreator:
def __init__(self, identifier, area_name):
self.identifier = identifier
@ -29,7 +30,7 @@ class MatchDistanceDestributionReportCreator:
plt.rcParams["figure.figsize"] = [10, 10]
# https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
# see 02 file for more investigation
plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice
plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice
plt.grid(True)
plt.clf()
plt.xlim(0, 1200)
@ -39,7 +40,7 @@ class MatchDistanceDestributionReportCreator:
plt.rcParams["figure.figsize"] = [10, 10]
# https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html
# see 02 file for more investigation
plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice
plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice
plt.grid(True)
plt.clf()
plt.xlim(0, 300)


@ -4,6 +4,7 @@ import datetime
import time
config = __import__("0_config")
def scan_eligible(grab_bag, scanner):
while True:
any_scanned = False
@ -18,6 +19,7 @@ def scan_eligible(grab_bag, scanner):
if any_scanned == False:
return
def main():
wait_between_the_same_domain_minutes = 5
grab_bag = {}
@ -38,5 +40,6 @@ def main():
scan_eligible(grab_bag, scanner)
time.sleep(10)
if __name__ == "__main__":
main()


@ -6,24 +6,27 @@ import re
import shutil
config = __import__("0_config")
def cache_path():
return 'nominatim_cache'
# Initialize disk cache
nominatim_cache = diskcache.Cache(cache_path())
def drop_extra_detail_blocking_nominatim(value):
# patch nominatim bug where inclusion of apartment code breaks search
# https://github.com/osm-search/Nominatim/issues/145#issuecomment-2143549199
# see https://pythex.org/ for testing
value = re.sub(r'/\d+([a-zA-Z])?', '', value) # turns 178/12 into 178
value = re.sub(r'(,|, |)lok\..*', '', value, flags=re.IGNORECASE) # "lokal" is Polish for "unit"
value = re.sub(r'(,|, |)LOK .*', '', value) # "lokal" is Polish for "unit"
value = re.sub(r'(,|, |)lokal .*', '', value) # "lokal" is Polish for "unit"
value = re.sub(r'(,|, |)lok .*', '', value) # "lokal" is Polish for "unit"
value = re.sub(r'(,|, |)lok.*', '', value) # "lokal" is Polish for "unit"
value = re.sub(r'(,|, |)LU.*', '', value) # "lokal użytkowy" is Polish legalese for "unit"
value = re.sub(r'(,|, |)Lu.*', '', value) # "lokal użytkowy" is Polish legalese for "unit"
value = re.sub(r'/\d+([a-zA-Z])?', '', value) # turns 178/12 into 178
value = re.sub(r'(,|, |)lok\..*', '', value, flags=re.IGNORECASE) # "lokal" is Polish for "unit"
value = re.sub(r'(,|, |)LOK .*', '', value) # "lokal" is Polish for "unit"
value = re.sub(r'(,|, |)lokal .*', '', value) # "lokal" is Polish for "unit"
value = re.sub(r'(,|, |)lok .*', '', value) # "lokal" is Polish for "unit"
value = re.sub(r'(,|, |)lok.*', '', value) # "lokal" is Polish for "unit"
value = re.sub(r'(,|, |)LU.*', '', value) # "lokal użytkowy" is Polish legalese for "unit"
value = re.sub(r'(,|, |)Lu.*', '', value) # "lokal użytkowy" is Polish legalese for "unit"
value = re.sub(r'(,|, |)suite .*', '', value, flags=re.IGNORECASE)
@ -34,6 +37,7 @@ def drop_extra_detail_blocking_nominatim(value):
value = re.sub(r'(,|, |)unit .*', '', value, flags=re.IGNORECASE)
return value
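# Illustration of the trimming above (hypothetical input, not part of the committed code):
# drop_extra_detail_blocking_nominatim("Krakowska 178/12, lok. 5") -> "Krakowska 178"
# the "/12" flat number and the ", lok. 5" unit suffix are stripped, leaving an
# address form that Nominatim has a chance to geocode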
def nominatim_queries(tags, debug=False):
address_tag_groups = [
['addr:country', 'addr:city', 'addr:street', 'addr:housenumber'],
@ -67,7 +71,7 @@ def nominatim_queries(tags, debug=False):
if key in ["addr:street_address", 'addr:street', 'addr:full']:
# see https://github.com/osm-search/Nominatim/issues/87
value = re.sub(r'ul\. ?', '', value, flags=re.IGNORECASE)
value = re.sub(r'( |$)ul ', ' ', value, flags=re.IGNORECASE) # "ul Żabia"
value = re.sub(r'( |$)ul ', ' ', value, flags=re.IGNORECASE) # "ul Żabia"
if key in ["addr:street_address", 'addr:full']:
value = drop_extra_detail_blocking_nominatim(value)
query += value
@ -76,6 +80,7 @@ def nominatim_queries(tags, debug=False):
print(group)
yield query
def location_given_tags_cache_only(tags):
"""
True: matches
@ -87,21 +92,22 @@ def location_given_tags_cache_only(tags):
if query not in nominatim_cache:
with open(config.nominatim_requests_missing_from_cache(), 'a') as outfile:
outfile.write(query+"\n")
return -1 # maybe transformed query would give better result?
# should not check further ones
return -1 # maybe transformed query would give better result?
# should not check further ones
else:
response = nominatim_cache[query]
if len(response) >= 1:
return response
return None
def location_given_tags(tags, debug_identifier):
for query in nominatim_queries(tags):
response = query_nominatim(query)
if len(response) >= 1:
return response
atp_code = debug_identifier # TODO handle this
atp_code = debug_identifier # TODO handle this
if config.is_failed_geocoding_worth_mentioning(atp_code):
print()
print()
@ -119,6 +125,7 @@ def location_given_tags(tags, debug_identifier):
print()
return None
def is_location_matching_tags(tags, center, spider, cache_only=False):
"""
True: matches
@ -135,6 +142,7 @@ def is_location_matching_tags(tags, center, spider, cache_only=False):
return response
return are_locations_matching(tags, response[0], center)
def are_locations_matching(tags, location, center):
distance = shared.calculate_distance(center, location)
if distance > config.missing_shop_distance_in_kilometers_for_specific_case(tags):
@ -142,10 +150,12 @@ def are_locations_matching(tags, location, center):
else:
return True
def get_free_space_in_mb(path):
total, used, free = shutil.disk_usage(path)
return free / 1024 / 1024
def query_nominatim(query):
# Check if the response is in the cache
if query in nominatim_cache:
@ -221,6 +231,7 @@ def query_nominatim(query):
else:
response.raise_for_status()
# Example usage
# gptchat generated
if __name__ == '__main__':


@ -1,10 +1,12 @@
import nominatim
config = __import__("0_config")
def main():
with open(config.nominatim_requests_missing_from_cache()) as fp:
for query in fp:
nominatim.query_nominatim(query.strip())
if __name__ == "__main__":
main()

qa.py

@ -1,8 +1,9 @@
config = __import__("0_config")
import shops
import rich
import phonenumbers
import datetime
import phonenumbers
import rich
import shops
config = __import__("0_config")
def remove_bad_data(data, atp_code):
"""
@ -94,6 +95,7 @@ def remove_bad_data(data, atp_code):
del data[key]
return data
def is_empty_value(key, value, atp_code):
if value.lower() in ["undefined", "b/n", "---", "none", "n/a"]:
if config.is_null_specified_as_text_worth_mentioning(atp_code):
@ -114,11 +116,12 @@ def is_empty_value(key, value, atp_code):
return True
return False
def handle_ref_tag(data, atp_code):
if atp_code in ['paczkomat_inpost_pl', 'allegro_one_box_pl']:
return data # actual ref
return data # actual ref
elif atp_code in ['credit_agricole_pl']:
del data["ref"] # synthethic ref created by ATP
del data["ref"] # synthethic ref created by ATP
elif "ref" in data:
# https://github.com/alltheplaces/alltheplaces/blob/master/DATA_FORMAT.md describes `ref` and I am a bit confused
# > A unique identifier for this feature inside this spider. The code that generates the output will remove duplicates based on the value of this key.
@ -137,6 +140,7 @@ def handle_ref_tag(data, atp_code):
del data["ref"]
return data
def remove_bad_phone_data(data, atp_code):
if 'phone' in data:
if data['phone'].replace(" ", "").startswith("+443"):
@ -157,6 +161,7 @@ def remove_bad_phone_data(data, atp_code):
del data['phone']
return data
def is_valid_phone_tag(phone_tag):
if ";" not in phone_tag:
return is_valid_phone_number(phone_tag)
@ -166,13 +171,14 @@ def is_valid_phone_tag(phone_tag):
return False
return True
def is_valid_phone_number(phone):
if phone in [
'+4800000000', # https://github.com/alltheplaces/alltheplaces/issues/8633
]:
return False
try:
parsed = phonenumbers.parse(phone, None)
return phonenumbers.is_valid_number(parsed)
except phonenumbers.phonenumberutil.NumberParseException:
return False
if phone in [
'+4800000000', # https://github.com/alltheplaces/alltheplaces/issues/8633
]:
return False
try:
parsed = phonenumbers.parse(phone, None)
return phonenumbers.is_valid_number(parsed)
except phonenumbers.phonenumberutil.NumberParseException:
return False
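# Usage sketch (hypothetical values, not part of the committed code):
# is_valid_phone_number('+4800000000') -> False, explicitly listed placeholder number
# is_valid_phone_number('not a number') -> False, phonenumbers.parse() raises NumberParseException
# is_valid_phone_tag('+48 ...;+48 ...') checks every ';'-separated number and accepts the tag only if all are valid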

run.py

@ -5,6 +5,7 @@ obtain_atp_data = __import__("2_obtain_atp_data")
matcher = __import__("3_matcher")
show_data = __import__("4_show_data")
def main():
# TODO: test dependencies on fresh OS
# see readme for instructions how to install dependencies
@ -27,6 +28,7 @@ def main():
# maps listing various missing data - shops, tags, and of various wrong data (shop in OSM not in ATP and so on)
show_data.main()
if __name__ == "__main__":
main()


@ -2,6 +2,7 @@ import base64
import json
import csv
class Match:
def __init__(self, atp_center, atp_tags, osm_match_center, osm_match_tags, osm_link, match_distance, all_very_good_matches):
self.atp_center = atp_center
@ -11,16 +12,18 @@ class Match:
self.osm_link = osm_link
self.match_distance = match_distance
self.all_very_good_matches = all_very_good_matches
def __str__(self):
return "Match(" + str(self.atp_center) + ',' + str(self.atp_tags) + ',' + str(self.osm_match_center) + ',' + str(self.osm_match_tags) + ',' + str(self.osm_link) + ',' + str(self.match_distance) + ',' + str(self.all_very_good_matches) + ")"
def save_list_of_matches_to_csv(filepath, data):
with open(filepath, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(['atp_lat', 'atp_lon', 'atp_tags_dict_in_base64', 'osm_lat', 'osm_lon', 'osm_tags_dict_in_base64', 'osm_link', 'match_distance', 'all_very_good_matches'])
for entry in data:
if entry.match_distance == None:
writer.writerow([entry.atp_center['lat'], entry.atp_center['lon'], encode_to_base64_via_json(entry.atp_tags),"","","","","", ""])
writer.writerow([entry.atp_center['lat'], entry.atp_center['lon'], encode_to_base64_via_json(entry.atp_tags), "", "", "", "", "", ""])
else:
writer.writerow([
entry.atp_center['lat'],
@ -32,7 +35,8 @@ def save_list_of_matches_to_csv(filepath, data):
entry.osm_link,
entry.match_distance,
encode_to_base64_via_json(entry.all_very_good_matches)
])
])
def load_list_of_matches_from_csv(filepath):
try:
@ -52,7 +56,7 @@ def load_list_of_matches_from_csv(filepath):
osm_match_center = {'lat': float(row[3]), 'lon': float(row[4])}
osm_match_tags = decode_from_base64_via_json(row[5])
for key, value in osm_match_tags.items():
osm_match_tags[key] = str(value) # TODO - review saving code, this should not be needed
osm_match_tags[key] = str(value) # TODO - review saving code, this should not be needed
osm_link = row[6]
match_distance = float(row[7])
all_very_good_matches = decode_from_base64_via_json(row[8])
@ -62,7 +66,9 @@ def load_list_of_matches_from_csv(filepath):
print(filepath)
raise
#gptchat generated
# gptchat generated
def encode_to_base64_via_json(input_dict):
# Convert the dictionary to a JSON string
json_str = json.dumps(input_dict)
@ -74,7 +80,9 @@ def encode_to_base64_via_json(input_dict):
base64_str = base64_bytes.decode('utf-8')
return base64_str
#gptchat generated
# gptchat generated
def decode_from_base64_via_json(base64_str):
# Decode the Base64 string to bytes
base64_bytes = base64_str.encode('utf-8')
@ -86,18 +94,20 @@ def decode_from_base64_via_json(base64_str):
output_dict = json.loads(json_str)
return output_dict
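# Round-trip sketch (hypothetical value, not part of the committed code):
# decode_from_base64_via_json(encode_to_base64_via_json({'shop': 'bakery'}))
# returns {'shop': 'bakery'} again - the dict goes through JSON and then base64,
# so it can be stored safely in a single CSV cell of the match list files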
def generate_geojson_structure(dataset):
geojson_data = {"type": "FeatureCollection","features": []}
geojson_data = {"type": "FeatureCollection", "features": []}
for atp in dataset:
geojson_data['features'].append({"type": "Feature",
"geometry": {
"type": "Point",
"coordinates": [atp.atp_center['lon'], atp.atp_center['lat']]
},
"properties": atp.atp_tags
})
"geometry": {
"type": "Point",
"coordinates": [atp.atp_center['lon'], atp.atp_center['lat']]
},
"properties": atp.atp_tags
})
return geojson_data
def generate_kml_text(dataset):
geojson_data = generate_geojson_structure(dataset)
returned = """<?xml version="1.0" encoding="UTF-8"?>


@ -1,8 +1,10 @@
import geopy.distance
def link_to_point_in_osm(lat, lon):
return 'https://www.openstreetmap.org/?mlat=' + str(lat) + "&mlon=" + str(lon) + "#map=19/" + str(lat) + '/' + str(lon)
def calculate_distance(point_a, point_b):
# https://github.com/geopy/geopy?tab=readme-ov-file#measuring-distance
coords_1 = (point_a['lat'], point_a['lon'])


@ -14,4 +14,3 @@ class SpatialIndex:
# sort by longitude
# select quickly by longitude, leaving unlimited for latitude


@ -1,9 +1,10 @@
import distance_distribution
import url_checker
import leafleter
import serializing
import unittest
show_data = __import__("4_show_data")
import serializing
import leafleter
import url_checker
import distance_distribution
class IsCodeCompletelyCrashingSmoketests(unittest.TestCase):
def test_rough_code_validity(self):
@ -49,6 +50,7 @@ class IsCodeCompletelyCrashingSmoketests(unittest.TestCase):
for file in show_data.iterate_over_output_files('dummy_atp_code'):
pass
class TagListFormattingTests(unittest.TestCase):
def test_escaping_newlines(self):
self.assertEqual(show_data.escape_html("ajaj\naaaa"), "ajaj<br>aaaa")
@ -59,6 +61,7 @@ class TagListFormattingTests(unittest.TestCase):
def test_tag_list_generation_newline_in_tags_escape(self):
self.assertEqual("aaaa<br>bbb" in show_data.tag_list_to_html({"description": "aaaa\nbbb"}), True)
class PhoneSuggestingTests(unittest.TestCase):
def test_accept_normal_phone(self):
add_tags_from_atp = show_data.ATPGivesTagsReportCreator(url_checker.URLChecker(), 'dummy identifier for tests', 'dummy area name')
@ -88,6 +91,7 @@ class PhoneSuggestingTests(unittest.TestCase):
creator = show_data.ATPGivesTagsReportCreator(url_checker.URLChecker(), 'dummy_atp_code', 'dummy area name')
self.assertEqual(creator.is_phone_eligible(match), False)
class WebsiteSuggestingTests(unittest.TestCase):
def test_accept_normal_website(self):
add_tags_from_atp = show_data.ATPGivesTagsReportCreator(url_checker.URLChecker(), 'dummy identifier for tests', 'dummy area name')


@ -3,6 +3,7 @@ import link_scan_worker
import run
import unittest
class SmokeTest(unittest.TestCase):
def test_math(self):
self.assertEqual(2+2, 4)


@ -1,7 +1,8 @@
import serializing
import unittest
matcher = __import__("3_matcher")
config = __import__("0_config")
import serializing
class RealityTests(unittest.TestCase):
def test_match_on_exact_match(self):
@ -59,7 +60,6 @@ class RealityTests(unittest.TestCase):
matches = matcher.get_matches(osm_data, atp_data)
self.assertEqual(matches[0].match_distance, None)
def test_accept_matches_for_ice_cream_synonyms(self):
atp_data = [self.package_tags_into_mock({'brand': "Titan", 'amenity': 'ice_cream'})]
osm_data = [self.package_tags_into_mock({'brand': "Titan", 'shop': 'ice_cream'})]
@ -119,4 +119,4 @@ class RealityTests(unittest.TestCase):
matches = matcher.get_matches(osm_data, atp_data)
self.assertEqual(matches[0].match_distance, 0)
#TODO: how to handle shop=yes shop=vacant
# TODO: how to handle shop=yes shop=vacant


@ -1,6 +1,7 @@
import unittest
import qa
class RealityTests(unittest.TestCase):
def test_mathworks(self):
self.assertEqual(2 + 1, 3)


@ -1,6 +1,7 @@
import unittest
import spatial_index
class Tests(unittest.TestCase):
def test_basic_match_for_single_entry(self):
data = [
@ -96,7 +97,7 @@ class Tests(unittest.TestCase):
if entry["tags"] not in matches:
matches[entry["tags"]] = 0
matches[entry["tags"]] += 1
self.assertEqual(matches, {4: 1, 5:1, 6:1, 7:1})
self.assertEqual(matches, {4: 1, 5: 1, 6: 1, 7: 1})
def test_basic_match_for_all_entries_except_first(self):
data = [
@ -120,8 +121,7 @@ class Tests(unittest.TestCase):
if entry["tags"] not in matches:
matches[entry["tags"]] = 0
matches[entry["tags"]] += 1
self.assertEqual(matches, {4: 1, 5:1, 6:1, 7:1})
self.assertEqual(matches, {4: 1, 5: 1, 6: 1, 7: 1})
def test_basic_match_for_all_entries_except_last(self):
data = [
@ -144,4 +144,4 @@ class Tests(unittest.TestCase):
if entry["tags"] not in matches:
matches[entry["tags"]] = 0
matches[entry["tags"]] += 1
self.assertEqual(matches, {4: 1, 5:1, 6:1})
self.assertEqual(matches, {4: 1, 5: 1, 6: 1})


@ -6,6 +6,7 @@ import shutil
import time
config = __import__("0_config")
class URLChecker():
def __init__(self):
"""
@ -15,7 +16,7 @@ class URLChecker():
that later should have been disposed but were not
"""
self.url_check_cache = diskcache.Cache(self.cache_path())
urllib3.disable_warnings() # silences complaints about unverified requests via HTTPS
urllib3.disable_warnings() # silences complaints about unverified requests via HTTPS
# this is done to ignore complaints about "verify=False" in requests.get
# this is not so terrible as I only check is website up
# see https://stackoverflow.com/questions/78855740/starfield-ca-not-recoggnised-by-requests-package
@ -89,7 +90,7 @@ class URLChecker():
# https://salony.orange.pl/pl/orange-jastrz%C4%99bie-zdr%C3%B3j-galeria-zdr%C3%B3j-26882
pass
elif self.is_difference_limited_to_slash_at_end(atp_value, atp_after_redirect):
pass # just adding trailing / is not worth raising an alarm... I think?
pass # just adding trailing / is not worth raising an alarm... I think?
else:
self.consider_logging_that_atp_link_redirects(tested_key, atp_value, atp)
return False
@ -110,13 +111,13 @@ class URLChecker():
if link_a[-1] == "/":
link_a = link_a[:-1]
if link_b[-1] == "/":
link_b =link_b[:-1]
link_b = link_b[:-1]
return link_a == link_b
def consider_logging_that_atp_link_was_rejected(self, tested_key, atp_value, atp):
if atp.atp_tags['@spider'] not in [
'aldi_sud_de', # https://github.com/alltheplaces/alltheplaces/issues/9415
'true_value_us', # see above
'aldi_sud_de', # https://github.com/alltheplaces/alltheplaces/issues/9415
'true_value_us', # see above
]:
pass
#do not log problems as long as above issues are not fixed
@ -125,8 +126,8 @@ class URLChecker():
def consider_logging_that_atp_link_redirects(self, tested_key, atp_value, atp):
if atp.atp_tags["@spider"] not in [
'agata_meble_pl', # https://github.com/alltheplaces/alltheplaces/issues/9409
'bevmo_us', # https://github.com/alltheplaces/alltheplaces/issues/9493
'agata_meble_pl', # https://github.com/alltheplaces/alltheplaces/issues/9409
'bevmo_us', # https://github.com/alltheplaces/alltheplaces/issues/9493
]:
pass
#do not log problems as long as above issues are not fixed
@ -290,7 +291,7 @@ class URLChecker():
'sobeys.ca',
'zambrero.com',
'zambrero.com.au'
]:
]:
# handles also broken such as
# website = ps://www.biedronka.pl
for protocol in ["", "http://", "https://", "ps://"]:
@ -302,7 +303,7 @@ class URLChecker():
'https://www.circlek.pl/wyszukaj-stacje',
'http://www.statoil.pl',
'Biedronka.PL',
'https://www.aldi-sued.de/de/homepage.html', # seems to be added by some ATP?
'https://www.aldi-sued.de/de/homepage.html', # seems to be added by some ATP?
'https://allegro.pl/kampania/one/znajdz-nas',
'https://allegro.pl/kampania/one',
'https://www.castorama.pl',
@ -386,11 +387,10 @@ class URLChecker():
if self.get_free_space_in_mb(self.cache_path()) < 400:
raise Exception("running out of free space on drive")
print(link, reason)
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
}
# NOTE: SSL verification checks are disabled
# to keep https://aviastacjapaliw.pl/stacje/avia-protasy/ working


@ -1,6 +1,7 @@
import unittest
import url_checker
class LinkCheckingTests(unittest.TestCase):
def test_link_rejector_rejecting_known_bad(self):
test = url_checker.URLChecker()