1
0
Fork 0

let autopep8 do some pointless changes

OK, one or two were helpful

I rejected the harmful ones and let the pointless and OK ones go through
This commit is contained in:
Mateusz Konieczny 2025-01-15 16:26:41 +01:00
parent 91590fb3c9
commit cc91eb1da2
24 changed files with 565 additions and 441 deletions

View file

@ -11,22 +11,27 @@ import rich
# see https://github.com/theskumar/python-dotenv
dotenv.load_dotenv()
def allow_extremely_low_priority_atp_logging():
# not an ideal metric, but avoids asking me to be active in ATP if ATP is bottlenecked
return len(known_broken_addr_full_spiders()) == 0 and allow_very_low_priority_atp_logging()
def allow_very_low_priority_atp_logging():
# not an ideal metric, but avoids asking me to be active in ATP if ATP is bottlenecked
return len(known_broken_addr_full_spiders()) < 5 and allow_low_priority_atp_logging()
def allow_low_priority_atp_logging():
# not an ideal metric, but avoids asking me to be active in ATP if ATP is bottlenecked
return len(known_broken_addr_full_spiders()) < 20 and allow_normal_priority_atp_logging()
def allow_normal_priority_atp_logging():
# not an ideal metric, but avoids asking me to be active in ATP if ATP is bottlenecked
return len(known_broken_addr_full_spiders()) < 30
def publish_geojson_when_generating_reports():
return False
@ -53,7 +58,7 @@ def is_nonlocal_phone_worth_mentioning(atp_code):
def is_bogus_key_worth_mentioning(key, atp_code):
if atp_code == 'atp_code':
return False # from tests, synthetic data, lets not spam in tests
return False # from tests, synthetic data, lets not spam in tests
if key in ["branch", "branch:en"]:
# this data is just ignored - lets not spend effort on improving it
return False
@ -85,101 +90,101 @@ def is_bogus_key_worth_mentioning(key, atp_code):
return False
return atp_code not in [
'credit_union_us', # https://github.com/alltheplaces/alltheplaces/issues/11185
'dominos_pizza_bh', # https://github.com/alltheplaces/alltheplaces/issues/11160
'dominos_pizza_hr', # looks like the same case, checked and looks invalid
'dominos_pizza_ec', # not checked is it invalid, assumed to be for now
'dominos_pizza_bh', # https://github.com/alltheplaces/alltheplaces/issues/11160
'dominos_pizza_hr', # looks like the same case, checked and looks invalid
'dominos_pizza_ec', # not checked is it invalid, assumed to be for now
]
if key == "type":
return atp_code not in [
'kaiser_permanente_us', # https://github.com/alltheplaces/alltheplaces/pull/11785 TODO merged
'kaiser_permanente_us', # https://github.com/alltheplaces/alltheplaces/pull/11785 TODO merged
]
if key == "addr:housenumber":
return atp_code not in [
'worldcat', # https://github.com/alltheplaces/alltheplaces/issues/11198
'tradelink_au', # just a single case, report after this list is clean
'popeyes_sg', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'vida_e_caffe', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'salvos_au', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'banxico_mx', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'ray_white_au_nz', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'orlen', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'ulybka_radugi_ru', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'charge_place_scotland_gb', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'ship_and_go_ro', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'worldcat', # https://github.com/alltheplaces/alltheplaces/issues/11198
'tradelink_au', # just a single case, report after this list is clean
'popeyes_sg', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'vida_e_caffe', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'salvos_au', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'banxico_mx', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'ray_white_au_nz', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'orlen', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'ulybka_radugi_ru', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'charge_place_scotland_gb', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'ship_and_go_ro', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
]
if key == "addr:full":
# note: use is_addr_full_known_to_be_broken if geocoding should not be attempted there
if atp_code in [
# not blocking geocoding
'rostics_ru', # lower priority, tricky script TODO handle
'brico_ok_it', # see https://github.com/alltheplaces/alltheplaces/pull/11738
'barrhead_travel_gb', # https://github.com/alltheplaces/alltheplaces/pull/11739#issuecomment-2567184196
'western_union', # https://github.com/alltheplaces/alltheplaces/issues/11838
'burger_king_es_pt', # country info missing
'leon', # sometimes country info missing
'gov_cma_fuel_gb', # sometimes includes place name, sometimes not
'vic_free_wifi_au', # mostly street address... Still, free wifi, not POIs - not going to care about this
'accor', # sometimes includes place name, sometimes not - rather small number is affected (?)
'uniqlo', # seems to include city names, at least in English. Still, missing countries
'dominos_pizza_om', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'berlin_doner_kebap_pl', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'baskin_robbins_ca', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'the_courier_guy_za', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'pizza_hut_kr', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'costa_coffee', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'mcdonalds_es', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'clothing_junction_za', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'lhw', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'gov_dfe_gias_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'romans_pizza', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'newyorker', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'amcal_au', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'glassons', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'ctm', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'columbia_us', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'subway_kr', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'russells_za', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'lukoil', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'blooms_the_chemist_au', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'dominos_pizza_ae', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'rage_za', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'easypay', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'mr_liquor_au', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'library_institute_hu', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'mtexx_bg', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'hyundai_bw_ls_na_sz_za', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'gridserve_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'coffee_like', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'lewis_stores', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'lilly_bg', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'dunkin_sa', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'mcdonalds_hk', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'us_army_national_guard', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
't_market_bg', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'texaco_central_america', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'wesola_pani', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'ptt_th', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'baby_bunting', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'kfc_sg', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'cooplands_doncaster_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'jack_wills_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'bras_n_things', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'mer_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'wingstop_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'safeway_ca', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'papa_johns_az', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'kia_kr', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'burger_king_cy', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'dusk_au', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'agnvet_au', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'real_hu', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'burger_king_il', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'burger_king_cn', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'church_of_england_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'magasin_vert_fr', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'cobasi_br', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'i_and_g_brokers', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'ewiva_it', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'rostics_ru', # lower priority, tricky script TODO handle
'brico_ok_it', # see https://github.com/alltheplaces/alltheplaces/pull/11738
'barrhead_travel_gb', # https://github.com/alltheplaces/alltheplaces/pull/11739#issuecomment-2567184196
'western_union', # https://github.com/alltheplaces/alltheplaces/issues/11838
'burger_king_es_pt', # country info missing
'leon', # sometimes country info missing
'gov_cma_fuel_gb', # sometimes includes place name, sometimes not
'vic_free_wifi_au', # mostly street address... Still, free wifi, not POIs - not going to care about this
'accor', # sometimes includes place name, sometimes not - rather small number is affected (?)
'uniqlo', # seems to include city names, at least in English. Still, missing countries
'dominos_pizza_om', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'berlin_doner_kebap_pl', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'baskin_robbins_ca', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'the_courier_guy_za', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'pizza_hut_kr', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'costa_coffee', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'mcdonalds_es', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'clothing_junction_za', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'lhw', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'gov_dfe_gias_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'romans_pizza', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'newyorker', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'amcal_au', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'glassons', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'ctm', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'columbia_us', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'subway_kr', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'russells_za', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'lukoil', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'blooms_the_chemist_au', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'dominos_pizza_ae', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'rage_za', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'easypay', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'mr_liquor_au', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'library_institute_hu', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'mtexx_bg', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'hyundai_bw_ls_na_sz_za', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'gridserve_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'coffee_like', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'lewis_stores', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'lilly_bg', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'dunkin_sa', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'mcdonalds_hk', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'us_army_national_guard', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
't_market_bg', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'texaco_central_america', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'wesola_pani', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'ptt_th', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'baby_bunting', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'kfc_sg', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'cooplands_doncaster_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'jack_wills_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'bras_n_things', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'mer_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'wingstop_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'safeway_ca', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'papa_johns_az', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'kia_kr', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'burger_king_cy', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'dusk_au', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'agnvet_au', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'real_hu', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'burger_king_il', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'burger_king_cn', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'church_of_england_gb', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'magasin_vert_fr', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'cobasi_br', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'i_and_g_brokers', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
'ewiva_it', # https://github.com/alltheplaces/alltheplaces/issues/11797 - just few entries
]:
return False
if atp_code in [
@ -187,9 +192,9 @@ def is_bogus_key_worth_mentioning(key, atp_code):
'familymart_tw',
'starbucks_jp',
]:
return False # tricky case - partially false positive (nonASCII), has also city, just country missing etc. Lets ignore this.
return False # tricky case - partially false positive (nonASCII), has also city, just country missing etc. Lets ignore this.
if atp_code.endswith('_ru') or atp_code.endswith('_tw') or atp_code.endswith('_jp') or atp_code.endswith('_eg'):
return False # likely tricky - TODO handle
return False # likely tricky - TODO handle
if is_addr_full_known_to_be_broken(atp_code):
return False
return allow_normal_priority_atp_logging()
@ -200,32 +205,32 @@ def is_bogus_key_worth_mentioning(key, atp_code):
return False
if key == "addr:full:en":
if atp_code in [
'starbucks_cn', # https://github.com/alltheplaces/alltheplaces/issues/11870
'starbucks_cn', # https://github.com/alltheplaces/alltheplaces/issues/11870
]:
return False
return allow_low_priority_atp_logging()
if key == "addr:street_address:en":
if atp_code in [
'starbucks_cn', # https://github.com/alltheplaces/alltheplaces/issues/11870
'starbucks_cn', # https://github.com/alltheplaces/alltheplaces/issues/11870
]:
return False
return allow_low_priority_atp_logging()
if key == "addr:district":
return atp_code not in [
'kooperatifmarket_tr', # https://github.com/alltheplaces/alltheplaces/issues/11184
'kooperatifmarket_tr', # https://github.com/alltheplaces/alltheplaces/issues/11184
]
if key == "ref":
return atp_code not in [
'kooperatifmarket_tr', # https://github.com/alltheplaces/alltheplaces/issues/11184
'tui', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'texas_department_of_transportation_us', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'northern_california_breweries', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'teboil_ru', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'burger_king_tr', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'kooperatifmarket_tr', # https://github.com/alltheplaces/alltheplaces/issues/11184
'tui', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'texas_department_of_transportation_us', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'northern_california_breweries', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'teboil_ru', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
'burger_king_tr', # https://github.com/alltheplaces/alltheplaces/issues/11790 - whitespace (fixed by my processing)
]
if key == "note":
return atp_code not in [
'stadt_zuerich_ch', # https://github.com/alltheplaces/alltheplaces/issues/11182
'stadt_zuerich_ch', # https://github.com/alltheplaces/alltheplaces/issues/11182
]
if key == "contact:linkedin":
return True
@ -239,23 +244,23 @@ def is_bogus_key_worth_mentioning(key, atp_code):
return atp_code not in [
]
if key == "website":
return False # https://github.com/alltheplaces/alltheplaces/issues/11736
return False # https://github.com/alltheplaces/alltheplaces/issues/11736
return atp_code not in [
'aldi_sud_de', # https://github.com/alltheplaces/alltheplaces/issues/9415
'agata_meble_pl', # https://github.com/alltheplaces/alltheplaces/issues/9409
'cheddars_scratch_kitchen', # https://github.com/alltheplaces/alltheplaces/issues/11205
'nravizza_by', # https://github.com/alltheplaces/alltheplaces/pull/11707 - TODO merged
'aldi_sud_de', # https://github.com/alltheplaces/alltheplaces/issues/9415
'agata_meble_pl', # https://github.com/alltheplaces/alltheplaces/issues/9409
'cheddars_scratch_kitchen', # https://github.com/alltheplaces/alltheplaces/issues/11205
'nravizza_by', # https://github.com/alltheplaces/alltheplaces/pull/11707 - TODO merged
]
if key == "website:menu":
return False # waiting for https://github.com/alltheplaces/alltheplaces/issues/11736 first
return False # waiting for https://github.com/alltheplaces/alltheplaces/issues/11736 first
if key == "website:orders":
return False # waiting for https://github.com/alltheplaces/alltheplaces/issues/11736 first
return False # waiting for https://github.com/alltheplaces/alltheplaces/issues/11736 first
if key == "contact:instagram":
return False # waiting for https://github.com/alltheplaces/alltheplaces/issues/11736 first
return False # waiting for https://github.com/alltheplaces/alltheplaces/issues/11736 first
if key == "contact:twitter":
return False # waiting for https://github.com/alltheplaces/alltheplaces/issues/11736 first
return False # waiting for https://github.com/alltheplaces/alltheplaces/issues/11736 first
if key == "website:jp":
return False # low priority ATP bug, lets not deal with it for now...
return False # low priority ATP bug, lets not deal with it for now...
return atp_code not in [
]
if key in ["ref", "operator:website"]:
@ -266,7 +271,7 @@ def is_bogus_key_worth_mentioning(key, atp_code):
def is_mismatching_name_worth_mentioning(atp_code):
if atp_code in [
#'lewiatan_pl', # https://www.openstreetmap.org/note/4436169
# 'lewiatan_pl', # https://www.openstreetmap.org/note/4436169
# see also https://www.openstreetmap.org/note/4349666 for Lewiatan
# and https://www.openstreetmap.org/note/4349667
'blue_bottle_liquors_za', # requires local knowledge - ask after higher ranked ones (including PRs) are processed
@ -281,12 +286,13 @@ def is_mismatching_name_worth_mentioning(atp_code):
'totally_workwear_au', 'bi_mart_us', 'carls_jr_au', 'crocs_za',
# OpenStreetMap is wrong
'vinmonopolet_no' # https://github.com/alltheplaces/alltheplaces/pull/10982#issuecomment-2402392856
'vinmonopolet_no' # https://github.com/alltheplaces/alltheplaces/pull/10982#issuecomment-2402392856
]:
return False
# https://github.com/alltheplaces/alltheplaces/issues/11015
return False
def is_missing_main_tag_worth_mentioning(atp_code):
already_known = [
'king_kullen_us', # https://github.com/alltheplaces/alltheplaces/issues/10987
@ -296,11 +302,11 @@ def is_missing_main_tag_worth_mentioning(atp_code):
'kia_us', # https://github.com/alltheplaces/alltheplaces/issues/10885 (tricky, waits)
'bonita', # https://github.com/alltheplaces/alltheplaces/issues/10934
'quality_dairy_us', # https://github.com/alltheplaces/alltheplaces/issues/9660 (blocks non-USA traffic)
'blyzenko_ua', # https://github.com/alltheplaces/alltheplaces/issues/11697
'waterdrop', # https://github.com/alltheplaces/alltheplaces/issues/11698
'house_au', # https://github.com/alltheplaces/alltheplaces/issues/11699
'medical_city_healthcare', # https://github.com/alltheplaces/alltheplaces/issues/11700
'bens_cookies', # https://github.com/alltheplaces/alltheplaces/issues/11701
'blyzenko_ua', # https://github.com/alltheplaces/alltheplaces/issues/11697
'waterdrop', # https://github.com/alltheplaces/alltheplaces/issues/11698
'house_au', # https://github.com/alltheplaces/alltheplaces/issues/11699
'medical_city_healthcare', # https://github.com/alltheplaces/alltheplaces/issues/11700
'bens_cookies', # https://github.com/alltheplaces/alltheplaces/issues/11701
'maserati', # has many actually empty entries
'suzuki_marine_au', # looks tricky
'coop_food_gb', # do not see how info may be recovered, maybe someone with local knowledge can
@ -319,7 +325,7 @@ def is_missing_main_tag_worth_mentioning(atp_code):
if atp_code in already_known:
return False
if len(already_known) > 25:
return False # bottleneck is not in reporting, why care about this?
return False # bottleneck is not in reporting, why care about this?
if allow_low_priority_atp_logging():
raise Exception("take into account date of file to avoid pointless reports, see latest_date_of_file_commit.py file and see https://github.com/alltheplaces/alltheplaces/issues/10990#issuecomment-2404066559")
@ -331,16 +337,16 @@ def is_null_specified_as_text_worth_mentioning(atp_code):
if allow_very_low_priority_atp_logging():
return False
return atp_code not in [
'ship_and_go_ro', # https://github.com/alltheplaces/alltheplaces/issues/11201
'whataburger', # https://github.com/alltheplaces/alltheplaces/issues/11202
'pandora', # https://github.com/alltheplaces/alltheplaces/issues/11203
'ship_and_go_ro', # https://github.com/alltheplaces/alltheplaces/issues/11201
'whataburger', # https://github.com/alltheplaces/alltheplaces/issues/11202
'pandora', # https://github.com/alltheplaces/alltheplaces/issues/11203
]
def is_failed_geocoding_unexpected(atp_code):
if atp_code in [
'tegut_de', # https://github.com/alltheplaces/alltheplaces/issues/9212
'fastned', 'ljsilvers', 'wells_fargo', 'coffee_like', # not bothered with reporting, remove here if someone else fixed some ATP issues I reported
'fastned', 'ljsilvers', 'wells_fargo', 'coffee_like', # not bothered with reporting, remove here if someone else fixed some ATP issues I reported
]:
return False
if is_addr_street_address_known_to_be_broken(atp_code):
@ -353,114 +359,121 @@ def is_failed_geocoding_unexpected(atp_code):
return False
return allow_normal_priority_atp_logging()
def is_addr_postcode_known_to_be_broken(atp_code):
# addr:postcode
return atp_code in [
'century_21', # https://github.com/alltheplaces/alltheplaces/issues/11734
]
def is_addr_street_address_known_to_be_broken(atp_code):
# addr:street_address
return atp_code in [
'skechers', # https://github.com/alltheplaces/alltheplaces/issues/10967
'just_group', # https://github.com/alltheplaces/alltheplaces/issues/10360
'petrol_bg', # https://github.com/alltheplaces/alltheplaces/issues/11186
'mussala_bg', # https://github.com/alltheplaces/alltheplaces/issues/11244
'otr_au', # https://github.com/alltheplaces/alltheplaces/pull/11733#issuecomment-2567035353
'petrol_bg', # https://github.com/alltheplaces/alltheplaces/issues/11186
'mussala_bg', # https://github.com/alltheplaces/alltheplaces/issues/11244
'otr_au', # https://github.com/alltheplaces/alltheplaces/pull/11733#issuecomment-2567035353
]
def is_addr_full_known_to_be_broken(atp_code):
# addr:full
return atp_code in known_broken_addr_full_spiders()
def known_broken_addr_full_spiders():
# addr:full
return [
# 2+ weeks old ones
'harcourts', # https://github.com/alltheplaces/alltheplaces/pull/11703 TODO merged
'retail_apparel_group', # https://github.com/alltheplaces/alltheplaces/pull/11704 TODO merged
'a1_bg', # https://github.com/alltheplaces/alltheplaces/pull/11751 TODO merged
'ifly_ca_us', # https://github.com/alltheplaces/alltheplaces/pull/11760 TODO merged
'buildit', # https://github.com/alltheplaces/alltheplaces/pull/11757 TODO merged
'wells_fargo', # https://github.com/alltheplaces/alltheplaces/pull/11750 TODO merged
'vallarta_us', # https://github.com/alltheplaces/alltheplaces/pull/11737 TODO merged
'harcourts', # https://github.com/alltheplaces/alltheplaces/pull/11703 TODO merged
'retail_apparel_group', # https://github.com/alltheplaces/alltheplaces/pull/11704 TODO merged
'a1_bg', # https://github.com/alltheplaces/alltheplaces/pull/11751 TODO merged
'ifly_ca_us', # https://github.com/alltheplaces/alltheplaces/pull/11760 TODO merged
'buildit', # https://github.com/alltheplaces/alltheplaces/pull/11757 TODO merged
'wells_fargo', # https://github.com/alltheplaces/alltheplaces/pull/11750 TODO merged
'vallarta_us', # https://github.com/alltheplaces/alltheplaces/pull/11737 TODO merged
'play_pl', # https://github.com/alltheplaces/alltheplaces/pull/11796 TODO merged
'rodda_paint_us', # https://github.com/alltheplaces/alltheplaces/pull/11795 TODO merged
'ziko_apteka_pl', # https://github.com/alltheplaces/alltheplaces/pull/11794 TODO merged
'videopro_au', # https://github.com/alltheplaces/alltheplaces/pull/11793 TODO merged
'plus_pl', # https://github.com/alltheplaces/alltheplaces/pull/11779 TODO merged
'carinos', # https://github.com/alltheplaces/alltheplaces/pull/11780 TODO merged
'ljsilvers', # https://github.com/alltheplaces/alltheplaces/pull/11781 TODO merged
'nissan_cz', # https://github.com/alltheplaces/alltheplaces/pull/11782 TODO merged
'thelins_konditori_se', # https://github.com/alltheplaces/alltheplaces/pull/11783 TODO merged
'move_yourself_au', # https://github.com/alltheplaces/alltheplaces/pull/11792 TODO merged
'swedbank_ee', # https://github.com/alltheplaces/alltheplaces/pull/11805 TODO merged
'paint_spot_au', # https://github.com/alltheplaces/alltheplaces/pull/11801 TODO merged
'gamestop', # https://github.com/alltheplaces/alltheplaces/pull/11817 TODO merged
'dsk_bank_bg', # https://github.com/alltheplaces/alltheplaces/pull/11810 TODO merged
'empik_pl', # https://github.com/alltheplaces/alltheplaces/pull/11814 TODO merged
'coop_alleanza_it', # https://github.com/alltheplaces/alltheplaces/pull/11815 TODO merged
'two_men_and_a_truck', # https://github.com/alltheplaces/alltheplaces/issues/11797 https://github.com/alltheplaces/alltheplaces/pull/11816 TODO merged
'mcdonalds_latin_america', # https://github.com/alltheplaces/alltheplaces/pull/11863 TODO merged
'petstock_au', # https://github.com/alltheplaces/alltheplaces/pull/11864 TODO merged
'crown_decorating_centres_gb', # https://github.com/alltheplaces/alltheplaces/pull/11865 TODO merged
'torchys_tacos', # https://github.com/alltheplaces/alltheplaces/pull/11866 TODO merged
'kaisercraft_au', # https://github.com/alltheplaces/alltheplaces/pull/11867 TODO merged
'tag_heuer', # https://github.com/alltheplaces/alltheplaces/pull/11869 TODO merged
'fastned', # https://github.com/alltheplaces/alltheplaces/pull/11857 TODO merged
'snap_fitness', # https://github.com/alltheplaces/alltheplaces/pull/11858 TODO merged
'kfc_it', # https://github.com/alltheplaces/alltheplaces/pull/11859 TODO merged
'tops', # https://github.com/alltheplaces/alltheplaces/pull/11860 TODO merged
'ccbank_bg', # https://github.com/alltheplaces/alltheplaces/pull/11861 TODO merged
'ymca', # https://github.com/alltheplaces/alltheplaces/issues/11797 https://github.com/alltheplaces/alltheplaces/pull/11819 TODO merged
'wingstop', # https://github.com/alltheplaces/alltheplaces/pull/11818 TODO merged
'byd_auto_au', # https://github.com/alltheplaces/alltheplaces/pull/11854 TODO merged
'unicredit_bulbank_bg', # https://github.com/alltheplaces/alltheplaces/pull/11884 TODO merged
'alaska_commercial_company', # https://github.com/alltheplaces/alltheplaces/pull/11909 TODO merged
'thiele_dk', # https://github.com/alltheplaces/alltheplaces/pull/11889 TODO merged
'equatorial_coffee_za', # https://github.com/alltheplaces/alltheplaces/pull/11881 TODO merged
'toyota_au', # https://github.com/alltheplaces/alltheplaces/pull/11800 TODO merged
'play_pl', # https://github.com/alltheplaces/alltheplaces/pull/11796 TODO merged
'rodda_paint_us', # https://github.com/alltheplaces/alltheplaces/pull/11795 TODO merged
'ziko_apteka_pl', # https://github.com/alltheplaces/alltheplaces/pull/11794 TODO merged
'videopro_au', # https://github.com/alltheplaces/alltheplaces/pull/11793 TODO merged
'plus_pl', # https://github.com/alltheplaces/alltheplaces/pull/11779 TODO merged
'carinos', # https://github.com/alltheplaces/alltheplaces/pull/11780 TODO merged
'ljsilvers', # https://github.com/alltheplaces/alltheplaces/pull/11781 TODO merged
'nissan_cz', # https://github.com/alltheplaces/alltheplaces/pull/11782 TODO merged
'thelins_konditori_se', # https://github.com/alltheplaces/alltheplaces/pull/11783 TODO merged
'move_yourself_au', # https://github.com/alltheplaces/alltheplaces/pull/11792 TODO merged
'swedbank_ee', # https://github.com/alltheplaces/alltheplaces/pull/11805 TODO merged
'paint_spot_au', # https://github.com/alltheplaces/alltheplaces/pull/11801 TODO merged
'gamestop', # https://github.com/alltheplaces/alltheplaces/pull/11817 TODO merged
'dsk_bank_bg', # https://github.com/alltheplaces/alltheplaces/pull/11810 TODO merged
'empik_pl', # https://github.com/alltheplaces/alltheplaces/pull/11814 TODO merged
'coop_alleanza_it', # https://github.com/alltheplaces/alltheplaces/pull/11815 TODO merged
'two_men_and_a_truck', # https://github.com/alltheplaces/alltheplaces/issues/11797 https://github.com/alltheplaces/alltheplaces/pull/11816 TODO merged
'mcdonalds_latin_america', # https://github.com/alltheplaces/alltheplaces/pull/11863 TODO merged
'petstock_au', # https://github.com/alltheplaces/alltheplaces/pull/11864 TODO merged
'crown_decorating_centres_gb', # https://github.com/alltheplaces/alltheplaces/pull/11865 TODO merged
'torchys_tacos', # https://github.com/alltheplaces/alltheplaces/pull/11866 TODO merged
'kaisercraft_au', # https://github.com/alltheplaces/alltheplaces/pull/11867 TODO merged
'tag_heuer', # https://github.com/alltheplaces/alltheplaces/pull/11869 TODO merged
'fastned', # https://github.com/alltheplaces/alltheplaces/pull/11857 TODO merged
'snap_fitness', # https://github.com/alltheplaces/alltheplaces/pull/11858 TODO merged
'kfc_it', # https://github.com/alltheplaces/alltheplaces/pull/11859 TODO merged
'tops', # https://github.com/alltheplaces/alltheplaces/pull/11860 TODO merged
'ccbank_bg', # https://github.com/alltheplaces/alltheplaces/pull/11861 TODO merged
'ymca', # https://github.com/alltheplaces/alltheplaces/issues/11797 https://github.com/alltheplaces/alltheplaces/pull/11819 TODO merged
'wingstop', # https://github.com/alltheplaces/alltheplaces/pull/11818 TODO merged
'byd_auto_au', # https://github.com/alltheplaces/alltheplaces/pull/11854 TODO merged
'unicredit_bulbank_bg', # https://github.com/alltheplaces/alltheplaces/pull/11884 TODO merged
'alaska_commercial_company', # https://github.com/alltheplaces/alltheplaces/pull/11909 TODO merged
'thiele_dk', # https://github.com/alltheplaces/alltheplaces/pull/11889 TODO merged
'equatorial_coffee_za', # https://github.com/alltheplaces/alltheplaces/pull/11881 TODO merged
'toyota_au', # https://github.com/alltheplaces/alltheplaces/pull/11800 TODO merged
'paris_baguette_kr', # https://github.com/alltheplaces/alltheplaces/issues/11797
'easybox_bg', # https://github.com/alltheplaces/alltheplaces/issues/11797
'twin_peaks', # some are not street address either
'coen_markets_us', # https://github.com/alltheplaces/alltheplaces/issues/11797
'seven_eleven_ph', # https://github.com/alltheplaces/alltheplaces/issues/11804
'sherwin_williams', # https://github.com/alltheplaces/alltheplaces/issues/11797
'spar_aspiag', # https://github.com/alltheplaces/alltheplaces/issues/11797
'united_dairy_farmers_us', # https://github.com/alltheplaces/alltheplaces/issues/11797
'woops_us', # https://github.com/alltheplaces/alltheplaces/issues/11797
'easybox_hu', # https://github.com/alltheplaces/alltheplaces/issues/11797
'woolworths_za', # https://github.com/alltheplaces/alltheplaces/issues/11146
'philz_coffee_us', # https://github.com/alltheplaces/alltheplaces/issues/11147
'anthropologie', # https://github.com/alltheplaces/alltheplaces/issues/11199
'mussala_bg', # https://github.com/alltheplaces/alltheplaces/issues/11244
'asics_us', # https://github.com/alltheplaces/alltheplaces/issues/11702
'primaprix', # https://github.com/alltheplaces/alltheplaces/issues/11709
'otr_au', # https://github.com/alltheplaces/alltheplaces/pull/11733 and https://github.com/alltheplaces/alltheplaces/pull/11733#issuecomment-2567035353
'break_and_wash_pl', # city name included, country name missing
'brico_ok_it', # https://github.com/alltheplaces/alltheplaces/pull/11738 - look at my own comments
'kay_jewelers', # https://github.com/alltheplaces/alltheplaces/pull/11758
'odido_pl', # includes locations, country missing
'systeme_u', # https://github.com/alltheplaces/alltheplaces/pull/11883
'easybox_ro' # see https://github.com/alltheplaces/alltheplaces/blob/master/locations/spiders/easybox_ro.py - needs changes to 13_generate_atp_issue_tracker_report.py to autofix it
'paris_baguette_kr', # https://github.com/alltheplaces/alltheplaces/issues/11797
'easybox_bg', # https://github.com/alltheplaces/alltheplaces/issues/11797
'twin_peaks', # some are not street address either
'coen_markets_us', # https://github.com/alltheplaces/alltheplaces/issues/11797
'seven_eleven_ph', # https://github.com/alltheplaces/alltheplaces/issues/11804
'sherwin_williams', # https://github.com/alltheplaces/alltheplaces/issues/11797
'spar_aspiag', # https://github.com/alltheplaces/alltheplaces/issues/11797
'united_dairy_farmers_us', # https://github.com/alltheplaces/alltheplaces/issues/11797
'woops_us', # https://github.com/alltheplaces/alltheplaces/issues/11797
'easybox_hu', # https://github.com/alltheplaces/alltheplaces/issues/11797
'woolworths_za', # https://github.com/alltheplaces/alltheplaces/issues/11146
'philz_coffee_us', # https://github.com/alltheplaces/alltheplaces/issues/11147
'anthropologie', # https://github.com/alltheplaces/alltheplaces/issues/11199
'mussala_bg', # https://github.com/alltheplaces/alltheplaces/issues/11244
'asics_us', # https://github.com/alltheplaces/alltheplaces/issues/11702
'primaprix', # https://github.com/alltheplaces/alltheplaces/issues/11709
'otr_au', # https://github.com/alltheplaces/alltheplaces/pull/11733 and https://github.com/alltheplaces/alltheplaces/pull/11733#issuecomment-2567035353
'break_and_wash_pl', # city name included, country name missing
'brico_ok_it', # https://github.com/alltheplaces/alltheplaces/pull/11738 - look at my own comments
'kay_jewelers', # https://github.com/alltheplaces/alltheplaces/pull/11758
'odido_pl', # includes locations, country missing
'systeme_u', # https://github.com/alltheplaces/alltheplaces/pull/11883
'easybox_ro' # see https://github.com/alltheplaces/alltheplaces/blob/master/locations/spiders/easybox_ro.py - needs changes to 13_generate_atp_issue_tracker_report.py to autofix it
]
def is_addr_city_known_to_be_broken(atp_code):
# addr:city
return atp_code in known_broken_addr_city_spiders()
def known_broken_addr_city_spiders():
return [
'mussala_bg', # https://github.com/alltheplaces/alltheplaces/issues/11244
'pandora', # https://github.com/alltheplaces/alltheplaces/issues/11245
'mcdonalds_cz', # https://github.com/alltheplaces/alltheplaces/issues/11708 - not utterly invalid apparently
'la_anonima_ar', # https://github.com/alltheplaces/alltheplaces/issues/11786
'shell', # https://github.com/alltheplaces/alltheplaces/issues/11788
'spar_bw_mz_na_sz_za', # https://github.com/alltheplaces/alltheplaces/issues/11789
'mussala_bg', # https://github.com/alltheplaces/alltheplaces/issues/11244
'pandora', # https://github.com/alltheplaces/alltheplaces/issues/11245
'mcdonalds_cz', # https://github.com/alltheplaces/alltheplaces/issues/11708 - not utterly invalid apparently
'la_anonima_ar', # https://github.com/alltheplaces/alltheplaces/issues/11786
'shell', # https://github.com/alltheplaces/alltheplaces/issues/11788
'spar_bw_mz_na_sz_za', # https://github.com/alltheplaces/alltheplaces/issues/11789
]
def is_missing_brand_field_worth_mentioning(atp_code):
if atp_code in [
'lukoil', 'mol', # too complex for me - multibrand spiders
@ -496,10 +509,11 @@ def is_missing_brand_field_worth_mentioning(atp_code):
def missing_brand_wikidata_worth_mentioning(atp_code):
if atp_code in [
'conad_it', # https://github.com/alltheplaces/alltheplaces/issues/11950
'conad_it', # https://github.com/alltheplaces/alltheplaces/issues/11950
]:
return False
return allow_very_low_priority_atp_logging() # https://github.com/alltheplaces/alltheplaces/issues/11950
return allow_very_low_priority_atp_logging() # https://github.com/alltheplaces/alltheplaces/issues/11950
def is_empty_file_for_spider_worth_mentioning(atp_code):
if atp_code in [
@ -600,10 +614,10 @@ def opening_hours_key():
def keys_with_value_link():
returned = ["website", 'website_may_be_broken', "operator:website", 'website:en', 'website:fr', 'website:de', 'website:kr', 'website:cn', 'website:menu', 'website:orders', 'website:orders:en', 'website:orders:ar', 'reservation:website',
'contact:webcam', # should not be contact...
'contact:tripadvisor', 'contact:yelp',
'source:website', 'brand:website', # probably should be eliminated - TODO
]
'contact:webcam', # should not be contact...
'contact:tripadvisor', 'contact:yelp',
'source:website', 'brand:website', # probably should be eliminated - TODO
]
for code in language_tag_knowledge.all_iso_639_1_language_codes():
if "website:" + code not in returned:
returned.append("website:" + code)
@ -616,11 +630,13 @@ def keys_with_value_link():
# TODO what about website:orders
# TODO website:fr
def keys_with_possible_link():
return ["image", "@source_uri", "atp_ref", 'ref', "contact:facebook", 'contact:youtube', 'contact:yelp', 'contact:twitter', 'contact:instagram', 'contact:linkedin', 'contact:tiktok', 'contact:tripadvisor', 'operator:facebook', 'operator:twitter',
'icon', # also in atp_tags_to_be_remove_completely_and_ignored
]
'icon', # also in atp_tags_to_be_remove_completely_and_ignored
]
def atp_tags_very_likely_not_usable_for_osm_import(tags):
returned = [
@ -663,7 +679,7 @@ def atp_tags_very_likely_not_usable_for_osm_import(tags):
# bogus name tags has a higher priority anyway
#
# so lets throw it away without even attempts to use it
'branch:en', # like branch
'branch:en', # like branch
# extra detail not worth adding to OSM
# fluctuates wildly
@ -731,7 +747,7 @@ def atp_tags_to_be_remove_completely_and_ignored():
"nsi_id", # internal data
# repeated entries that are not useful (at least for me and OSM) that ATP wants to keep
'brand:logo', 'icon', # https://github.com/alltheplaces/alltheplaces/issues/11183
'brand:logo', 'icon', # https://github.com/alltheplaces/alltheplaces/issues/11183
# more keys to be removed
'storeClass', 'owner:type', 'ownership_type', 'kioskType', 'operator:facebook', 'operator:twitter',
@ -745,6 +761,7 @@ def atp_tags_to_be_remove_completely_and_ignored():
def dubious_keys_raising_alarm():
return generic_type_keys()
def generic_type_keys():
return ['location_type', 'store_type', 'storeType', 'type']
@ -786,28 +803,28 @@ def ignored_atp_codes():
# https://github.com/osmlab/name-suggestion-index/tags
# (currently none)
'department_veterans_affairs', # https://github.com/alltheplaces/alltheplaces/pull/11905 TODO MERGED
'billa', # https://github.com/alltheplaces/alltheplaces/pull/11706 TODO MERGED
'coop_centro_italia_it', # https://github.com/alltheplaces/alltheplaces/pull/11942 TODO_MERGED
'sony_gb', # https://github.com/alltheplaces/alltheplaces/issues/11710 TODO_MERGED
'topgolf_us', # https://github.com/alltheplaces/alltheplaces/pull/11923 TODO_MERGED
'department_veterans_affairs', # https://github.com/alltheplaces/alltheplaces/pull/11905 TODO MERGED
'billa', # https://github.com/alltheplaces/alltheplaces/pull/11706 TODO MERGED
'coop_centro_italia_it', # https://github.com/alltheplaces/alltheplaces/pull/11942 TODO_MERGED
'sony_gb', # https://github.com/alltheplaces/alltheplaces/issues/11710 TODO_MERGED
'topgolf_us', # https://github.com/alltheplaces/alltheplaces/pull/11923 TODO_MERGED
'puebloweb_pr_us', # https://github.com/alltheplaces/alltheplaces/pull/11908
'cinnabon_ru', # cafe or fast food? maybe these should be considered as matching? TODO - see file:///media/mateusz/OSM_cache/ATP_matcher_cache/output_for_global_scan/missing_shops__cinnabon_ru.html (generated by 14_...) and https://www.openstreetmap.org/search?query=Cinnabon%2C+Russia#map=19/55.768162/37.598584
'benchmarx_gb', # TODO_LOW_PRIORITY https://www.benchmarxkitchens.co.uk/branches/llanelli - "Located inside Travis Perkins" - should it be mapped as a separate shop then?
'sklavenitis_gr', # https://github.com/alltheplaces/alltheplaces/pull/11904
'halia_baluvana_ua', # https://github.com/alltheplaces/alltheplaces/issues/11902
'spring_market_us', # https://github.com/alltheplaces/alltheplaces/issues/11903
'tesla', # https://github.com/alltheplaces/alltheplaces/issues/11711
'oscar_wylee', # https://github.com/alltheplaces/alltheplaces/issues/11862
'okta_mk', # https://github.com/alltheplaces/alltheplaces/pull/11899
'dierbergs', # https://github.com/alltheplaces/alltheplaces/pull/11898
'mirabito_us', # https://github.com/alltheplaces/alltheplaces/issues/11900
'puebloweb_pr_us', # https://github.com/alltheplaces/alltheplaces/pull/11908
'cinnabon_ru', # cafe or fast food? maybe these should be considered as matching? TODO - see file:///media/mateusz/OSM_cache/ATP_matcher_cache/output_for_global_scan/missing_shops__cinnabon_ru.html (generated by 14_...) and https://www.openstreetmap.org/search?query=Cinnabon%2C+Russia#map=19/55.768162/37.598584
'benchmarx_gb', # TODO_LOW_PRIORITY https://www.benchmarxkitchens.co.uk/branches/llanelli - "Located inside Travis Perkins" - should it be mapped as a separate shop then?
'sklavenitis_gr', # https://github.com/alltheplaces/alltheplaces/pull/11904
'halia_baluvana_ua', # https://github.com/alltheplaces/alltheplaces/issues/11902
'spring_market_us', # https://github.com/alltheplaces/alltheplaces/issues/11903
'tesla', # https://github.com/alltheplaces/alltheplaces/issues/11711
'oscar_wylee', # https://github.com/alltheplaces/alltheplaces/issues/11862
'okta_mk', # https://github.com/alltheplaces/alltheplaces/pull/11899
'dierbergs', # https://github.com/alltheplaces/alltheplaces/pull/11898
'mirabito_us', # https://github.com/alltheplaces/alltheplaces/issues/11900
# https://github.com/alltheplaces/alltheplaces/issues/11712
# non-string values
'mochachos',
'eathappy', # https://github.com/alltheplaces/alltheplaces/pull/11137 - author notified
'eathappy', # https://github.com/alltheplaces/alltheplaces/pull/11137 - author notified
'indigo',
'opendata_mos_hotels_ru',
'marriott_hotels',
@ -822,17 +839,17 @@ def ignored_atp_codes():
# after merge or fix move to
# 14_generate_atp_issue_reports_about_poorly_matched_entries.py
# until lists are regenerated
'kafkas_gr', # Kafkas vs ΚΑΥΚΑΣ, electrical vs electronic... https://github.com/alltheplaces/alltheplaces/issues/11041
'hirebase_gb', # ask people is separate mapping like https://www.openstreetmap.org/node/8948452036 preferable
'revolution_laundry', # is https://stores.revolution-laundry.com/fr-fr/france-fra/saint-honore-les-bains/laverie-revolution-laundry-89271199 shop=laundry self_service=yes ? Or some type of vending machine?
'waynes_coffee', # https://www.openstreetmap.org/note/4476353
'penske', # truck rental mapped as shop=rental, in OSM it seems to be mapped as amenity=car_rental
'kafkas_gr', # Kafkas vs ΚΑΥΚΑΣ, electrical vs electronic... https://github.com/alltheplaces/alltheplaces/issues/11041
'hirebase_gb', # ask people is separate mapping like https://www.openstreetmap.org/node/8948452036 preferable
'revolution_laundry', # is https://stores.revolution-laundry.com/fr-fr/france-fra/saint-honore-les-bains/laverie-revolution-laundry-89271199 shop=laundry self_service=yes ? Or some type of vending machine?
'waynes_coffee', # https://www.openstreetmap.org/note/4476353
'penske', # truck rental mapped as shop=rental, in OSM it seems to be mapped as amenity=car_rental
# why not detected in 14? Not enough pharmacies?
'dia_es', # https://github.com/alltheplaces/alltheplaces/issues/11253
'dia_es', # https://github.com/alltheplaces/alltheplaces/issues/11253
'dodo_pizza', # https://github.com/alltheplaces/alltheplaces/issues/11066
'van_wall_us', # https://github.com/alltheplaces/alltheplaces/issues/11071
'dodo_pizza', # https://github.com/alltheplaces/alltheplaces/issues/11066
'van_wall_us', # https://github.com/alltheplaces/alltheplaces/issues/11071
# TODO: likely needs different alphabet, remove here and rerun
# 14_generate_atp_issue_reports_about_poorly_matched_entries.py
@ -841,30 +858,30 @@ def ignored_atp_codes():
# remove and run following code to explore
# 14_generate_atp_issue_reports_about_poorly_matched_entries.py
'longchamp_eu', # require investigating proper shop value
'longchamp_eu', # require investigating proper shop value
'mikucha_th', # claimed website tags seem to be 404ing
'patisserie_valerie_gb', # at least atp_ref = 1057031 has bad website tag - links mall
'right_at_home_gb', # reinvestigate tagging
'crocs_es', # is it even listing standalone shops? From quick check: no
'super_jimat_my', # confusing mix of two brands
'sushi_express_tw', # poor match, foreign letters, not investigated further
'la_vie_claire_fr', # in OSM seems listed as shop=convenience/supermarket
'united_surgical_partners_international', # seems to miss actual names
'asics_eu', # data looks dubious, very poor matching
'boscovs_us', # locations mismatch locations on website - recheck in some time
'landi_ch', # OSM IRC: 'Landi are a weird category of shops, supermarket is overstating it, country_store is not entirely true as Landi also sells food like a convenience store…' - see also https://github.com/alltheplaces/alltheplaces/pull/11097
'mikucha_th', # claimed website tags seem to be 404ing
'patisserie_valerie_gb', # at least atp_ref = 1057031 has bad website tag - links mall
'right_at_home_gb', # reinvestigate tagging
'crocs_es', # is it even listing standalone shops? From quick check: no
'super_jimat_my', # confusing mix of two brands
'sushi_express_tw', # poor match, foreign letters, not investigated further
'la_vie_claire_fr', # in OSM seems listed as shop=convenience/supermarket
'united_surgical_partners_international', # seems to miss actual names
'asics_eu', # data looks dubious, very poor matching
'boscovs_us', # locations mismatch locations on website - recheck in some time
'landi_ch', # OSM IRC: 'Landi are a weird category of shops, supermarket is overstating it, country_store is not entirely true as Landi also sells food like a convenience store…' - see also https://github.com/alltheplaces/alltheplaces/pull/11097
'soeder_ch', # https://github.com/alltheplaces/alltheplaces/issues/10860
'coop_se', # https://github.com/alltheplaces/alltheplaces/issues/10890
'brookshires_us', # https://github.com/alltheplaces/alltheplaces/pull/11065#pullrequestreview-2369249680
'brookshires_us', # https://github.com/alltheplaces/alltheplaces/pull/11065#pullrequestreview-2369249680
'moneygram', # https://github.com/alltheplaces/alltheplaces/issues/6784
'bricofer_it', # https://community.openstreetmap.org/t/italia-aggiunta-al-matcher-sperimentale-di-all-the-places/117208/8
'ocharleys_us', # https://github.com/alltheplaces/alltheplaces/issues/10995
'pizza_express_gb', # https://github.com/alltheplaces/alltheplaces/issues/11007
'dhl_express_us_ca', # https://github.com/alltheplaces/alltheplaces/issues/11009
'ocharleys_us', # https://github.com/alltheplaces/alltheplaces/issues/10995
'pizza_express_gb', # https://github.com/alltheplaces/alltheplaces/issues/11007
'dhl_express_us_ca', # https://github.com/alltheplaces/alltheplaces/issues/11009
"chorten_pl", # not actually branded, huge offsets
# https://www.openstreetmap.org/?mlat=50.09744263&mlon=19.99892616#map=19/50.09744/19.99893 https://www.openstreetmap.org/node/1948024743 - 240m offset from its stated address
'day_today_gb', # about 10% are outdated entries - see https://github.com/alltheplaces/alltheplaces/discussions/10941#discussioncomment-10883201 and https://github.com/alltheplaces/alltheplaces/pull/10974
# https://www.openstreetmap.org/?mlat=50.09744263&mlon=19.99892616#map=19/50.09744/19.99893 https://www.openstreetmap.org/node/1948024743 - 240m offset from its stated address
'day_today_gb', # about 10% are outdated entries - see https://github.com/alltheplaces/alltheplaces/discussions/10941#discussioncomment-10883201 and https://github.com/alltheplaces/alltheplaces/pull/10974
"krakow_public_transport_vending_krk_pl", # requires matching by object type
"abc_pl", # cannot be assumed to have brands, effectively dropped from ATP by qa.py anyway (once ATP branch vs name is solved it can be used again)
'cukiernia_sowa_pl', # often amenity=cafe, not sure how to handle
@ -873,19 +890,19 @@ def ignored_atp_codes():
"nsw_ambulance_au", # no brand, not sure is brand applying to those...
"victorian_government_road_safety_cameras_au", # no brand and it is really correct
'mall_maverick', # suspect and low quality data, anyway has no brand fields
'opel_rent_de', # looks like an aspect of a car rental place
'opel_rent_de', # looks like an aspect of a car rental place
'asian_paints_beautiful_homes_in', # nonsense locations in ocean, no trust in other ones
'kfc_hk', # bogus locations
'botteg_aveneta', # Heathrow Airport Terminal 4, Hounslow, Middlesex put into central London, under London node (atp_ref = 60226)
"forever_new_in", # I see location in China...
'asian_paints_beautiful_homes_in', # nonsense locations in ocean, no trust in other ones
'kfc_hk', # bogus locations
'botteg_aveneta', # Heathrow Airport Terminal 4, Hounslow, Middlesex put into central London, under London node (atp_ref = 60226)
"forever_new_in", # I see location in China...
'cvs_us', # very confusing data format
# store_type = MinuteClinic ?
# why pharmacy has supermarket?
# and their website is blocked for me in Poland, I get https://www.cvs.com/international.html
'trade_point_gb', # this spider lists counters in actual shops, rather than standalone shops, has confusing store_type field
'trade_point_gb', # this spider lists counters in actual shops, rather than standalone shops, has confusing store_type field
# see also https://github.com/osmlab/name-suggestion-index/issues/10028
@ -898,8 +915,8 @@ def ignored_atp_codes():
# code author got notified
# Cj-Malone
'gbfs', # https://github.com/alltheplaces/alltheplaces/issues/11008
'leon', # https://github.com/alltheplaces/alltheplaces/issues/11952
'gbfs', # https://github.com/alltheplaces/alltheplaces/issues/11008
'leon', # https://github.com/alltheplaces/alltheplaces/issues/11952
# car shops
# note: qa.py started throwing out all shop=car and shop=car_repair in general
@ -908,25 +925,26 @@ def ignored_atp_codes():
'renault', # https://github.com/alltheplaces/alltheplaces/issues/10244
'kia', # show only aspect of dealer, see say https://www.openstreetmap.org/way/400059648 where they have multiple brands
'mercedes_benz_group', # the same, see atp_ref = GS0008313 - in such case https://www.openstreetmap.org/way/639491972 should not get website = http://www.mercedes-benz-ibach.ch/
'hyundai_no', # again, listing multi-brand car repair locations as Hyundai-specific
'hyundai_no', # again, listing multi-brand car repair locations as Hyundai-specific
# see https://www.openstreetmap.org/node/10199475251 https://www.hyundai.com/no/no/kjop/bil/forhandlere/fredrikstad.html
'hyundai_de',
'hyundai_us',
'super_dekk_no', # aspect of car_repair shops(s), dubious that it should be mapped separately
'super_dekk_no', # aspect of car_repair shops(s), dubious that it should be mapped separately
# unclear licensing situation
# see https://github.com/alltheplaces/alltheplaces/issues/8790
'cbrfree_au', 'james_retail_gb', 'queensland_government_road_safety_cameras_au',
'terrible_herbst', 'thales_fr',
'worldcat', # https://github.com/alltheplaces/alltheplaces/pull/10923#issuecomment-2397362271
'worldcat', # https://github.com/alltheplaces/alltheplaces/pull/10923#issuecomment-2397362271
]
def processing_plan(): # TODO remove this vestigal function
def processing_plan(): # TODO remove this vestigal function
returned = {}
known_data = shared.country_data()
requested_codes = os.getenv("OSM_ATM_MATCHER_COUNTRY_CODE_LIST").split(",") # TODO - this is surely not used anymore? and can be removed from .env ?
requested_codes = os.getenv("OSM_ATM_MATCHER_COUNTRY_CODE_LIST").split(",") # TODO - this is surely not used anymore? and can be removed from .env ?
for code in requested_codes:
for name, data in known_data.items():
if data['country_code'] == code:
@ -941,13 +959,16 @@ def good_match_distance_in_kilometers():
def maximum_missing_shop_distance_in_kilometers():
return 0.9
def default_missing_shop_distance_in_kilometers():
return maximum_missing_shop_distance_in_kilometers() / 3
def increased_missing_shop_distance_in_kilometers():
# for say supermarkets
return maximum_missing_shop_distance_in_kilometers()
def missing_shop_distance_in_kilometers_for_specific_case(object_tags, spider_code=None):
if spider_code == None:
spider_code = object_tags["@spider"]
@ -977,7 +998,7 @@ def the_same_feature_type(tags_a, tags_b):
values.sort()
conflict = values[0] + " vs " + values[1]
if conflict in matching_rather_than_type_conflict():
return True # dubious conflict, lets report a match
return True # dubious conflict, lets report a match
if conflict not in clear_type_conflicts() and conflict not in undecided_type_conflicts():
name_a = tags_a.get("name", None)
if name_a == None:
@ -1011,6 +1032,7 @@ def the_same_feature_type(tags_a, tags_b):
raise
return False
def undecided_type_conflicts():
# TODO: put them into clear_type_conflicts() or into matching_rather_than_type_conflict()
return [
@ -1038,6 +1060,7 @@ def undecided_type_conflicts():
'amenity=pharmacy vs healthcare=audiologist',
]
def clear_type_conflicts():
"""
to help detecting dubious conflicts
@ -3686,6 +3709,7 @@ def clear_type_conflicts():
# TODO - autogenerate more of these?
]
def matching_rather_than_type_conflict():
return [
'shop=chocolate vs sweet bakery',
@ -3725,14 +3749,15 @@ def matching_rather_than_type_conflict():
'shop=gift vs shop=stationery',
'office=accountant vs office=consulting',
'office=accountant vs office=tax_advisor',
'shop=electrical vs shop=electronics', # maybe even merge into one group
'shop=electrical vs shop=electronics', # maybe even merge into one group
]
def canonical_feature(object_tags):
# TODO: maybe should match
# shop=car vs shop=motorcycle - wait, shop=car is banned, right? what about shop=motorcycle
# amenity=bank vs office=financial
# amenity=bank vs amenity=money_transfer
"""
return string to allow comparing object types
@ -3819,23 +3844,29 @@ def return_info_about_spider_to_debug_it(atp_code):
returned += atp_unpacked_folder() + atp_code + '.geojson'
return returned
def link_to_spider(atp_code):
# TODO This assumption that atp_code matches filename cannot be really made here
# TODO see https://github.com/alltheplaces/alltheplaces/issues/9687
# TODO though as a hackinsh solution that mostly works, it actually works fine
return "locations/spiders/" + atp_code + ".py"
def get_github_link_to_spider(atp_code):
return "https://github.com/alltheplaces/alltheplaces/blob/master/" + link_to_spider(atp_code)
def linkified_markdown_atp(atp_code):
    """Return a markdown link labelled with the spider code, pointing at its GitHub source."""
    return f"[{atp_code}]({get_github_link_to_spider(atp_code)})"
def repo_location_atp():
    """Relative path where the alltheplaces repository checkout is expected."""
    return "../___other/alltheplaces"
def repo_location_nsi():
    """Relative path where the name-suggestion-index repository checkout is expected."""
    return "../___other/name-suggestion-index"
def git_user_credit_in_commits():
    """Return the git user credit configured via environment, or None when unset."""
    return os.environ.get("GIT_USER_IN_CREDITS_OPTIONAL_VALUE")

View file

@ -7,6 +7,7 @@ import dulwich
import datetime
config = __import__("0_config")
def main():
repo_url = "https://github.com/alltheplaces/alltheplaces"
print(repo_url)
@ -28,6 +29,7 @@ def main():
get_get_and_or_update_repository(repo_url, repo_path)
repository_tag_list(repo_path)
def get_get_and_or_update_repository(repo_url, repo_path):
try:
porcelain.clone(repo_url, repo_path)
@ -36,6 +38,7 @@ def get_get_and_or_update_repository(repo_url, repo_path):
repo = Repo(repo_path)
pull(repo, repo_url)
def repository_tag_list(repo_path):
repo = Repo(repo_path)
@ -60,4 +63,5 @@ def repository_tag_list(repo_path):
tag_time_readable = datetime.datetime.utcfromtimestamp(tag_time).strftime('%Y-%m-%d %H:%M:%S')
print(f"Tag: {tag_name}, Added on: {tag_time_readable}")
main()

View file

@ -8,17 +8,17 @@ obtain_atp_data = __import__("2_obtain_atp_data")
graticule_report = __import__("5_generate_graticule_reports")
raise Exception("payment:mastercard_electronic - investigate") # TODO - investigate this ATP tagging issue
raise Exception("mpesa discussed in https://github.com/alltheplaces/alltheplaces/commit/53b11551a30d16ccc4d16658b7b61bfbf66fe87c#r151203743") # TODO - investigate this ATP tagging issue
raise Exception("fuel:electricity - https://wiki.openstreetmap.org/wiki/Talk:Key:fuel:*#fuel%3Aelectricity https://github.com/alltheplaces/alltheplaces/pull/11934#issuecomment-2585062788 ")# TODO - investigate this ATP tagging issue
raise Exception("payment:mastercard_electronic - investigate") # TODO - investigate this ATP tagging issue
raise Exception("mpesa discussed in https://github.com/alltheplaces/alltheplaces/commit/53b11551a30d16ccc4d16658b7b61bfbf66fe87c#r151203743") # TODO - investigate this ATP tagging issue
raise Exception("fuel:electricity - https://wiki.openstreetmap.org/wiki/Talk:Key:fuel:*#fuel%3Aelectricity https://github.com/alltheplaces/alltheplaces/pull/11934#issuecomment-2585062788 ") # TODO - investigate this ATP tagging issue
raise Exception("""rent:lpg_bottles
asked on IRC:
Does anyone knows how to tag that place (such as fuel station) allows rental/refilling of LPG bottles? I see https://taginfo.openstreetmap.org/keys/rent%3Alpg_bottles but looks like import from chronology
https://wiki.openstreetmap.org/w/index.php?search=Key%3Arent%3Alpg_bottles&title=Special%3ASearch&profile=default&fulltext=1""")# TODO - investigate this ATP tagging issue
raise Exception("is there still brand_wikidata in published ATP after https://github.com/alltheplaces/alltheplaces/pull/11938 ?")# TODO - investigate this ATP tagging issue
https://wiki.openstreetmap.org/w/index.php?search=Key%3Arent%3Alpg_bottles&title=Special%3ASearch&profile=default&fulltext=1""") # TODO - investigate this ATP tagging issue
raise Exception("is there still brand_wikidata in published ATP after https://github.com/alltheplaces/alltheplaces/pull/11938 ?") # TODO - investigate this ATP tagging issue
raise Exception("name:al - https://github.com/alltheplaces/alltheplaces/pull/11939 - merged")
raise Exception("https://github.com/alltheplaces/alltheplaces/pull/11935 - merged")
@ -27,6 +27,7 @@ raise Exception("https://github.com/alltheplaces/alltheplaces/pull/11934 (for fu
raise Exception("secondary - https://github.com/alltheplaces/alltheplaces/pull/11940 - waiting")
raise Exception("https://github.com/alltheplaces/alltheplaces/pull/11936 - waiting - about urgent_care - see healthcare:speciality=urgent at https://wiki.openstreetmap.org/wiki/Key:healthcare:speciality and https://wiki.openstreetmap.org/wiki/Proposal:Urgent_care")
def collect_data():
"""
list undocumented tags being used by ATP and not listed as dubious/skipped/supressed
@ -69,18 +70,20 @@ def collect_data():
used_tags[key][value] += 1
return used_keys, used_tags
def is_freeform_key(key):
if tag_knowledge.is_freeform_key(key):
return True
if key in config.keys_with_value_link():
return True
if key in ['charging_station:output', 'socket:type2_combo'] or key in ["contact:sms", "ref:branch", "website:fr", "website:en", "website:de", "website:it", 'website:orders', "name:zh-Hans", "branch:ar", "addr:full:en", "addr:full:ar", 'addr:province', # TODO, move info about OSM tags upstream
"directions", # ATPism but looks fairly reasonable under ATYL
"@source_uri", # ATP-specific
]:
"directions", # ATPism but looks fairly reasonable under ATYL
"@source_uri", # ATP-specific
]:
return True
return False
def main():
supress_atpisms = []
used_tags_listing = ""

View file

@ -1,17 +1,19 @@
import re
import qa_autofix_atp
import shared
import matcher
import json
import os
import rich
obtain_atp_data = __import__("2_obtain_atp_data")
import matcher
qa = __import__("qa")
config = __import__("0_config")
import shared
import qa_autofix_atp
import re
def threshold(key):
    """Return the repetition-count threshold for *key*; currently a flat 100 for every key."""
    flat_limit = 100
    return flat_limit
def expected_unique_keys():
returned = ['branch', 'image', 'phone', 'contact:facebook', 'contact:youtube', 'contact:yelp', 'contact:twitter', 'contact:instagram', 'contact:linkedin', 'contact:tiktok', 'contact:tripadvisor', 'email', 'website', 'website:en', 'website:fr', 'website:de', 'website:kr', 'website:cn', 'operator:facebook', 'operator:twitter', 'website:menu', 'website:orders']
for code in shared.valid_country_codes():
@ -19,6 +21,7 @@ def expected_unique_keys():
returned.append("website:" + code)
return returned
def main():
"""
loads ATP files directly and detects some systematic issues
@ -44,6 +47,7 @@ def main():
reports = process_atp(atp_code, reports)
show_reports(reports)
def show_reports(reports):
#rich.print(reports['repeated_machine_readable_for_config_updates']) # TODO consider listing them?
for bad_image in reports['repeated_machine_readable_for_config_updates']['image']:
@ -58,7 +62,7 @@ def show_reports(reports):
print("Maybe `image` or `phone` and similar tags that is repeated over 10 times should be thrown out automatically? Without throwing it out manually by changing spider?\n\nNote that in case where former logos are replaced by actual images they would not continue to be thrown out.")
for key in reports:
banned = ['repeated_machine_readable_for_config_updates', 'repeated_for_atp_issue_tracker']
banned.append('whitespace_suffix_or_prefix_report') # see https://github.com/alltheplaces/alltheplaces/issues/11790
banned.append('whitespace_suffix_or_prefix_report') # see https://github.com/alltheplaces/alltheplaces/issues/11790
if config.allow_low_priority_atp_logging():
raise Exception("should I reeenable whitespace checks?")
if key not in banned:
@ -92,6 +96,7 @@ def record_bad_entry(key, value, counter_dict, examples_dict_of_lists):
examples_dict_of_lists[key] = []
examples_dict_of_lists[key].append(value)
def this_addr_full_is_like_street_address(value):
if qa.is_this_address_suspiciously_short_for_addr_full(value) == False:
return False
@ -101,7 +106,8 @@ def this_addr_full_is_like_street_address(value):
has_number = True
if has_number:
return True
return False # `addr:full=France` is a problem but a different one
return False # `addr:full=France` is a problem but a different one
def check_for_problems(reports, atp_data, atp_code):
repeated_key_check = {}
@ -157,7 +163,7 @@ def check_for_problems(reports, atp_data, atp_code):
record_bad_entry(key='addr:full', value=value, counter_dict=full_address_without_such_suspicion, examples_dict_of_lists=full_address_without_such_suspicion_value_examples)
else:
record_bad_entry(key='addr:full', value=value, counter_dict=full_address_with_unclassified_state, examples_dict_of_lists=full_address_with_unclassified_state_value_examples)
for key in repeated_key_check: # some values, like image=* are expected to be unique
for key in repeated_key_check: # some values, like image=* are expected to be unique
if key in tags:
value = tags[key]
try:
@ -185,20 +191,20 @@ def check_for_problems(reports, atp_data, atp_code):
continue
reports["repeated_for_atp_issue_tracker"][key] += "* [ ] " + key + " = " + value + " repeated " + str(repeated_key_check[key][value]) + " times in " + config.linkified_markdown_atp(atp_code) + "\n"
for key, count in keys_with_whitespace_suffix_or_prefix.items():
if count > threshold(key)/20: # really blatantly wrong
if count > threshold(key)/20: # really blatantly wrong
reports['whitespace_suffix_or_prefix_report'] += "* [ ] " + config.linkified_markdown_atp(atp_code) + " " + str(count) + " times has whitespace prefix/suffix, for " + key + " with values such as `" + whitespace_suffix_or_prefix_example[key] + "`\n"
for key, count in none_value.items():
if count > 0: # an obvious problem, always problematic
if count > 0: # an obvious problem, always problematic
reports['none_value_report'] += "* [ ] " + config.linkified_markdown_atp(atp_code) + " " + str(count) + " times has None value, for example for" + key + "`\n"
for key, count in numeric_not_string_value.items():
if count > 0: # an obvious problem, always problematic
if count > 0: # an obvious problem, always problematic
reports['numeric_not_string_value_report'] += "* [ ] " + config.linkified_markdown_atp(atp_code) + " " + str(count) + " times has non-string values (numbers/booleans etc), for " + key + " with values such as `" + str(numeric_not_string_value_example[key]) + "`\n"
for key, count in suspiciously_street_address_like_full_address.items():
if count == 0:
continue
nonsuspicious_count = full_address_without_such_suspicion.get(key, 0)
example_value = suspiciously_street_address_like_full_address_value_examples[key][0]
if count > nonsuspicious_count * 10: # of high importance for me, it breaks geocoding but many spiders have a very occasional breakage so not every single one should end there
if count > nonsuspicious_count * 10: # of high importance for me, it breaks geocoding but many spiders have a very occasional breakage so not every single one should end there
# see https://github.com/alltheplaces/alltheplaces/issues/11797
# does not make sense to report what appears only rarely, unless these reports are acted on
reports['suspiciously_short_addr:full'] += "* [ ] " + config.linkified_markdown_atp(atp_code) + " " + str(count) + " times has suspiciously short values, for " + key + " with example value `" + example_value + "`\n"
@ -222,6 +228,7 @@ def check_for_problems(reports, atp_data, atp_code):
)
return reports
def show_addr_values(atp_code, key, suspiciously_street_address_like_full_address_value_examples, full_address_without_such_suspicion_value_examples, full_address_with_unclassified_state_value_examples):
print()
print()
@ -233,4 +240,5 @@ def show_addr_values(atp_code, key, suspiciously_street_address_like_full_addres
print("unclassified")
print(full_address_with_unclassified_state_value_examples.get(key))
main()

View file

@ -1,9 +1,10 @@
import os
import serializing
import show_data
graticule_report = __import__("5_generate_graticule_reports")
obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
import show_data
import serializing
import os
def multiplier(atp_code):
for part in atp_code.split("_"):
@ -13,24 +14,26 @@ def multiplier(atp_code):
return 10
return 1
def should_be_shown(atp_code, failed_matches, matched, cumulated_likelyhood_of_missing_from_osm):
if failed_matches + matched < 20:
return False # TODO, review also that
return False # TODO, review also that
if failed_matches > cumulated_likelyhood_of_missing_from_osm + matched * 10 * multiplier(atp_code):
# TODO lower requirements here a bit
return True
return False
def ignored_atp_codes():
return [
'mcdonalds_eg', # TODO delete on rerun
'mcdonalds_eg', # TODO delete on rerun
# reportedly mapped wrong in OSM
'unicredit_bulbank_bg', # https://github.com/alltheplaces/alltheplaces/pull/11901#issuecomment-2577769704
'unicredit_bulbank_bg', # https://github.com/alltheplaces/alltheplaces/pull/11901#issuecomment-2577769704
# seems simply not mapped in OSM
'dhl_express_es', # though not sure what is going on
'pk_equipment', # https://www.openstreetmap.org/note/4581845
'dhl_express_es', # though not sure what is going on
'pk_equipment', # https://www.openstreetmap.org/note/4581845
'burger_king_eg',
'gap_trade_gb',
'mercy',
@ -51,7 +54,7 @@ def ignored_atp_codes():
'oggi_sorvetes_br',
'digi_telecommunications_my',
'agnvet_au',
'removery_us', # though also location accuracy is dubious
'removery_us', # though also location accuracy is dubious
'go_games_toys_us', # though also location accuracy is dubious
'loxam_fr',
'sunbelt_rentals_us_ca',
@ -60,9 +63,9 @@ def ignored_atp_codes():
'glasfit_za',
'mussala_bg',
'kpmg',
'hifi_corp', # also South Africa :)
'easybox_bg', # and they seem to use English language brand - https://sameday.bg/easybox/
'cashterminal_bg', # and they seem to use English language brand
'hifi_corp', # also South Africa :)
'easybox_bg', # and they seem to use English language brand - https://sameday.bg/easybox/
'cashterminal_bg', # and they seem to use English language brand
'manhattan_bagel',
'rogers_communications',
'completude',
@ -71,7 +74,7 @@ def ignored_atp_codes():
'euromobil_nl',
'side_step',
'billini_au',
'asics_us', # though also location accuracy is dubious
'asics_us', # though also location accuracy is dubious
'tui',
'dunkin_sa',
'reebok',
@ -117,6 +120,7 @@ def ignored_atp_codes():
'myhouse_au',
]
def how_likely_that_it_is_not_mapped(entry):
"""
this exists to reduce false positive ratio, and to guess cases which are likely simply not mapped
@ -174,17 +178,19 @@ def how_likely_that_it_is_not_mapped(entry):
if entry.atp_center['lon'] > 40:
# far Asia is even less mapped then Europe
probability += 0.1
probability = probability * 1.1 # reduce to further rescale, TODO_LOW_PRIORITY
probability = probability * 1.1 # reduce to further rescale, TODO_LOW_PRIORITY
if probability > 1:
probability = 1
return probability
def nothing_to_report_marker_filepath(atp_code):
    """Return the path of the success-marker file recording that the systematic
    mismatch check was not triggered for the given spider.

    Ensures the containing folder exists before returning the path.
    """
    folder = config.build_storage_folder() + "validation_runs/"
    # exist_ok=True avoids the check-then-create race of `if not isdir: makedirs`
    os.makedirs(folder, exist_ok=True)
    return folder + "systematic_mismatch_not_triggered_" + atp_code + ".success"
area = graticule_report.global_graticule_coverage()
skipped_as_on_ignore_list_or_empty = []
for atp_code in obtain_atp_data.all_spider_codes_iterator():

View file

@ -1,10 +1,11 @@
import webbrowser
import os
import serializing
import show_data
graticule_report = __import__("5_generate_graticule_reports")
obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
import show_data
import serializing
import os
import webbrowser
def main():
shown_spiders = []
@ -16,7 +17,7 @@ def main():
area = graticule_report.global_graticule_coverage()
for atp_code in obtain_atp_data.all_spider_codes_iterator():
if atp_code in [
]:
]:
continue
great_brand_match = 0
poor_brand_match = 0
@ -68,4 +69,5 @@ def main():
if len(shown_spiders) >= 30 and len(shown_spiders) % 10 == 0:
print(shown_spiders)
main()

View file

@ -1,5 +1,6 @@
obtain_atp_data = __import__("2_obtain_atp_data")
import qa
obtain_atp_data = __import__("2_obtain_atp_data")
def main():
for atp_code, parsed_content in obtain_atp_data.spider_codes_iterator_with_data():
@ -9,5 +10,6 @@ def main():
continue
qa.remove_bad_data(entry['properties'], atp_code)
if __name__ == "__main__":
main()

View file

@ -1,27 +1,30 @@
import matcher
import rich
import diskcache
import requests
import os
import serializing
import show_data
graticule_report = __import__("5_generate_graticule_reports")
obtain_atp_data = __import__("2_obtain_atp_data")
config = __import__("0_config")
import show_data
import serializing
import os
import requests
import diskcache
import rich
import matcher
# mismatches seem to be caused by
# duplicated wikidata entries (wikidata problem)
# editors changing just name tag, leaving brand, brand:wikidata etc in an inconsistent state
# - see https://github.com/orgs/organicmaps/discussions/10043 where I started attempt to report this bug
def cache_path():
    """Location on disk of the persistent Wikidata response cache."""
    folder = config.cache_folder()
    return folder + 'wikidata_cache'
wikidata_cache = diskcache.Cache(cache_path(), eviction_policy="none")
print(len(wikidata_cache), "entries cached by Wikidata cache")
def get_wikidata_label(wikidata_id):
index = wikidata_id + "_label" # switch to using index
index = wikidata_id + "_label" # switch to using index
if index in wikidata_cache:
return wikidata_cache[index]
fetched = download_wikidata_label(wikidata_id)
@ -30,6 +33,7 @@ def get_wikidata_label(wikidata_id):
return "Label not found"
return fetched
def download_wikidata_label(wikidata_id):
url = "https://www.wikidata.org/w/api.php"
@ -56,6 +60,7 @@ def download_wikidata_label(wikidata_id):
else:
raise Exception("Error querying Wikidata:", response.status_code)
def get_wikidata_part_of(wikidata_id):
index = wikidata_id + "_part_of"
if index in wikidata_cache:
@ -64,6 +69,7 @@ def get_wikidata_part_of(wikidata_id):
wikidata_cache[index] = fetched
return fetched
def get_wikidata_owned_by(wikidata_id):
index = wikidata_id + "_owned_by"
if index in wikidata_cache:
@ -72,6 +78,7 @@ def get_wikidata_owned_by(wikidata_id):
wikidata_cache[index] = fetched
return fetched
def get_wikidata_parent_organization(wikidata_id):
index = wikidata_id + "_parent_organization"
if index in wikidata_cache:
@ -122,6 +129,7 @@ def download_wikidata_property(wikidata_id, property_id):
else:
raise Exception("Error querying Wikidata:", response.status_code)
def get_direct_parents(wikidata_id):
all_parents = []
all_parents += get_wikidata_part_of(wikidata_id)
@ -129,6 +137,7 @@ def get_direct_parents(wikidata_id):
all_parents += get_wikidata_parent_organization(wikidata_id)
return all_parents
def get_structure(wikidata_id):
direct_parents = get_direct_parents(wikidata_id)
# may have multiple steps...
@ -141,6 +150,7 @@ def get_structure(wikidata_id):
indirect += get_direct_parents(parent_id)
return direct_parents + indirect
def show_structure(wikidata_id):
parents = get_wikidata_part_of(wikidata_id)
if len(parents) > 0:
@ -160,6 +170,7 @@ def show_structure(wikidata_id):
for parent_wikidata_id in parents:
print(" ", get_wikidata_label(parent_wikidata_id), parent_wikidata_id)
def skipped_osm_cases():
return [
# fixed in OSM, TODO remove on rerun
@ -170,14 +181,15 @@ def skipped_osm_cases():
'https://www.openstreetmap.org/way/730418419',
# ones above are fixed in OSM, TODO remove on rerun
'https://www.openstreetmap.org/node/11932473689', # see https://www.openstreetmap.org/changeset/151797641
'https://www.openstreetmap.org/way/687923205', # https://www.openstreetmap.org/changeset/157497573
'https://www.openstreetmap.org/node/11932473689', # see https://www.openstreetmap.org/changeset/151797641
'https://www.openstreetmap.org/way/687923205', # https://www.openstreetmap.org/changeset/157497573
]
area = graticule_report.global_graticule_coverage()
print(area)
matching_via_parentage = 0
for atp_code in obtain_atp_data.all_spider_codes_iterator(): # note, data is fetched from matches which may be based on a different version of ATP, with different spiders being broken or some missing/not yet present. But looking at files directly seems to be an overkill for such diagnosis tool.
for atp_code in obtain_atp_data.all_spider_codes_iterator(): # note, data is fetched from matches which may be based on a different version of ATP, with different spiders being broken or some missing/not yet present. But looking at files directly seems to be an overkill for such diagnosis tool.
for lat_anchor in range(area['min_lat'], area['max_lat']):
for lon_anchor in range(area['min_lon'], area['max_lon']):
#print(atp_code, lat_anchor, lon_anchor)
@ -249,6 +261,7 @@ for atp_code in obtain_atp_data.all_spider_codes_iterator(): # note, data is fet
rich.print(entry.atp_tags)
print("OSM")
rich.print(entry.osm_match_tags)
def package_tags_into_mock(tags):
    """Wrap raw tags in a minimal entry-shaped dict (dummy location and link)."""
    mock_entry = {
        'tags': tags,
        'center': {'lat': 0, 'lon': 0},
        'osm_link': 'dummy',
    }
    return mock_entry
atp_data = [package_tags_into_mock(entry.atp_tags)]

View file

@ -1,5 +1,6 @@
config = __import__("0_config")
from collections import Counter
config = __import__("0_config")
def main():
with open(config.poi_type_conflict_list_skipping_obvious_mismatches_with_potential_false_conflicts(), 'r') as infile:
@ -11,8 +12,6 @@ def main():
print(line)
print(len(unique_lines), "unique lines")
line_counts = Counter(lines)
filtered_lines = {
@ -27,6 +26,7 @@ def main():
for line, count in sorted_lines:
print(f"\n{line.strip()}\nappears {count} times.\n")
def is_conflict_handled(line):
if line[1:-2] in config.clear_type_conflicts():
return True
@ -36,4 +36,5 @@ def is_conflict_handled(line):
return True
return False
main()

View file

@ -1,25 +1,25 @@
import rich
import json
config = __import__("0_config")
obtain_atp_data = __import__("2_obtain_atp_data")
import json
import rich
reported = {}
def log_if_unhandled_cascading_found(tags):
shop_value = tags.get("shop")
if shop_value in [
"fuel", # see test_do_not_distinguish_between_various_fuel_shops
"fuel", # see test_do_not_distinguish_between_various_fuel_shops
]:
return
if shop_value == "clothes" and tags.get("clothes") in ["men", "women"]:
# see test_do_not_use_clothes_key_for_usual_clothes_shops
return
if shop_value == "trade" and tags.get("trade") in [
"electrical", "tiles", # see test_group_electrical_shops_tagged_in_a_different_way
]:
"electrical", "tiles", # see test_group_electrical_shops_tagged_in_a_different_way
]:
return
if shop_value != None:
if tags.get(shop_value) != None:
report = "shop = " + shop_value + " " + shop_value + " = " + tags.get(shop_value)
@ -28,6 +28,7 @@ def log_if_unhandled_cascading_found(tags):
reported[report] = 0
reported[report] += 1
def main():
for atp_code in obtain_atp_data.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
@ -46,4 +47,5 @@ def main():
raise e
rich.print(reported)
main()

View file

@ -1,3 +1,7 @@
import qa
import random
import shared
import matcher
import rich
import osm_bot_abstraction_layer.util_download_file
import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge
@ -6,10 +10,7 @@ import os
import requests
import functools
config = __import__("0_config")
import matcher
import shared
import random
import qa
def main():
download_entire_atp_dataset()
@ -48,7 +49,7 @@ def find_missing_listing_of_commonly_shared_name_parts():
print(part, spider_list)
found_count += 1
if found_count > 0:
raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
raise Exception("look at common_shared_name_parts()") # TODO move this verification to a separate script, I guess
def remove_country_codes_from_spider_code(atp_code):
@ -59,7 +60,6 @@ def remove_country_codes_from_spider_code(atp_code):
return "_".join(returned_parts)
def do_not_remind_that_this_tagging_may_be_worth_supporting():
# maybe move it to 0_config ?
# to make it more findable ?
@ -100,9 +100,9 @@ def do_not_remind_that_this_tagging_may_be_worth_supporting():
# look into this # TODO
notified_about_tag['shop'].append('grocery')
notified_about_tag['shop'].append('grocer')
notified_about_tag['landuse'].append('residential') # looks like ATP is silly here, or data is unusable
notified_about_tag['landuse'].append('residential') # looks like ATP is silly here, or data is unusable
notified_about_tag['shop'].append('truck_parts') # seems to be from NSI? https://wiki.openstreetmap.org/wiki/Tag:shop%3Dtruck_parts has no page
notified_about_tag['amenity'].append('marketplace') # may be too large, though also mappable as a point...
notified_about_tag['amenity'].append('marketplace') # may be too large, though also mappable as a point...
# kind also shoplike? I want to support them
# requires change to osm_bot_abstraction_layer.tag_knowledge.is_shoplike
@ -161,9 +161,9 @@ def do_not_remind_that_this_tagging_may_be_worth_supporting():
notified_about_tag['craft'].append('brewery')
notified_about_tag['amenity'].append('post_depot') # internal facility, not post_office
notified_about_tag['amenity'].append('mortuary')
notified_about_tag['amenity'].append('mailroom') # often an internal facility
notified_about_tag['amenity'].append('mailroom') # often an internal facility
notified_about_tag['man_made'].append('works')
notified_about_tag['telecom'].append('data_center') # internal facilities
notified_about_tag['telecom'].append('data_center') # internal facilities
# DO NOT WADE INTO THIS TAGGING MESS WITH THIS TOOL!
# and anyway is likely not in iD presets so would not be supported anyway
@ -205,7 +205,7 @@ def warn_about_new_tags_that_are_neither_shoplike_nor_ignored(entry, notified_ab
if key in entry['properties']:
value = entry['properties'][key]
if value in ["yes", "no"]:
continue # used as a property, not as a main tag
continue # used as a property, not as a main tag
if ";" in value:
# see https://github.com/alltheplaces/alltheplaces/pull/11608#issuecomment-2585053764
# see https://github.com/alltheplaces/alltheplaces/pull/11942
@ -246,7 +246,7 @@ def warn_about_new_tags_that_are_neither_shoplike_nor_ignored(entry, notified_ab
return notified_about_tag
def spider_codes_iterator(): # TODO: try to deprecate as it is extremely slow - adds several seconds on first use which loads and parses entire ATP dataset
def spider_codes_iterator(): # TODO: try to deprecate as it is extremely slow - adds several seconds on first use which loads and parses entire ATP dataset
for entry in spider_codes_check_for_valid_data():
yield entry
@ -258,6 +258,7 @@ def spider_codes_check_for_valid_data():
returned.append(code)
return returned
def spider_codes_and_filepaths_iterator_including_broken_data_ones():
"""
this one is not parsing .geojson files so will be faster
@ -281,10 +282,12 @@ def spider_codes_and_filepaths_iterator_including_broken_data_ones():
continue
yield item_path, atp_code
def all_spider_codes_iterator():
    """Yield every spider code, including ones whose data files are broken."""
    yield from (
        atp_code
        for _path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones()
    )
def spider_codes_iterator_with_data():
for item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
with open(item_path, 'r') as file:

View file

@ -1,6 +1,7 @@
import os
import shared
config = __import__("0_config")
import os
def delete_requests_to_fetch_data_requested_by_processing():
# check links or run nominatim requests
@ -11,20 +12,24 @@ def delete_requests_to_fetch_data_requested_by_processing():
if os.path.isfile(config.nominatim_structured_requests_missing_from_cache()):
os.remove(config.nominatim_structured_requests_missing_from_cache())
def delete_output_files():
    """Empty both output folders (regular and published), when they exist."""
    for folder in (config.output_folder(), config.published_output_folder()):
        if os.path.isdir(folder):
            shared.delete_files_in_folder(folder)
def delete_build_files():
    """Recursively clear the build storage folder when it exists."""
    build_folder = config.build_storage_folder()
    if os.path.isdir(build_folder):
        shared.delete_nested_files_folders_in_folder(build_folder)
def main():
    """Run the full cleanup: pending fetch requests, output files, then build files."""
    cleanup_steps = (
        delete_requests_to_fetch_data_requested_by_processing,
        delete_output_files,
        delete_build_files,
    )
    for step in cleanup_steps:
        step()
if __name__ == "__main__":
main()

View file

@ -1,3 +1,6 @@
import rich
import show_data
import matcher
import url_checker
import shared
import json
@ -7,9 +10,6 @@ import os
import math
import shutil
obtain_atp_data = __import__("2_obtain_atp_data")
import matcher
import show_data
import rich
process_planet = __import__("4_process_planet_file")
config = __import__("0_config")
@ -17,10 +17,12 @@ config = __import__("0_config")
def graticule_id(lat, lon, lat_span, lon_span, margin_in_kilometers):
    """Build a stable text identifier for a graticule cell, its span and its margin."""
    return f"{lat}_{lon}_x_{lat_span}_{lon_span}|{margin_in_kilometers}"
def global_graticule_coverage():
    """Bounding box, in whole degrees, of the area covered by a global run."""
    # narrowed-down box kept for quick local test runs:
    # return {'min_lat': 50, 'min_lon': 20, 'max_lat': 51, 'max_lon': 21}
    return dict(min_lat=-84, min_lon=-180, max_lat=84, max_lon=180)
def main():
check_is_any_graticule_having_margin_greater_than_entire_graticule()
# global coverage run
@ -50,15 +52,19 @@ def graticule_cache(area):
# such separation allows to mark given generated dataset as done and do not regenerate it again
return config.build_storage_folder() + "per_graticule_data/" + area_text_identifier(area) + "/"
def specific_graticule_cache_for_report_success(area, lat, lon):
    """Folder holding the per-graticule marker that a report was generated successfully."""
    return graticule_cache(area) + f"report_success_marker/{lat} lat/{lon} lon/"
def specific_graticule_cache_for_atp_osm_input(area, lat, lon):
    """Folder holding the per-graticule OSM/ATP input data split."""
    return graticule_cache(area) + f"osm_atp_split_by_graticule/{lat} lat/{lon} lon/"
def specific_graticule_cache_for_match_lists(area, lat, lon):
    """Folder holding the per-graticule match lists."""
    return graticule_cache(area) + f"match_lists/{lat} lat/{lon} lon/"
def area_name_for_graticule(lat_anchor, lon_anchor):
    """Human-readable graticule name, e.g. "50 20"."""
    return f"{lat_anchor} {lon_anchor}"
@ -137,6 +143,7 @@ def prepare_graticule_data_files(graticule_coverage):
myfile.write("data prepared")
print("split data across graticules")
def prepare_osm_graticule_files(graticule_coverage):
"""
list OSM data in its graticule and surrounding ones
@ -172,6 +179,7 @@ def prepare_osm_graticule_files(graticule_coverage):
offset_lon = 179
add_entry_to_graticule_file(entry, 'osm', offset_lat, offset_lon, graticule_coverage)
def prepare_atp_graticule_files(graticule_coverage):
for atp_code in obtain_atp_data.all_spider_codes_iterator():
if atp_code in config.ignored_atp_codes():
@ -189,7 +197,7 @@ def prepare_atp_graticule_files(graticule_coverage):
# expensive qa check for any ATP entry across the world
cleaned_atp = matcher.clean_atp_entry_with_procesed_geometry(atp, atp_code)
if cleaned_atp == None:
continue # failed to pass qa
continue # failed to pass qa
try:
add_entry_to_graticule_file(cleaned_atp, 'atp', lat_floor, lon_floor, graticule_coverage)
except FileNotFoundError:
@ -198,6 +206,7 @@ def prepare_atp_graticule_files(graticule_coverage):
print("entry was accepted as within range, then it crashed on adding to file")
raise
def specific_file(origin, lat, lon, area):
# TODO: better function name
"""
@ -264,6 +273,7 @@ def prepare_graticule_coverage_map(graticule_coverage):
shutil.copy(source, destination)
return graticule_index_filename_output
def generate_graticule_coverage_map(graticule_coverage, graticule_index_path):
with open(graticule_index_path, 'w') as outfile:
area = graticule_coverage
@ -289,6 +299,7 @@ def generate_graticule_coverage_map(graticule_coverage, graticule_index_path):
outfile.write(leafleter.generator.get_polygon(shape, color="green", fill_color="green", link=file))
outfile.write(leafleter.generator.get_html_page_suffix())
def osm_link_locator(lat, lon):
    """Return an openstreetmap.org link with a marker at the given coordinates.

    The OSM website expects latitude first: ?mlat=<lat>&mlon=<lon> for the
    marker and #map=<zoom>/<lat>/<lon> for the viewport. The previous version
    had lat and lon swapped in both places, placing markers at transposed
    coordinates.
    """
    return "https://www.openstreetmap.org/?mlat=" + str(lat) + "&mlon=" + str(lon) + "#map=13/" + str(lat) + "/" + str(lon) + ""

View file

@ -1,12 +1,11 @@
import os
import matplotlib.pyplot as plt
import warnings
# hide following message:
# UserWarning: Unable to import Axes3D. This may be due to multiple versions of Matplotlib being installed (e.g. as a system package and as a pip package). As a result, the 3D projection is not available.
# TODO: fix it properly
warnings.filterwarnings("ignore", category=UserWarning, message=".*Axes3D.*")
import matplotlib.pyplot as plt
import os
class MatchDistanceDestributionReportCreator:
def __init__(self, identifier, area_name):

View file

@ -5,6 +5,7 @@ from dulwich.objects import Commit
from datetime import datetime
import time
def get_latest_commit_date_for_file(repo_path, file_path):
r = Repo(repo_path)
p = b"the/file/to/look/for"

View file

@ -77,10 +77,10 @@ def common_shared_name_parts():
# in English
"furniture", "pharmacy", "store", "shop", 'cafe', "bar", "company", 'storage',
'opticians', 'jewelers', 'bakery',
"car", # to prevent matching unrelated car rentals
"car", # to prevent matching unrelated car rentals
"burger", "kitchen", 'paper',
'self', 'storage', # "Self Storage" is common in names
'self', 'storage', # "Self Storage" is common in names
'house',
@ -101,6 +101,7 @@ def get_name_sources(atp_tags):
name_sources.append(short_name)
return name_sources
def get_filter_names_from_atp_dataset(current_atp):
filter_names = []
for atp in current_atp:
@ -143,6 +144,7 @@ def entries_in_range(osm_index, distance_scan_in_kilometers, atp):
):
yield osm
def get_matches(osm_data, atp_data):
match_list = []
filter_names = get_filter_names_from_atp_dataset(atp_data)
@ -199,12 +201,14 @@ def load_atp_without_qa(atp_code):
return []
return load_atp_from_json_without_qa(data, atp_code)
def load_and_clean_atp(atp_code):
data = open_atp_file(atp_code)
if data == None:
return []
return load_atp_from_json_and_clean_it(data, atp_code)
def open_atp_file(atp_code):
filename = config.atp_unpacked_folder() + atp_code + '.geojson'
if os.path.isfile(filename) == False:
@ -218,6 +222,7 @@ def open_atp_file(atp_code):
# no need to report also here, so lets fail silently
return None
def load_atp_from_json_without_qa(data, atp_code):
returned = []
for entry in data['features']:
@ -239,6 +244,7 @@ def load_atp_from_json_and_clean_it(data, atp_code):
returned.append(clean_atp_entry_with_procesed_geometry(atp, atp_code))
return returned
def clean_atp_entry_with_procesed_geometry(atp, atp_code):
if atp['center'] == None:
# normal, especially as ATP may suppress blatantly wrong locations
@ -251,6 +257,7 @@ def clean_atp_entry_with_procesed_geometry(atp, atp_code):
return None
return atp
def is_location_clearly_implausible(object_data, center):
if "addr:country" in object_data:
if object_data["addr:country"].lower() not in shared.valid_country_codes():

View file

@ -11,6 +11,7 @@ config = __import__("0_config")
def cache_path():
    """Location of the on-disk Nominatim response cache, inside the configured cache folder."""
    return f"{config.cache_folder()}nominatim_cache"
nominatim_cache = diskcache.Cache(cache_path(), eviction_policy="none")
print(len(nominatim_cache), "entries cached by Nominatim cache")

38
qa.py
View file

@ -4,6 +4,7 @@ import shops
import shared
config = __import__("0_config")
def remove_bad_data(data, atp_code):
"""
removes bad data: dubious, broken and low quality tags
@ -19,6 +20,7 @@ def remove_bad_data(data, atp_code):
config.show_info_about_spider_to_debug_it(atp_code)
raise
def remove_bad_data_wrapped(data, atp_code):
for key in list(data.keys()):
if data[key] == None:
@ -42,7 +44,6 @@ def remove_bad_data_wrapped(data, atp_code):
if key in data:
del data[key]
data = remove_whitespace_suffix_prefix(data, atp_code)
data = remove_or_fix_bad_links(data, atp_code)
@ -100,6 +101,7 @@ def remove_bad_data_wrapped(data, atp_code):
return data
def handle_type_keys(data, atp_code):
data = remove_type_keys_where_it_is_a_duplicate(data, atp_code)
@ -117,6 +119,7 @@ def handle_type_keys(data, atp_code):
return None
return data
def remove_fields_or_entire_elements_on_html_weird_characters_or_empty_or_similar_bad_fields(data, atp_code):
for key in list(data.keys()):
value = data.get(key)
@ -126,7 +129,7 @@ def remove_fields_or_entire_elements_on_html_weird_characters_or_empty_or_simila
return None
for banned_character in "<>[]{}?=":
if banned_character in value:
internal_atp = ['@source_uri', 'atp_ref'] # atp_ref is set by handle_ref_tag
internal_atp = ['@source_uri', 'atp_ref'] # atp_ref is set by handle_ref_tag
if key in (config.keys_with_value_link() + internal_atp) and banned_character in ["?", "="]:
continue
if config.is_bogus_key_worth_mentioning(key, atp_code):
@ -144,7 +147,7 @@ def remove_fields_or_entire_elements_on_html_weird_characters_or_empty_or_simila
continue
if "undefined" in value.lower():
if len(value) > 20 and key in [
'@source_uri', # @source_uri = https://www.gamestop.com/on/demandware.store/Sites-gamestop-us-Site/default/Stores-FindStores?hasCondition=false&hasVariantsAvailableForLookup=false&hasVariantsAvailableForPickup=false&source=plp&showMap=false&products=undefined:1
'@source_uri', # @source_uri = https://www.gamestop.com/on/demandware.store/Sites-gamestop-us-Site/default/Stores-FindStores?hasCondition=false&hasVariantsAvailableForLookup=false&hasVariantsAvailableForPickup=false&source=plp&showMap=false&products=undefined:1
]:
continue
if config.is_bogus_key_worth_mentioning(key, atp_code):
@ -153,6 +156,7 @@ def remove_fields_or_entire_elements_on_html_weird_characters_or_empty_or_simila
continue
return data
def is_object_actually_not_open_at_all(data, atp_code):
if data.get("opening_hours") == "Mo-Su closed":
# not an actual active shop, marked in highly cryptic way
@ -165,8 +169,8 @@ def is_object_actually_not_open_at_all(data, atp_code):
# as they are already scheduled to disappear, so what is the point of adding them?
return True
for indicator in ["store is opening", "permanently closed", "< closed", "> closed", "service areas only",
'temporarily closed', 'Book now', ' opens ', # https://github.com/alltheplaces/alltheplaces/issues/11868
]:
'temporarily closed', 'Book now', ' opens ', # https://github.com/alltheplaces/alltheplaces/issues/11868
]:
# 'addr:street_address': 'This store is opening September/October 2024'
# https://github.com/alltheplaces/alltheplaces/issues/9055
# https://github.com/alltheplaces/alltheplaces/issues/11199
@ -185,6 +189,7 @@ def is_object_actually_not_open_at_all(data, atp_code):
return True
return False
def remove_unwanted_object_types(data, atp_code):
if atp_code == "woolworths_au":
if data.get("type") in ["PETROL", "AMPOL", "CALTEXWOW"]:
@ -195,7 +200,7 @@ def remove_unwanted_object_types(data, atp_code):
if data.get("amenity") == "vending_machine":
if "vending" not in data:
if atp_code not in [
'tymebank_za', # https://github.com/alltheplaces/alltheplaces/pull/10180#issue-2514530869 - ATPTODO
'tymebank_za', # https://github.com/alltheplaces/alltheplaces/pull/10180#issue-2514530869 - ATPTODO
]:
print()
print("amenity = vending_machine without vending tag, skipping it")
@ -208,7 +213,7 @@ def remove_unwanted_object_types(data, atp_code):
if data.get("post_office") == "post_partner":
# If it is a post partner, then it is not a post office but just an additional aspect/facet of another POI
if config.allow_very_low_priority_atp_logging():
if atp_code != 'midcounties_cooperative_gb': # see https://github.com/alltheplaces/alltheplaces/pull/11944
if atp_code != 'midcounties_cooperative_gb': # see https://github.com/alltheplaces/alltheplaces/pull/11944
print()
print("amenity=post_office post_office=post_partner - skipping this entry")
rich.print(data)
@ -247,6 +252,7 @@ def remove_unwanted_object_types(data, atp_code):
return None
return data
def remove_whitespace_suffix_prefix(data, atp_code):
for key in list(data.keys()):
if data[key].strip() != data[key]:
@ -271,6 +277,7 @@ def remove_whitespace_suffix_prefix(data, atp_code):
del data[key]
return data
def remove_or_fix_bad_links(data, atp_code):
if "website_2" in data and "website" not in data:
if config.is_bogus_key_worth_mentioning("website_2", atp_code):
@ -313,6 +320,7 @@ def remove_or_fix_bad_links(data, atp_code):
del data[key]
return data
def remove_bad_address(data, atp_code):
field = 'addr:street_address'
if field in data:
@ -381,6 +389,7 @@ def remove_bad_address(data, atp_code):
data = remove_duplicated_state_country_fields(data, atp_code)
return data
def is_this_address_suspiciously_short_for_addr_full(value):
if "," in value:
return False
@ -397,6 +406,7 @@ def is_this_address_suspiciously_short_for_addr_full(value):
value = value.replace(entry, "")
return len(value.strip()) < 14
def remove_duplicated_state_country_fields(data, atp_code):
if "addr:state" not in data:
return data
@ -405,8 +415,8 @@ def remove_duplicated_state_country_fields(data, atp_code):
if data["addr:state"] != data["addr:country"]:
return data
if data["addr:state"] in [
"PR", # Puerto Rico, it seems to be valid there!
"IM", # Isle of Man
"PR", # Puerto Rico, it seems to be valid there!
"IM", # Isle of Man
]:
return data
if config.is_bogus_key_worth_mentioning("addr:state", atp_code):
@ -419,6 +429,7 @@ def remove_duplicated_state_country_fields(data, atp_code):
del data["addr:state"]
return data
def throw_away_name_from_atp(data, atp_code):
if data.get("brand") == data["name"]:
# no need to do this if name matches brand anyway
@ -444,6 +455,7 @@ def throw_away_name_from_atp(data, atp_code):
# many many such cases, see atp_listing_name-based reports
return True
def handle_name_and_brand_tags(data, atp_code):
if "brand" not in data:
if config.is_missing_brand_field_worth_mentioning(atp_code):
@ -475,6 +487,7 @@ def handle_name_and_brand_tags(data, atp_code):
data["name"] = data["brand"]
return data
def remove_type_keys_where_it_is_a_duplicate(data, atp_code):
for type_key in config.generic_type_keys():
# these could be reported back to ATP, I guess
@ -497,8 +510,8 @@ def remove_type_keys_where_it_is_a_duplicate(data, atp_code):
del data["type"]
elif atp_code == "nike":
if data.get("type") in [
"BEACON", # maybe indicates franchise?
]:
"BEACON", # maybe indicates franchise?
]:
del data["type"]
elif atp_code == "department_veterans_affairs":
if data.get("type") in ["facility"]:
@ -522,6 +535,7 @@ def remove_type_keys_where_it_is_a_duplicate(data, atp_code):
del data[key] # an useless duplicate
return data
def remove_bad_email_data(data, atp_code):
if 'email' not in data:
return data
@ -796,7 +810,7 @@ def is_empty_value(key, value, atp_code):
return True
if value.lower() == "na":
if key == "addr:country":
return False # Namibia
return False # Namibia
return True
useless_values = {
# see now fixed https://github.com/alltheplaces/alltheplaces/issues/8978

View file

@ -29,16 +29,19 @@ def commit_file(repo_path, file_path, author, commit_message):
dulwich.porcelain.add(repo=repo, paths=[file_path])
dulwich.porcelain.commit(repo=repo, message=commit_message.encode(), author=author.encode())
def push_changes():
    """Push the ATP repo's current branch to the 'my_repo' remote via the git CLI.

    Shells out instead of using dulwich because dulwich.porcelain.push is
    unable to use credentials. The push must run exactly once — the rendered
    diff showed the ``os.system`` line duplicated (before/after pair of a
    comment-spacing change), which taken literally would push twice.
    """
    command = 'cd "' + config.repo_location_atp() + '" && git push my_repo'
    print(command)
    os.system(command)  # dulwich.porcelain.push is unable to use credentials
def reset_active_work_in_repo(repo_path):
    """Discard any in-progress work in the repo at *repo_path*.

    Checks out 'master' (attached) and hard-resets the worktree to HEAD.
    """
    repository = Repo(repo_path)
    dulwich.porcelain.update_head(repository, 'master', detached=False)
    dulwich.porcelain.reset(repository, 'hard', 'HEAD')
def share_changes(repo_path, spider_filepath, atp_code, value_examples):
branch_name = atp_code + "_suspiciously_short_addrfull"
checkout_branch(repo_path, branch_name)
@ -47,6 +50,7 @@ def share_changes(repo_path, spider_filepath, atp_code, value_examples):
webbrowser.open("https://github.com/alltheplaces/alltheplaces/compare/master...matkoniecz:" + branch_name + "?expand=1")
reset_active_work_in_repo(repo_path)
def fix_addr_street_set_as_addr_full(atp_code, value_examples, counter_examples):
repo_path = config.repo_location_atp()

View file

@ -5,6 +5,7 @@ import webbrowser
import urllib
import regex
def has_japanese_or_chinese_or_korean_or_arabic_text(value):
# https://stackoverflow.com/a/66601628/4130619
# https://stackoverflow.com/a/30100900/4130619
@ -13,6 +14,7 @@ def has_japanese_or_chinese_or_korean_or_arabic_text(value):
return True
return False
def link_to_point_in_osm(lat, lon):
    """Return an openstreetmap.org URL with a marker at (lat, lon), zoomed to level 19."""
    return f"https://www.openstreetmap.org/?mlat={lat}&mlon={lon}#map=19/{lat}/{lon}"
@ -30,6 +32,7 @@ def delete_files_in_folder(folder):
if os.path.isfile(file_path):
os.remove(file_path)
def delete_nested_files_folders_in_folder(folder):
for filename in os.listdir(folder):
item_path = os.path.join(folder, filename)
@ -42,6 +45,7 @@ def delete_nested_files_folders_in_folder(folder):
delete_nested_files_folders_in_folder(item_path)
os.rmdir(item_path)
def get_free_space_in_mb(path):
    """Free disk space available at *path*, in mebibytes (float)."""
    usage = shutil.disk_usage(path)
    return usage.free / (1024 * 1024)
@ -51,6 +55,7 @@ def open_prepared_issue_form(title, body):
url = f'https://github.com/alltheplaces/alltheplaces/issues/new?title={urllib.parse.quote(title)}&body={urllib.parse.quote(body)}'
webbrowser.open(url)
def valid_country_codes():
# NSI lists only part of them
# https://github.com/alltheplaces/alltheplaces/blob/8c28db93cb6df154ae3a4651b57d175e272ae416/ci/check_spider_naming_consistency.py#L9
@ -308,6 +313,7 @@ def valid_country_codes():
"zw",
]
def country_data():
return {
'Poland': {

View file

@ -552,7 +552,6 @@ class NominatimMismatchReportCreator:
</body>
</html>"""
def generate_geojson_report(self):
with open(config.output_folder() + self.output_geojson_file(), 'w') as f:
json.dump(serializing.generate_geojson_structure(self.only_atp_match_list()), f)
@ -725,7 +724,6 @@ def produce_map_analysis_for_atp_data(atp_code, area_name, match_list, cache_onl
}
def get_center(dataset):
max_lat = -90
max_lon = -180

View file

@ -1,6 +1,7 @@
import unittest
config = __import__("0_config")
class ConfigTests(unittest.TestCase):
def test_mathworks(self):
self.assertEqual(2 + 1, 3)

View file

@ -315,12 +315,12 @@ website = http://www.dunkindonuts.sa"""
'name': 'Avia XPress',
'website': 'https://avia.nl/tankstations/avia-xpress-nieuw-weerdinge',
'brand': 'Avia XPress',
'brand:wikidata': 'Q124611203', # AVIA XPress, P127 set (owned by AVIA International Q300147)
'brand:wikidata': 'Q124611203', # AVIA XPress, P127 set (owned by AVIA International Q300147)
}
osm_tags = {
'amenity': 'fuel',
'brand': 'Avia',
'brand:wikidata': 'Q300147', # AVIA International
'brand:wikidata': 'Q300147', # AVIA International
'name': 'Avia',
}
self.assertEqual(self.this_tag_lists_match(atp_tags, osm_tags), True)
@ -332,16 +332,17 @@ website = http://www.dunkindonuts.sa"""
'name': 'Tamoil express',
'website': 'https://avia.nl/tankstations/tamoil-oegstgeest',
'brand': 'Tamoil express',
'brand:wikidata': 'Q124658477', # Tamoil express, P127 set (owned by Tamoil Q706793)
'brand:wikidata': 'Q124658477', # Tamoil express, P127 set (owned by Tamoil Q706793)
}
osm_tags = {
'amenity': 'fuel',
'brand': 'Tamoil',
'brand:wikidata': 'Q706793', # Tamoil
'brand:wikidata': 'Q706793', # Tamoil
'name': 'Tamoil',
}
self.assertEqual(self.this_tag_lists_match(atp_tags, osm_tags), True)
class CanonicalValueTests(unittest.TestCase):
def test_simple_canonical_value(self):
self.assertEqual(config.canonical_feature({'shop': 'butcher'}), "shop=butcher")
@ -389,15 +390,15 @@ class CanonicalValueTests(unittest.TestCase):
def test_group_electrical_shops_tagged_in_a_different_way(self):
pass
#TODO - actually handle this
#self.assertEqual(config.the_same_feature_type({'shop': 'electrical'}, {'shop': 'electric'}), True)
#self.assertEqual(config.the_same_feature_type({'shop': 'trade', 'trade': 'electrical'}, {'shop': 'electrical'}), True)
# TODO - actually handle this
# self.assertEqual(config.the_same_feature_type({'shop': 'electrical'}, {'shop': 'electric'}), True)
# self.assertEqual(config.the_same_feature_type({'shop': 'trade', 'trade': 'electrical'}, {'shop': 'electrical'}), True)
def test_group_building_materials_shops_tagged_in_a_different_way(self):
pass
#TODO - actually handle this
#self.assertEqual(config.the_same_feature_type({'shop': 'trade', 'trade': 'building_materials'}, {'shop': 'building_materials'}), True)
#self.assertEqual(config.the_same_feature_type({'shop': 'trade', 'trade': 'tiles'}, {'shop': 'tiles'}), True)
# TODO - actually handle this
# self.assertEqual(config.the_same_feature_type({'shop': 'trade', 'trade': 'building_materials'}, {'shop': 'building_materials'}), True)
# self.assertEqual(config.the_same_feature_type({'shop': 'trade', 'trade': 'tiles'}, {'shop': 'tiles'}), True)
def test_do_not_use_clothes_key_for_usual_clothes_shops(self):
self.assertEqual(config.the_same_feature_type({'shop': 'clothes', 'clothes': 'men'}, {'shop': 'clothes', 'clothes': 'women'}), True)
@ -411,6 +412,7 @@ class CanonicalValueTests(unittest.TestCase):
# TODO: deal with shop=trade trade=
# TODO: see 19_detect_unhandled_cascading_values_for_canonical_poi_types.py
class TestSupportFunctions(unittest.TestCase):
def test_null_island_is_not_in_poland(self):
self.assertEqual(True, matcher.is_location_clearly_implausible({"addr:country": "PL"}, {'lat': 0, "lon": 0}))

View file

@ -70,50 +70,50 @@ class ProcessingTests(unittest.TestCase):
def test_useless_store_type_key_removal_example(self):
data = {
'ref': '10966',
'delivery': 'yes',
'storeType': 'pret', # to be removed
'wheelchair': 'no',
'internet_access': 'wlan',
'@source_uri': 'https://api1.pret.com/v1/shops',
'@spider': 'pret_a_manger',
'amenity': 'fast_food',
'cuisine': 'sandwich',
'short_name': 'Pret',
'takeaway': 'yes',
'addr:street_address': 'Vault 1 Station Building, Park Place, Park Street Upper',
'addr:city': 'Dublin',
'addr:postcode': 'D02R2H5',
'addr:country': 'IE',
'name': 'Hatch Street',
'phone': '+353 1 517 0158',
'brand': 'Pret A Manger',
'brand:wikidata': 'Q2109109',
'nsi_id': 'pretamanger-4f61b1',
'opening_hours_in_atp_format': 'Mo-Fr 07:00-18:00; Sa 08:00-18:00; Su 09:00-18:00'
}
'ref': '10966',
'delivery': 'yes',
'storeType': 'pret', # to be removed
'wheelchair': 'no',
'internet_access': 'wlan',
'@source_uri': 'https://api1.pret.com/v1/shops',
'@spider': 'pret_a_manger',
'amenity': 'fast_food',
'cuisine': 'sandwich',
'short_name': 'Pret',
'takeaway': 'yes',
'addr:street_address': 'Vault 1 Station Building, Park Place, Park Street Upper',
'addr:city': 'Dublin',
'addr:postcode': 'D02R2H5',
'addr:country': 'IE',
'name': 'Hatch Street',
'phone': '+353 1 517 0158',
'brand': 'Pret A Manger',
'brand:wikidata': 'Q2109109',
'nsi_id': 'pretamanger-4f61b1',
'opening_hours_in_atp_format': 'Mo-Fr 07:00-18:00; Sa 08:00-18:00; Su 09:00-18:00'
}
expected_new_data = {
'ref': '10966',
'delivery': 'yes',
'wheelchair': 'no',
'internet_access': 'wlan',
'@source_uri': 'https://api1.pret.com/v1/shops',
'@spider': 'pret_a_manger',
'amenity': 'fast_food',
'cuisine': 'sandwich',
'short_name': 'Pret',
'takeaway': 'yes',
'addr:street_address': 'Vault 1 Station Building, Park Place, Park Street Upper',
'addr:city': 'Dublin',
'addr:postcode': 'D02R2H5',
'addr:country': 'IE',
'name': 'Hatch Street',
'phone': '+353 1 517 0158',
'brand': 'Pret A Manger',
'brand:wikidata': 'Q2109109',
'nsi_id': 'pretamanger-4f61b1',
'opening_hours_in_atp_format': 'Mo-Fr 07:00-18:00; Sa 08:00-18:00; Su 09:00-18:00'
}
'ref': '10966',
'delivery': 'yes',
'wheelchair': 'no',
'internet_access': 'wlan',
'@source_uri': 'https://api1.pret.com/v1/shops',
'@spider': 'pret_a_manger',
'amenity': 'fast_food',
'cuisine': 'sandwich',
'short_name': 'Pret',
'takeaway': 'yes',
'addr:street_address': 'Vault 1 Station Building, Park Place, Park Street Upper',
'addr:city': 'Dublin',
'addr:postcode': 'D02R2H5',
'addr:country': 'IE',
'name': 'Hatch Street',
'phone': '+353 1 517 0158',
'brand': 'Pret A Manger',
'brand:wikidata': 'Q2109109',
'nsi_id': 'pretamanger-4f61b1',
'opening_hours_in_atp_format': 'Mo-Fr 07:00-18:00; Sa 08:00-18:00; Su 09:00-18:00'
}
new_data = qa.remove_type_keys_where_it_is_a_duplicate(data, 'pret_a_manger')
self.assertEqual(new_data, expected_new_data)
@ -122,35 +122,35 @@ class ProcessingTests(unittest.TestCase):
def test_crash(self):
# to prevent regressions
data = {
'ref': '49418',
'delivery': 'yes',
'addr:city:en': 'HAERBIN',
'addr:city:zh': '哈尔滨市',
'addr:street_address:en': 'NO.299,Haxi Str.,Nangang District,Harbin\n',
'addr:street_address:zh': '黑龙江省哈尔滨市',
'addr:full:en': 'NO.299, Haxi Str., Nangang District, Harbin, Heilongjiang Province Harbin, HAERBIN, 150000',
'addr:full:zh': '黑龙江省哈尔滨市, 哈尔滨市南岗区哈西大街299号哈西红博购物广场一层F1023单元, 哈尔滨市, 150000',
'branch:en': 'Harbin Haxi Hongbo Store',
'branch:zh': '哈尔滨西城红场店',
'@source_uri': 'https://www.starbucks.com.cn/api/stores/nearby?lat=46.0310334&lon=127.6518582&limit=1000&locale=ZH&features=&radius=100000',
'amenity': 'cafe',
'cuisine': 'coffee_shop',
'@spider': 'starbucks_cn',
'brand:en': 'Starbucks',
'brand:zh': '星巴克',
'name:en': 'Starbucks',
'name:zh': '星巴克',
'takeaway': 'yes',
'addr:full': '黑龙江省哈尔滨市, 哈尔滨市南岗区哈西大街299号哈西红博购物广场一层F1023单元, 哈尔滨市, 150000',
'addr:street_address': '黑龙江省哈尔滨市',
'addr:city': '哈尔滨市',
'addr:postcode': '150000',
'addr:country': 'CN',
'name': '星巴克',
'branch': '哈尔滨西城红场店',
'brand': '星巴克',
'brand:wikidata': 'Q37158',
'nsi_id': 'starbucks-823e31'
'ref': '49418',
'delivery': 'yes',
'addr:city:en': 'HAERBIN',
'addr:city:zh': '哈尔滨市',
'addr:street_address:en': 'NO.299,Haxi Str.,Nangang District,Harbin\n',
'addr:street_address:zh': '黑龙江省哈尔滨市',
'addr:full:en': 'NO.299, Haxi Str., Nangang District, Harbin, Heilongjiang Province Harbin, HAERBIN, 150000',
'addr:full:zh': '黑龙江省哈尔滨市, 哈尔滨市南岗区哈西大街299号哈西红博购物广场一层F1023单元, 哈尔滨市, 150000',
'branch:en': 'Harbin Haxi Hongbo Store',
'branch:zh': '哈尔滨西城红场店',
'@source_uri': 'https://www.starbucks.com.cn/api/stores/nearby?lat=46.0310334&lon=127.6518582&limit=1000&locale=ZH&features=&radius=100000',
'amenity': 'cafe',
'cuisine': 'coffee_shop',
'@spider': 'starbucks_cn',
'brand:en': 'Starbucks',
'brand:zh': '星巴克',
'name:en': 'Starbucks',
'name:zh': '星巴克',
'takeaway': 'yes',
'addr:full': '黑龙江省哈尔滨市, 哈尔滨市南岗区哈西大街299号哈西红博购物广场一层F1023单元, 哈尔滨市, 150000',
'addr:street_address': '黑龙江省哈尔滨市',
'addr:city': '哈尔滨市',
'addr:postcode': '150000',
'addr:country': 'CN',
'name': '星巴克',
'branch': '哈尔滨西城红场店',
'brand': '星巴克',
'brand:wikidata': 'Q37158',
'nsi_id': 'starbucks-823e31'
}
qa.remove_or_fix_bad_links(data, 'starbucks_cn')
qa.remove_bad_data_wrapped(data, 'starbucks_cn')