diff --git a/0_config.py b/0_config.py index 20e1060..1a3120b 100644 --- a/0_config.py +++ b/0_config.py @@ -10,11 +10,12 @@ def is_nonlocal_phone_worth_mentioning(atp_code): if datetime.datetime.now() < datetime.datetime(2024, 8, 27): return False if atp_code in [ - 'sainsburys', # https://github.com/alltheplaces/alltheplaces/pull/9242 + 'sainsburys', # https://github.com/alltheplaces/alltheplaces/pull/9242 ]: return False return True + def is_mismatching_name_worth_mentioning(atp_code): if atp_code in [ 'disc_replay_us', # https://github.com/alltheplaces/alltheplaces/pull/9410 - merged TODO, remove aroun 15 VIII 2024 @@ -23,72 +24,76 @@ def is_mismatching_name_worth_mentioning(atp_code): #'lewiatan_pl', # https://www.openstreetmap.org/changeset/148741713 - escalate to note when on top, move below pulls # see also https://www.openstreetmap.org/note/4349666 for Lewiatan # and https://www.openstreetmap.org/note/4349667 - 'coop_food_gb', # https://github.com/alltheplaces/alltheplaces/pull/8951 - stuck for now - 'speedy_stop_us', # https://github.com/alltheplaces/alltheplaces/pull/9411 - stuck - 'blue_bottle_liquors_za', # requires local knowledge - ask after higher ranked ones (including PRs) are processed - "waffle_house_us", # https://github.com/alltheplaces/alltheplaces/issues/8783 - 'seven_eleven_ca_us', # tricky code, lets create an issue for it when above will get processed - 'poczta_polska_pl', # quite tricky, neither "name=UP Mosina" nor "name=Poczta Polska" are clearly wrong - 'nicholsons_pubs_gb', # not totally wrong, but have an extra elements - 'champion_us', # https://github.com/alltheplaces/alltheplaces/pull/9114 - broken, requires fixing + 'coop_food_gb', # https://github.com/alltheplaces/alltheplaces/pull/8951 - stuck for now + 'speedy_stop_us', # https://github.com/alltheplaces/alltheplaces/pull/9411 - stuck + 'blue_bottle_liquors_za', # requires local knowledge - ask after higher ranked ones (including PRs) are processed + "waffle_house_us", # https://github.com/alltheplaces/alltheplaces/issues/8783 + 'seven_eleven_ca_us', # tricky code, lets create an issue for it when above will get processed + 'poczta_polska_pl', # quite tricky, neither "name=UP Mosina" nor "name=Poczta Polska" are clearly wrong + 'nicholsons_pubs_gb', # not totally wrong, but have an extra elements + 'champion_us', # https://github.com/alltheplaces/alltheplaces/pull/9114 - broken, requires fixing # not obvious how - 'bank_of_scotland_gb', # https://github.com/alltheplaces/alltheplaces/blob/master/locations/spiders/bank_of_scotland_gb.py - 'kiehls_us', # https://github.com/alltheplaces/alltheplaces/blob/master/locations/spiders/kiehls_us.py + 'bank_of_scotland_gb', # https://github.com/alltheplaces/alltheplaces/blob/master/locations/spiders/bank_of_scotland_gb.py + 'kiehls_us', # https://github.com/alltheplaces/alltheplaces/blob/master/locations/spiders/kiehls_us.py ]: return False return datetime.datetime.now() > datetime.datetime(2024, 8, 15) + def is_null_specified_as_text_worth_mentioning(atp_code): - return False # TODO remove once ATP updates, several such issues were solved + return False # TODO remove once ATP updates, several such issues were solved return atp_code not in [ # undefined - 'teriyaki_madness_us', # https://github.com/alltheplaces/alltheplaces/issues/8830 (TODO remove once ATP updates) + 'teriyaki_madness_us', # https://github.com/alltheplaces/alltheplaces/issues/8830 (TODO remove once ATP updates) # for next one: # eg_america_us.geojson has addr:street_address = Undefined # b/n - 
'paczkomat_inpost_pl', # https://github.com/alltheplaces/alltheplaces/issues/8844 TODO remove once ATP updates - 'poczta_polska_pl', 'groszek_pl', # https://github.com/alltheplaces/alltheplaces/issues/8994 https://github.com/alltheplaces/alltheplaces/pull/8998 + 'paczkomat_inpost_pl', # https://github.com/alltheplaces/alltheplaces/issues/8844 TODO remove once ATP updates + 'poczta_polska_pl', 'groszek_pl', # https://github.com/alltheplaces/alltheplaces/issues/8994 https://github.com/alltheplaces/alltheplaces/pull/8998 ] + def is_failed_geocoding_worth_mentioning(atp_code): if atp_code in [ - 'allegro_one_box_pl', # https://github.com/alltheplaces/alltheplaces/issues/8900 (TODO: remove when fixed) - 'cukiernia_sowa_pl', # https://github.com/alltheplaces/alltheplaces/pull/6117 (pinged only, create issue/PR) - 'lewiatan_pl', # https://github.com/alltheplaces/alltheplaces/issues/9210 - 'tegut_de', # https://github.com/alltheplaces/alltheplaces/issues/9212 + 'allegro_one_box_pl', # https://github.com/alltheplaces/alltheplaces/issues/8900 (TODO: remove when fixed) + 'cukiernia_sowa_pl', # https://github.com/alltheplaces/alltheplaces/pull/6117 (pinged only, create issue/PR) + 'lewiatan_pl', # https://github.com/alltheplaces/alltheplaces/issues/9210 + 'tegut_de', # https://github.com/alltheplaces/alltheplaces/issues/9212 ]: return False if datetime.datetime.now() < datetime.datetime(2024, 8, 10): return False return True + def is_missing_brand_field_worth_mentioning(atp_code): if atp_code in [ - 'amf_bowling', # requires USA and/or bowling knowledge to recognize brands and non-brands - 'first_cash', # requires USA and/or pawnbroker knowledge to handle this - 'totalenergies', # broken/blocked website - 'lukoil', 'mol', # too complex for me - multibrand spiders - 'stadt_zuerich_ch', # special case, lets skip it (bunch of municipal data) - 'phillips_66_conoco_76', # there is request for improvement in ATP itself spider itself - 'mkrf_ru', # cultural institutions, brand is a bit weird here - 'gov_osservaprezzi_carburanti_it', # seems to have at least some independent - not sure is brand extracted where it is viable, but it would require more specialised knowledge. And anyway, claimed amenity=fuel do not seem plausible - 'ich_tanke_strom', # looks like it really has no brand data - 'vr_bank_de', # not really able to say is there brand here... - 'societe_generale', # maybe create https://github.com/alltheplaces/alltheplaces/issues/new + 'amf_bowling', # requires USA and/or bowling knowledge to recognize brands and non-brands + 'first_cash', # requires USA and/or pawnbroker knowledge to handle this + 'totalenergies', # broken/blocked website + 'lukoil', 'mol', # too complex for me - multibrand spiders + 'stadt_zuerich_ch', # special case, lets skip it (bunch of municipal data) + 'phillips_66_conoco_76', # there is request for improvement in ATP itself spider itself + 'mkrf_ru', # cultural institutions, brand is a bit weird here + 'gov_osservaprezzi_carburanti_it', # seems to have at least some independent - not sure is brand extracted where it is viable, but it would require more specialised knowledge. And anyway, claimed amenity=fuel do not seem plausible + 'ich_tanke_strom', # looks like it really has no brand data + 'vr_bank_de', # not really able to say is there brand here... 
+ 'societe_generale', # maybe create https://github.com/alltheplaces/alltheplaces/issues/new # societe_generale - add missing brand tag based on location # It seems to have different acceptable values but for example in France it is always # but: is it always the same in France? # see http://overpass-turbo.eu/s/1PaT # I opened https://www.openstreetmap.org/note/4365418 for now - 'gov_bio123_de', # I see no way to reliably pull brands from that + 'gov_bio123_de', # I see no way to reliably pull brands from that ]: return False # very rarely can be acted upon so far, other reports are more fruitful if datetime.datetime.now() < datetime.datetime(2025, 8, 10): return False + def consider_reporting_broken_spider(atp_code): # waiting for https://github.com/alltheplaces/alltheplaces/issues/8845 at least # https://github.com/alltheplaces/alltheplaces/issues/8791#issuecomment-2230123715 - detect proxy failures @@ -98,6 +103,7 @@ def consider_reporting_broken_spider(atp_code): #print()def table_row(atp_code, statistics): return + def cache_folder(): return os.getenv("OSM_ATM_MATCHER_CACHE_FOLDER") + "/" @@ -109,15 +115,19 @@ def atp_cache_folder(): def atp_unpacked_folder(): return atp_cache_folder() + "output/" + def build_temporary_files(): return cache_folder() + "build_temporary_files/" + def nominatim_requests_missing_from_cache(): return build_temporary_files() + "nominatim_requests_missing_from_cache.txt" + def link_status_requests_missing_from_cache(): return build_temporary_files() + "link_status_requests_missing_from_cache.txt" + def atp_tags_very_likely_not_usable_for_osm_import(): return [ "@spider", # internal data, but useful for later identification @@ -146,7 +156,7 @@ def atp_tags_very_likely_not_usable_for_osm_import(): # OSM tag, but ATP often uses where it have on parsing address "addr:full", - 'branch', # full of fake data rather than actual branch data, see + 'branch', # full of fake data rather than actual branch data, see # https://github.com/alltheplaces/alltheplaces/pull/8782 # https://github.com/alltheplaces/alltheplaces/pull/8843 # but removed on request @@ -161,7 +171,7 @@ def atp_tags_very_likely_not_usable_for_osm_import(): 'charge:cng', 'charge:adblue', 'charge:biodiesel', - 'charge', # 0.40 GBP/kWh etc on man_made=charging_point + 'charge', # 0.40 GBP/kWh etc on man_made=charging_point # this value is a rename of what ATP gives as name tag # this value rename was done as warning - quality of it in ATP is not good enough @@ -184,8 +194,8 @@ def atp_tags_very_likely_not_usable_for_osm_import(): def atp_tags_to_be_remove_completely_and_ignored(): return [ - 'ref:google', # confirm that it is not related to fetching data FROM google TODO - 'ref:facebook', # what is that? + 'ref:google', # confirm that it is not related to fetching data FROM google TODO + 'ref:facebook', # what is that? "nsi_id", # internal data # unsuitable for inclusion in OSM, but still useful for matching... 
@@ -193,11 +203,12 @@ def atp_tags_to_be_remove_completely_and_ignored(): #"addr:street_address", ] + def is_zyte_proxy_spider(atp_code): if atp_code in [ # Zyte API account suspended # https://github.com/alltheplaces/alltheplaces/issues/6433 - 'obi_ru', # https://alltheplaces-data.openaddresses.io/runs/2024-06-22-13-32-09/logs/obi_ru.txt + 'obi_ru', # https://alltheplaces-data.openaddresses.io/runs/2024-06-22-13-32-09/logs/obi_ru.txt 'mitre_10_nz', 'gap_us', ]: return True @@ -222,7 +233,7 @@ def is_zyte_proxy_spider(atp_code): def is_empty_file_for_spider_worth_mentioning(atp_code): if atp_code in [ - 'first_national_real_estate_au', # https://github.com/alltheplaces/alltheplaces/issues/8693 + 'first_national_real_estate_au', # https://github.com/alltheplaces/alltheplaces/issues/8693 ]: return False if is_zyte_proxy_spider(atp_code): @@ -237,20 +248,20 @@ def is_broken_file_for_spider_worth_mentioning(atp_code): if atp_code in [ "fedex", # https://github.com/alltheplaces/alltheplaces/issues/6789 ]: - return False # reported already - #silenced for now, start reporting again if ATP welcomes and processes such reports - #print(item_path, "has broken file, parsing fails") + return False # reported already + # silenced for now, start reporting again if ATP welcomes and processes such reports + # print(item_path, "has broken file, parsing fails") return False def ignored_atp_codes(): ignored = [ - 'moneygram', # https://github.com/alltheplaces/alltheplaces/issues/6784 - 'cukiernia_sowa_pl', # often amenity=cafe, not sure how to handle - 'our_airports', # not first party data, claims public domain - is it OK to use in OSM? + 'moneygram', # https://github.com/alltheplaces/alltheplaces/issues/6784 + 'cukiernia_sowa_pl', # often amenity=cafe, not sure how to handle + 'our_airports', # not first party data, claims public domain - is it OK to use in OSM? "nsw_ambulance_au", # no brand, not sure is brand applying to those... 
"victorian_government_road_safety_cameras_au", # no brand and it is really correct - 'mall_maverick', # suspect and low quality data, anyway has no brand fields + 'mall_maverick', # suspect and low quality data, anyway has no brand fields # unclear licensing situation # see https://github.com/alltheplaces/alltheplaces/issues/8790 @@ -270,7 +281,7 @@ def processing_plan(): 'Poland': { 'part_of': ["europe"], 'country_code': 'pl', - 'bbox': [{'min_lat': 48.99, 'max_lat': 54.87, 'min_lon': 14.07, 'max_lon': 24.13}], # see http://bboxfinder.com/ + 'bbox': [{'min_lat': 48.99, 'max_lat': 54.87, 'min_lon': 14.07, 'max_lon': 24.13}], # see http://bboxfinder.com/ 'accepted': ['lubaszka_pl', 'poczta_polska_pl', 'dino_pl', 'lewiatan_pl', 'castorama_pl', 'ziko_apteka_pl', 'berlin_doner_kebap_pl', 'mcdonalds_pl', 'zahir_kebab_pl', 'lidl_pl', 'empik_pl', 'czas_na_herbate_pl', "orange_pl", "zabka_pl", "biedronka_pl", 'grycan_pl', 'spar_pl', 'groszek_pl', 'aldi_nord_pl', 'dealz_pl', 'polomarket_pl', 'coral_travel_pl', 'black_red_white_pl', 'carrefour_pl', 'tmobile_pl', 'break_and_wash_pl', 'topaz_pl', 'auchan_pl', 'plus_pl', 'hebe_pl', 'neonet_pl', 'top_market_pl', 'itaka_pl', 'shop_4f_pl', 'pekao_pl', 'epaka_pl', 'circle_k_pl', 'da_grasso_pl', 'alior_bank_pl', 'good_lood_pl', 'agata_meble_pl', 'leroy_merlin_pl', 'pizzeria_105_pl', 'delikatesy_centrum_pl', 'stokrotka_pl', 'bricomarche_pl', 'play_pl', 'avia_pl', 'decathlon_pl', 'pko_bank_polski_pl', 'putka_pl', 'costa_coffee_pl', 'orlen_paczka_pl', 'paczkomat_inpost_pl', 'pijalniewedel_pl', 'rossmann_pl', 'allegro_one_box_pl', 'odido_pl', 'media_expert_pl',], 'ignored': [ # see https://github.com/alltheplaces/alltheplaces/blob/master/locations/spiders/{spider_code}.py @@ -294,7 +305,7 @@ def processing_plan(): # http://bboxfinder.com/#47.260592,5.863953,55.094087,15.051270 'bbox': [{'min_lat': 47.260592, 'min_lon': 5.863953, 'max_lat': 55.094087, 'max_lon': 15.051270}], 'accepted': ['rewe_de', 'opel_rent_de', 'freddy_fresh_de', 'fressnapf_de', 'reformhaus_bacher_de', 'edeka_de', 'vodafone_de', 'steinecke_de', 'smileys_de', 'aldi_nord_de', 'endlich_ohne_de', 'alnatura_de', 'hit_de', 'lidl_de', 'netto_de', 'remax_de', 'tegut_de', 'hosselmann_de', 'deutsche_telekom_de', 'frittenwerk_de', 'super_bio_markt_de', 'penny_de', 'combi_de', 'denns_de', 'junge_de', 'diska_de', 'cadera_de', 'galeria_de', 'tedox_de', 'hinnerbaecker_de', 'sparkasse_de', 'dhl_express_de', 'wiener_feinbacker_heberer_de', 'decathlon_de', 'sanders_backstube_de', 'norma_de', 'dpd_de', 'aldi_sud_de', 'jacques_wein_depot_de', 'fussl_at_de', 'le_crobag_de', 'dominos_pizza_de', 'hypovereinsbank_de', 'saturn_de', 'gamestop_de', 'dunkin_de', 'toom_baumarkt_de', 'barbarossa_baeckerei_de', 'kieser_training_de', 'bilgro_de', 'avia_de', 'five_guys_de', 'kamps_de', 'allianz_de', 'bauspezi_de', 'rossmann_de', 'primark_de', 'markgrafen_getraenkemarkt_de', 'vlh_de', 'vr_bank_de', ], - 'ignored': ['commerzbank_de'], # https://github.com/alltheplaces/alltheplaces/issues/7600 + 'ignored': ['commerzbank_de'], # https://github.com/alltheplaces/alltheplaces/issues/7600 }, 'United States': { 'part_of': ["north-america"], @@ -312,7 +323,7 @@ def processing_plan(): ], 'accepted': ['arbys_us', 'verizon_us', 'anthonys_restaurants_us', 'target_us', 'nick_the_greek_us', 'mattress_depot_usa_us', 'ocharleys_us', 'quiktrip_us', 'eg_america_us', 'woods_coffee_us', 'festival_foods_us', 'evereve_us', 'buffalo_wild_wings_us', 'disc_replay_us', 'dirt_cheap_us', 'ross_dress_for_less_us', 'bi_mart_us', 'jdsports_us', 
'good_2_go_stores_us', 'bubbas33_us', 'kohls_us', 'taco_time_northwest_us', 'columbia_us', 'go_games_toys_us', 'twenty_four_hour_fitness_us', 'peets_coffee_us', 'super_one_foods_us', 'regal_nails_us', 'european_wax_center_us', 'burlington_us', 'kiehls_us', 'buckle_us', 'rosauers_us', 'zambrero_us', 'dangelos_us', 'gap_us', 'the_exercise_coach_us', 'linde_direct_us', 'the_yard_milkshake_bar_us', 'champs_sports_us', 'zaxbys_us', 'goodyear_us', 'carters_us', 'cato_us', 'starbucks_us', 'arctic_circle_us', 'sunoco_us', 'penzeys_us', 'bluepearl_pet_hospital_us', 'noodles_and_company_us', 'tonys_fresh_market_us', 'ruler_foods_us', 'baskin_robbins_us', 'mattress_warehouse_us', 'teriyaki_madness_us', 'wellstar_us', 'united_dairy_farmers_us', 'the_big_biscuit_us', 'sinclair_us', 'shoppers_food_us', 'asics_us', 'primanti_bros_us', 'tower_loan_us', 'coen_markets_us', 'sun_loan_us', 'modern_market_us', 'taco_cabana_us', 'cvs_us', 'murphy_usa_us', 'lou_malnatis_pizzeria_us', 'palm_beach_tan_us', 'lands_end_us', 'jersey_mikes_us', 'oshkosh_us', 'rocket_us', 'bank_of_hawaii_us', 'mirabito_us', 'aldi_sud_us', 'piggly_wiggly_us', 'eightyfour_lumber_us', 'citizens_us', 'condado_tacos_us', 'carpet_one_floor_and_home_us', 'concentra_us', 'great_wolf_resorts_us', 'bravo_us', 'ameriprise_us', 'chevron_us', 'fazolis_us', 'bevmo_us', 'stater_bros_us', 'old_chicago_us', 'california_closets_us', 'ruby_tuesday_us', 'schnucks_us', 'the_paper_store_us', 'xtramart_us', 'longchamp_us', 'waffle_house_us', 'bricks_minifigs_ca_us', 'juice_press_us', 'fastbreak_us', 'bealls_us', 'alltown_fresh_us', 'rooms_to_go_us', 'metro_diner_us', 'northside_hospital_us', 'five_guys_us', 'valvoline_us', 'lidl_us', 'foodland_grocery_us', 'daiso_us', 'fleet_feet_us', 'solidcore_us', 'foodtown_us', 'ponderosa_bonanza_steakhouse_us', 'bobevans_us', 'bha_us', 'foodlion_us', 'rubios_us', 'take_5_us', 'krispy_kreme_us', 'paper_source_us', 'securcare_self_storage_us', 'joann_us', 'dunhams_sports_us', 'talbots_us', 'bashas_us', 'salad_and_go_us', 'petland_us', 'kipling_us', 'all_secure_self_storage_us', 'builders_firstsource_us', 'empres_us', 'checkers_us', 'buca_di_beppo_us', 'beacon_and_bridge_market_us', 'woodmans_markets_us', 'del_taco_us', 'munchs_supply_us', 'roadys_us', 'sizzler_us', 'dunnedwards_us', 'ribcrib_us', '99_cents_only_us', 'natural_grocers_us', 'providence_us', 'racetrac_us', 'soapy_noble_us', 'fast_stop_us', 'gulf_pr_us', 'jiffy_lube_us', 'bathandbodyworks_us', 'road_ranger_us', 'culvers_us', 'cousins_subs_us', 'careone_us', 'outdoor_supply_hardware_us', 'dw_fresh_market_us', 'hopdoddy_us', 'dominos_pizza_us', 'riesbeck_food_markets_us', 'medstar_us', 'fiesta_mart_us', 'ford_dealers_us', 'independent_financial_us', 'fuzzys_taco_shop_us', 'francescas_us', 'americas_kids_us', 'pinch_a_penny_us', 'sun_auto_service_us', 'big_10_mart_us', 'pizza_hut_us', 'deutsche_bank_us', 'madewell_us', 'lasik_plus_us', 'sur_la_table_us', 'food_fair_market_us', 'redners_us', 'wards_restaurant_us', 'target_optical_us', 'mt_bank_us', 'its_fashions_us', 'frank_pepe_us', 'tcc_us', 'trader_joes_us', 'new_seasons_market_us', 'scooters_coffee_us', 'ann_taylor_us', 'mapco_us', 'planet_smoothie_us', 'mr_gattis_pizza_us', 'total_wine_and_more_us', 'commercial_tire_us', 'att_us', 'cinnaholic_us', 'thorntons_us', 'victra_us', 'j_crew_us', 'food_city_us', 'bucees_us', 'orkin_us', 'strack_and_van_til_us', 'dickeys_barbecue_pit_us', 'hardees_us', 'crossroads_iga_us', 'forever_21_us', 'academy_us', 'dagostino_us', 'salon_centric_us', 'piada_us', 
'captain_jays_us', 'shoe_carnival_us', 'allsups_yesway_us', 'potters_ace_home_center_us', 'pepperonis_us', 'simple_simons_pizza_us', 'seven_eleven_ca_us', 'pet_suites_us', 'family_fare_us', 'waxing_the_city_us', 'metro_us', 'casper_us', 'camping_world_us', 'cube_smart_us', 'calvin_klein_ca_us', 'key_food_us', 'big_louies_us', 'fleet_pride_us', 'mattress_firm_us', 'lane_bryant_us', 'monicals_pizza_us', 'marathon_petroleum_us', 'rotten_robbie_us', 'cumberland_farms_us', 'pga_tour_superstore_us', 'burberry_us', 'family_dollar_us', 'autozone_us', 'martins_us', 'buffalo_exchange_us', 'true_value_us', 'golf_galaxy_us', 'kwik_fill_us', 'xsport_fitness_us', 'blue_rhino_pr_us', 'alltown_us', 'crumbl_cookies_us', 'krist_oil_us', 'gamestop_us', 'rite_aid_us', 'big_boy_us', 'ra_sushi_us', 'hy_vee_us', 'pnc_bank_us', 'sleep_outfitters_us', 'swarovski_us', 'jmclaughlin_us', 'genghis_grill_us', 'carlsjr_us', 'rallys_us', 'round1_us', 'taco_johns_us', 'quickchek_us', 'clean_eatz_us', 'belle_tire_us', 'advance_auto_parts_us', '99_ranch_market_us', 'badcock_us', 'fastrac_cafe_us', 'daniels_jewelers_us', 'promedica_us', 'blains_farm_and_fleet_us', 'the_brass_tap_us', 'huddle_house_us', 'refuel_double_quick_us', 'champion_us', 'united_bank_us', 'van_wall_us', 'vocelli_pizza_us', 'speedee_us', 'windsor_us', 'krystal_us', 'marinemax_us', 'sleep_number_us', 'primark_us', 'mcmenamins_us', 'brookdale_us', 'coach_us', 'pjs_coffee_us', 'dairy_queen_us', 'ohio_cat_us', 'speedy_stop_us', 'state_farm_us', 'the_barre_code_us', 'george_webb_us', 'golub_corporation_us', 'sweetfrog_us', 'bravo_italian_us', 'tropical_smoothie_cafe_us', 'pure_barre_us', 'vestis_uniform_services_us', 'bealls_florida_us', 'new_hampshire_liquor_and_wine_us', 'touchstone_medical_imaging_us', 'leslies_poolmart_us', 'the_keg_ca_us', 'rouses_us', 'rodda_paint_us', 'ntb_us', 'chase_us', 'gigis_cupcakes_us', 'sals_pizza_us', 'caribou_coffee_us', 'dunkin_us', 'tgi_fridays_us', 'nandos_us', 'public_storage_us', 'biggby_coffee_us', 'afters_ice_cream_us', 'daylight_donuts_us', 'krispy_krunchy_chicken_us', 'kfc_us', 'carls_jr_us', 'bob_evans_us', 'laseraway_us', 'bath_and_body_works_us', 'jd_sports_us', 'baptist_health_arkansas_us', 'sutter_health_us', 'race_trac_us', 'pressed_us', 'loft_us', 'speedway_us', 'jacks_us', 'bremer_bank_us', 'travismathew_us', 'the_fresh_grocer_us', 'minnoco_us', 'lowes_us', 'publix_us', 'walmart_us', 'the_lash_lounge_us', 'kendra_scott_us', 'ted_baker_us', 'the_counter_us', 'terribles_us', 'bobs_burgers_us', 'pricerite_us', 'west_marine_us', 'kroger_us', ], 'ignored': [ - 'fresenius_kidney_care_us', 'amc_theatres_us', 'credit_union_us', 'travelcenters_of_america_us', 'kaiser_permanente_us', 'foodland_us', # brand field missing, not investigated why + 'fresenius_kidney_care_us', 'amc_theatres_us', 'credit_union_us', 'travelcenters_of_america_us', 'kaiser_permanente_us', 'foodland_us', # brand field missing, not investigated why ], }, 'United Kingdom': { @@ -378,7 +389,7 @@ def processing_plan(): 'country_code': 'se', 'accepted': ['hemmakvall_se', 'lidl_se', 'burger_king_se', 'systembolaget_se', 'bishops_arms_se', 'tele2_se', 'ingo_se', 'hemkop_se', 'circle_k_se', 'distriktstandvarden_se', 'nordea_se', 'coop_se', 'willys_se', 'seven_eleven_se', 'handelsbanken_se', 'ica_se', 'doz_apotek_se', 'stc_se'], 'ignored': [ - 'qstar_se', # brand is missing (why?) + 'qstar_se', # brand is missing (why?) 
] }, 'Austria': { @@ -410,11 +421,11 @@ def processing_plan(): 'country_code': 'it', 'accepted': ['gov_osservaprezzi_carburanti_it', 'gnp_it', 'gamestop_it', 'giunti_al_punto_it', 'todis_it', 'arcaplanet_it', 'aldi_sud_it', 'tigota_it', 'lidl_it', 'comet_it', 'burger_king_it', 'kocca_it', 'primark_it', 'stroili_oro_it', 'prix_quality_it', 'ewiva_it', 'carrefour_it', 'md_it', 'remax_it', 'europam_it', 'decathlon_it', 'conad_it', 'naturasi_it', 'brico_ok_it', 'kfc_it', 'vodafone_it', 'sigma_it', 'brico_io_it', 'risparmio_casa_it', 'yves_rocher_it'], 'ignored': [ - 'bricofer_it', # https://community.openstreetmap.org/t/italia-aggiunta-al-matcher-sperimentale-di-all-the-places/117208/8 + 'bricofer_it', # https://community.openstreetmap.org/t/italia-aggiunta-al-matcher-sperimentale-di-all-the-places/117208/8 ] }, 'Luxembourg': { - 'part_of': ["europe/luxembourg"], # deliberately not Europe, to alow small quick test for people new to this tool + 'part_of': ["europe/luxembourg"], # deliberately not Europe, to alow small quick test for people new to this tool 'bbox': [ # http://bboxfinder.com/#49.445807,5.718384,50.185472,6.536179 {'min_lat': 49.445807, 'min_lon': 5.718384, 'max_lat': 50.185472, 'max_lon': 6.536179}, @@ -425,6 +436,7 @@ def processing_plan(): }, } + """ 'Country': { 'part_of': ["europe"], @@ -437,16 +449,19 @@ def processing_plan(): }, """ + def good_match_distance_in_kilometers(): return 0.1 + def maximum_missing_shop_distance_in_kilometers(): return 0.9 + def missing_shop_distance_in_kilometers_for_specific_case(object_tags, spider_code=None): if spider_code == None: spider_code = object_tags["@spider"] - poor_location_data = ['poczta_polska_pl'] # detect it automatically if possible based on success matches + poor_location_data = ['poczta_polska_pl'] # detect it automatically if possible based on success matches if spider_code in poor_location_data: return maximum_missing_shop_distance_in_kilometers() if spider_code in ['allegro_one_box_pl', 'orlen_paczka_pl', 'paczkomat_inpost_pl']: @@ -508,11 +523,12 @@ def canonical_feature(object_tags): if key.startswith(prefix): main_key = key.replace(prefix, "") return main_key + "=" + object_tags[main_key] - return "?" # failed to detect any indicator + return "?" # failed to detect any indicator + def canonical_feature_obvious_matches(object_tags): if object_tags.get("shop") == "ice_cream" or object_tags.get("amenity") == "ice_cream": - return "ice_cream" # synonym + return "ice_cream" # synonym if object_tags.get("shop") in ["bakery", "pastry", "confectionery"]: return "bakery/bakery-like" if object_tags.get("amenity") in ["fast_food", "restaurant"]: @@ -521,7 +537,7 @@ def canonical_feature_obvious_matches(object_tags): return "alcohol_shop" if "shop" in object_tags: if object_tags["shop"] in ["convenience", "supermarket"]: - return "convenience/supermarket" # some are on edge of this two + return "convenience/supermarket" # some are on edge of this two if object_tags.get("amenity") == "vending_machine": if "vending" in object_tags: @@ -544,4 +560,4 @@ def canonical_feature_obvious_matches(object_tags): def show_info_about_spider_to_debug_it(atp_code): print("https://github.com/alltheplaces/alltheplaces/blob/master/locations/spiders/" + atp_code + ".py") - print(atp_unpacked_folder() + atp_code + '.geojson') # TODO can this assumption that atp_code matches filename be made here? + print(atp_unpacked_folder() + atp_code + '.geojson') # TODO can this assumption that atp_code matches filename be made here? 
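Note on the file above: most of the is_*_worth_mentioning() helpers in 0_config.py share one pattern, a hard-coded list of already-reported or deliberately skipped spider codes plus a date gate that keeps a report class silent until a chosen day. A minimal, hypothetical sketch of that pattern (the function and list names below are illustrative, not part of the repository):

import datetime

# spiders whose issues are already filed upstream or intentionally skipped (example values only)
SILENCED_SPIDERS = {'example_spider_pl'}

def is_report_worth_mentioning(atp_code, start_after=datetime.datetime(2024, 8, 10)):
    if atp_code in SILENCED_SPIDERS:
        return False  # known issue, no point in repeating the report
    # stay quiet until the review date passes, so earlier reports get processed first
    return datetime.datetime.now() >= start_after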
diff --git a/1_obtain_osm_data.py b/1_obtain_osm_data.py index cd30509..605176b 100644 --- a/1_obtain_osm_data.py +++ b/1_obtain_osm_data.py @@ -5,6 +5,7 @@ import shops import time config = __import__("0_config") + def main(): directory_path = config.cache_folder() @@ -22,10 +23,11 @@ def main(): print(region, "- list_shops - started") start = time.time() for entry in shops.osm.list_shops(region, directory_path): - pass # needed to trigger processing code + pass # needed to trigger processing code print((time.time() - start) / 60, "minutes") print(region, "- list_shops - completed") processed.append(region) + if __name__ == "__main__": main() diff --git a/2_obtain_atp_data.py b/2_obtain_atp_data.py index 6d5a5e8..1d8aad5 100644 --- a/2_obtain_atp_data.py +++ b/2_obtain_atp_data.py @@ -1,18 +1,20 @@ +import rich +import osm_bot_abstraction_layer.util_download_file +import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge import json import os import requests config = __import__("0_config") -import osm_bot_abstraction_layer.tag_knowledge as tag_knowledge -import osm_bot_abstraction_layer.util_download_file -import rich config = __import__("0_config") + def processed_atp_codes(): for area_name, area_data in config.processing_plan().items(): if 'accepted' in area_data: for atp_code in area_data['accepted']: yield atp_code + def main(): response = requests.get("https://data.alltheplaces.xyz/runs/latest.json") todos = json.loads(response.text) @@ -21,14 +23,15 @@ def main(): download_entire_atp_dataset(run_id) look_through_entire_atp_dataset() + def do_not_remind_that_this_tagging_may_be_worth_supporting(): notified_about_tag = { # should be fixed in ATP, if possible # TODO: raise isses at https://github.com/alltheplaces/alltheplaces/issues 'tourism': ['yes', 'attraction'], 'healthcare': [ - 'laboratory', # https://github.com/alltheplaces/alltheplaces/issues/8637 - 'centre', # not reported yet TODO + 'laboratory', # https://github.com/alltheplaces/alltheplaces/issues/8637 + 'centre', # not reported yet TODO ], # TODO maybe start including them? @@ -48,11 +51,11 @@ def do_not_remind_that_this_tagging_may_be_worth_supporting(): if key not in notified_about_tag: notified_about_tag[key] = [] - #missing shoplike + # missing shoplike notified_about_tag['office'].append('yes') notified_about_tag['amenity'].append('canteen') - #kind also shoplike? I want to support them + # kind also shoplike? 
I want to support them notified_about_tag['man_made'].append('charge_point') notified_about_tag['amenity'].append('music_venue') notified_about_tag['amenity'].append('prep_school') @@ -117,9 +120,10 @@ def do_not_remind_that_this_tagging_may_be_worth_supporting(): # seem hard to confirm by survey notified_about_tag['craft'].append('brewery') - notified_about_tag['amenity'].append('post_depot') # internal facility, not post_office + notified_about_tag['amenity'].append('post_depot') # internal facility, not post_office return notified_about_tag + def warn_about_broken_spider(atp_code, message): print() print() @@ -130,24 +134,27 @@ def warn_about_broken_spider(atp_code, message): print(url_log) print() + def maybe_warn_about_spider_with_empty_file(atp_code): if config.is_empty_file_for_spider_worth_mentioning(atp_code): warn_about_broken_spider(atp_code, "empty output") + def maybe_warn_about_spider_with_broken_file(atp_code): if config.is_broken_file_for_spider_worth_mentioning(atp_code): warn_about_broken_spider(atp_code, "broken output file") + def warn_about_new_tags_that_are_neither_shoplike_nor_ignored(entry, item_path): notified_about_tag = do_not_remind_that_this_tagging_may_be_worth_supporting() for key in tag_knowledge.typical_main_keys(): if key in entry['properties']: if ";" in entry['properties'][key]: - continue # TODO - is it safe to consider it as being unfinished? + continue # TODO - is it safe to consider it as being unfinished? if key == 'healthcare' and entry['properties'].get("amenity") == entry['properties'].get("healthcare"): continue if tag_knowledge.is_shoplike({key: entry['properties'][key]}) == True: - break # handles cases where healthcare is extra tag in addition to proper amenity + break # handles cases where healthcare is extra tag in addition to proper amenity if tag_knowledge.is_shoplike({key: entry['properties'][key]}) == False: if key not in notified_about_tag: notified_about_tag[key] = [] @@ -159,6 +166,7 @@ def warn_about_new_tags_that_are_neither_shoplike_nor_ignored(entry, item_path): print() notified_about_tag[key].append(entry['properties'][key]) + def all_spider_codes_iterator(): directory_path_with_unpacked_spider_data = config.atp_unpacked_folder() # TODO: Is there match between spider codes and their filenames? 
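Note on the hunks above: the notified_about_tag mapping built by do_not_remind_that_this_tagging_may_be_worth_supporting() is a plain dict from an OSM key to the values that should not be reported again, and warn_about_new_tags_that_are_neither_shoplike_nor_ignored() appends to it after printing a warning. A self-contained, hypothetical illustration of that lookup (the helper name is made up; the sample values are taken from the function above):

notified_about_tag = {
    'tourism': ['yes', 'attraction'],
    'healthcare': ['laboratory', 'centre'],
}

def was_already_notified(key, value):
    # True when this key=value pair is on the "do not remind again" list
    return value in notified_about_tag.get(key, [])

assert was_already_notified('tourism', 'attraction')
assert not was_already_notified('shop', 'supermarket')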
@@ -180,6 +188,7 @@ def download_entire_atp_dataset(run_id): osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, config.atp_cache_folder(), filename) os.system('unzip "' + config.atp_cache_folder() + filename + '" -d "' + config.atp_cache_folder() + '"') + def look_through_entire_atp_dataset(): candidates = {} for _area_name, area_data in config.processing_plan().items(): @@ -200,10 +209,10 @@ def look_through_entire_atp_dataset(): continue warn_about_new_tags_that_are_neither_shoplike_nor_ignored(entry, item_path) if atp_code in [ - 'hyatt', # https://github.com/alltheplaces/alltheplaces/issues/9399 - 'maserati', # has many actually empty entries - 'skoda', # limitations of source data, unfixable by ATP - 'general_logistics_systems_de', # missing data to provide POI data, see https://github.com/alltheplaces/alltheplaces/commit/89f5511bacf24f2d6d0a1c2a183130c9148f772a + 'hyatt', # https://github.com/alltheplaces/alltheplaces/issues/9399 + 'maserati', # has many actually empty entries + 'skoda', # limitations of source data, unfixable by ATP + 'general_logistics_systems_de', # missing data to provide POI data, see https://github.com/alltheplaces/alltheplaces/commit/89f5511bacf24f2d6d0a1c2a183130c9148f772a ]: break if config.canonical_feature(entry['properties']) == "?": @@ -226,11 +235,12 @@ def look_through_entire_atp_dataset(): print() print("candidate") print(item_path) - if atp_code not in candidates[area_data['country_code']]: # applies when one spider has many main keys + if atp_code not in candidates[area_data['country_code']]: # applies when one spider has many main keys candidates[area_data['country_code']].append(atp_code) print(key, "=", entry['properties'][key]) print(candidates) + def download(code, run_id): script_location = os.path.abspath(__file__) directory_path = config.cache_folder() @@ -238,5 +248,6 @@ def download(code, run_id): filename = code + ".atp.geojson" osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(download_url, directory_path, filename) + if __name__ == "__main__": main() diff --git a/3_matcher.py b/3_matcher.py index 24f18e1..fe83b8c 100644 --- a/3_matcher.py +++ b/3_matcher.py @@ -52,6 +52,7 @@ def is_matching_any_name_part_to_osm_tags(name_part_list, osm_tags): return True return False + def matching_name_part(part, namelike_value): part = part.lower() namelike_value = namelike_value.lower() @@ -63,6 +64,7 @@ def matching_name_part(part, namelike_value): else: return True + def filter_with_fuzzy_name_match(osm_data, name_part_list): returned = [] for osm in osm_data: @@ -70,6 +72,7 @@ def filter_with_fuzzy_name_match(osm_data, name_part_list): returned.append(osm) return returned + def get_filter_names_from_atp_dataset(current_atp): filter_names = [] for atp in current_atp: @@ -79,12 +82,13 @@ def get_filter_names_from_atp_dataset(current_atp): name_sources.append(short_name) for name in name_sources: for part in name.split(): - if part.lower() in ["kebab", "kebap", "apteka", "cukiernia", "pizzeria", "na"]: # common and shared - automate detection? + if part.lower() in ["kebab", "kebap", "apteka", "cukiernia", "pizzeria", "na"]: # common and shared - automate detection? 
continue if part not in filter_names: filter_names.append(part) return filter_names + def run_match(osm_data, atp_code): output_file = "build_temporary_files/" + atp_code + '.csv' atp_data = load_atp(atp_code) @@ -93,11 +97,12 @@ def run_match(osm_data, atp_code): matches = get_matches(osm_data, atp_data) serializing.save_list_of_matches_to_csv(output_file, matches) + def get_matches(osm_data, atp_data): match_list = [] filter_names = get_filter_names_from_atp_dataset(atp_data) - #TODO: get also misspellings - #TODO: handle nearby objects with matching feature type or vacant ones + # TODO: get also misspellings + # TODO: handle nearby objects with matching feature type or vacant ones filtered_osm = filter_with_fuzzy_name_match(osm_data, filter_names) osm_index = spatial_index.SpatialIndex(filtered_osm) print("filtering", len(osm_data), "to", len(filtered_osm), "candidates based on names is done, now checking", len(atp_data), "ATP candidates by distance") @@ -145,6 +150,7 @@ def get_matches(osm_data, atp_data): raise return match_list + def passed_filter(osm_data_tag_filter, tags): for key in osm_data_tag_filter.keys(): if osm_data_tag_filter[key] == None: @@ -173,6 +179,7 @@ def load_atp(atp_code): # no need to report also here, so lets fail silently return [] + def load_atp_from_json(data, atp_code): returned = [] for entry in data['features']: diff --git a/4_show_data.py b/4_show_data.py index 9a45cb6..4d7339a 100644 --- a/4_show_data.py +++ b/4_show_data.py @@ -18,6 +18,7 @@ import distance_distribution config = __import__("0_config") + def get_free_space_in_mb(path): total, used, free = shutil.disk_usage(path) return free / 1024 / 1024 @@ -54,6 +55,7 @@ def generate_report(cache_only): copy_data_for_publication(all_atp_codes) publish_data_on_internet() + def generate_bot_edit_list_page(): with open("output/bot_edit_plan_add_tags.html", 'w') as outfile: outfile.write(html_bot_edit_prefix()) @@ -75,6 +77,7 @@ def generate_bot_edit_list_page(): outfile.write('<li><a target="_blank" href="' + osm_link + '">' + osm_link + '</a>' + " <code>" + escape_html(key) + "=" + value + "</code></li>\n") outfile.write(html_bot_edit_suffix()) + def contact_method(): return """Please <a target="_blank" href="https://codeberg.org/matkoniecz/improving_openstreetmap_using_alltheplaces_dataset/issues">create an issue</a> or <a target="_blank" href="https://www.openstreetmap.org/message/new/Mateusz%20Konieczny">send me an OSM private message</a> if you see a potential for improvements. If potential improvements are in All The Places - better to create PR or issue there. If unsure, please write to me. 
If you see this data being misued and causing harm (for example, imported without consulting community or ignoring their feedback) - please write to me and I will help with cleanups, including reverts and reconsider how this data is publish.""" @@ -109,6 +112,7 @@ def html_bot_edit_prefix(): <h2>Edit ideas listing</h2> """ + def html_bot_edit_suffix(): return """</section> </body> @@ -121,12 +125,13 @@ def clear_output_files(folder): if os.path.isfile(file_path): os.remove(file_path) + def produce_map_analysis_for_atp_code(atp_code, cache_only, url_checker_instance, report_generators): csv_filepath = "build_temporary_files/" + atp_code + '.csv' if os.path.isfile(csv_filepath) == False: report_generators[atp_code] = { 'atp_file_is_broken': True - } + } return report_generators match_list = serializing.load_list_of_matches_from_csv(csv_filepath) @@ -142,12 +147,13 @@ def produce_map_analysis_for_atp_code(atp_code, cache_only, url_checker_instance processed = qa.remove_bad_data(entry.atp_tags, atp_code) if processed == None: continue - entry.atp_tags = processed # TODO is it happening as data was passed to qa function + entry.atp_tags = processed # TODO is it happening as data was passed to qa function rebuild_match_list.append(entry) report_generators[atp_code] = produce_map_analysis_for_atp_data(atp_code, area_name="", match_list=rebuild_match_list, cache_only=cache_only, url_checker_instance=url_checker_instance) return report_generators + class MismatchingNameReportCreator: def __init__(self, atp_code, area_name): self.atp_code = atp_code @@ -227,9 +233,9 @@ class MismatchingNameReportCreator: def table_of_contents(self): return [ { - 'header': "Name mismatch", - 'section_link': section_link('name mismatch between OSM and ATP', len(self.completely_mismatching_names), self.report_filename()), - 'output_files': [self.report_filename()] + 'header': "Name mismatch", + 'section_link': section_link('name mismatch between OSM and ATP', len(self.completely_mismatching_names), self.report_filename()), + 'output_files': [self.report_filename()] }, ] @@ -247,6 +253,7 @@ class MismatchingNameReportCreator: #outfile.write(leafleter.generator.get_line(missing.atp_center['lat'], missing.atp_center['lon'], missing.osm_match_center['lat'], missing.osm_match_center['lon'], color = 'blue')) outfile.write(leafleter.generator.get_html_page_suffix()) + class ATPGivesTagsReportCreator: def __init__(self, url_checker_instance, atp_code, area_name): self.atp_code = atp_code @@ -281,7 +288,7 @@ class ATPGivesTagsReportCreator: writer = csv.writer(outfile) for key in self.importable_keys: if self.count_of_total_tag_mismatches[key] == 0: - for entry in self.shops_with_tags_to_be_added: # already Nominatim-filtered + for entry in self.shops_with_tags_to_be_added: # already Nominatim-filtered if key in entry['tags_to_be_added']: value = entry['entry'].atp_tags[key] atp_code = entry['entry'].atp_tags['@spider'] @@ -344,7 +351,7 @@ class ATPGivesTagsReportCreator: return False returned = self.url_checker.is_website_eligible(atp, cache_only) if returned == None: - return False # not cached, instructed to use only cache + return False # not cached, instructed to use only cache if returned: if atp.atp_tags.get(tested_key) != atp.osm_match_tags.get(tested_key): return True @@ -356,7 +363,7 @@ class ATPGivesTagsReportCreator: # pointing to the main brand page pass elif self.url_checker.is_difference_limited_to_slash_at_end(atp_value, osm_value): - pass # effectively the same anyway, no real mismatch + pass # effectively the 
same anyway, no real mismatch else: self.mismatching_website_tags.append(atp) self.report_mismatch(atp, tested_key) @@ -402,7 +409,7 @@ class ATPGivesTagsReportCreator: new_tags[key] = entry['entry'].atp_tags[key] message += tag_list_to_html(new_tags) outfile.write(leafleter.generator.get_marker(message, entry['entry'].atp_center['lat'], entry['entry'].atp_center['lon'], color='green')) - outfile.write(leafleter.generator.get_line(entry['entry'].atp_center['lat'], entry['entry'].atp_center['lon'], entry['entry'].osm_match_center['lat'], entry['entry'].osm_match_center['lon'], color = 'green')) + outfile.write(leafleter.generator.get_line(entry['entry'].atp_center['lat'], entry['entry'].atp_center['lon'], entry['entry'].osm_match_center['lat'], entry['entry'].osm_match_center['lon'], color='green')) outfile.write(leafleter.generator.get_html_page_suffix()) def generate_mismatching_website_listing(self): @@ -422,9 +429,10 @@ class ATPGivesTagsReportCreator: summary += 'tag list as suggested by ATP (should not be assumed to be directly usable in OSM):<br><br>' summary += tag_list_to_html(bad.atp_tags) outfile.write(leafleter.generator.get_marker(summary, bad.atp_center['lat'], bad.atp_center['lon'], color='red')) - outfile.write(leafleter.generator.get_line(bad.atp_center['lat'], bad.atp_center['lon'], bad.osm_match_center['lat'], bad.osm_match_center['lon'], color = 'red')) + outfile.write(leafleter.generator.get_line(bad.atp_center['lat'], bad.atp_center['lon'], bad.osm_match_center['lat'], bad.osm_match_center['lon'], color='red')) outfile.write(leafleter.generator.get_html_page_suffix()) + class MissingObjectsReportCreator: def __init__(self, atp_code, area_name): self.area_name = area_name @@ -488,6 +496,7 @@ class MissingObjectsReportCreator: bad_tags_skipped.append(atp) json.dump(serializing.generate_geojson_structure(bad_tags_skipped), f) + class NominatimMismatchReportCreator: def __init__(self, atp_code, area_name): self.area_name = area_name @@ -546,9 +555,10 @@ class NominatimMismatchReportCreator: summary += tag_list_to_html(missing.atp_tags) outfile.write(leafleter.generator.get_marker(summary, missing.atp_center['lat'], missing.atp_center['lon'], color='red')) location_from_nominatim = nominatim.location_given_tags(missing.atp_tags, debug_identifier=self.atp_code)[0] - outfile.write(leafleter.generator.get_line(missing.atp_center['lat'], missing.atp_center['lon'], location_from_nominatim['lat'], location_from_nominatim['lon'], color = 'red')) + outfile.write(leafleter.generator.get_line(missing.atp_center['lat'], missing.atp_center['lon'], location_from_nominatim['lat'], location_from_nominatim['lon'], color='red')) outfile.write(leafleter.generator.get_html_page_suffix()) + """ class MismatchingNameReportCreator: def __init__(self, atp_code): @@ -577,6 +587,8 @@ add to test_display_website """ # TODO: passing atp_code should not be needed # TODO: save files one level higher, here just produce analysis + + def produce_map_analysis_for_atp_data(atp_code, area_name, match_list, cache_only, url_checker_instance): missing_objects_report = MissingObjectsReportCreator(atp_code, area_name) mismatching_name_report = MismatchingNameReportCreator(atp_code, area_name) @@ -613,7 +625,7 @@ def produce_map_analysis_for_atp_data(atp_code, area_name, match_list, cache_onl conflict_between_atp_and_nominatim_report.register_case_using_known_nominatim_status(atp, status) elif atp.match_distance == None or atp.match_distance > 
config.missing_shop_distance_in_kilometers_for_specific_case(atp.atp_tags): nominatim_match = nominatim.is_location_matching_tags(atp.atp_tags, atp.atp_center, cache_only=cache_only, spider=atp_code) - if nominatim_match != False: # both matches, failed geolocation and geolocation not done at all go here + if nominatim_match != False: # both matches, failed geolocation and geolocation not done at all go here missing_objects_report.check_case(atp) conflict_between_atp_and_nominatim_report.register_case_using_known_nominatim_status(atp, nominatim_match) else: @@ -635,7 +647,8 @@ def produce_map_analysis_for_atp_data(atp_code, area_name, match_list, cache_onl 'missing_objects_report': missing_objects_report, 'conflict_between_atp_and_nominatim_report': conflict_between_atp_and_nominatim_report, 'total_atp_entries': len(match_list), - } + } + def format_for_geojson_export(dataset): for entry in dataset: @@ -644,6 +657,7 @@ def format_for_geojson_export(dataset): del atp.atp_tags[key] return dataset + def get_center(dataset): max_lat = -90 max_lon = -180 @@ -671,6 +685,7 @@ def sidebar_content(page_specific_info, atp_code): sidebar += '<br><br>\n<a href="https://github.com/alltheplaces/alltheplaces/blob/master/locations/spiders/' + atp_code + '.py" target="_blank">atp source code</a>' return sidebar + def tag_list_to_html(tags): returned = "" normal_tags = "" @@ -691,21 +706,26 @@ def tag_list_to_html(tags): returned += "<br><br>tags present in ATP, very likely not usable directly in OSM<br>" + dropped_tags return returned + def htmlify_key_value_pair(key, value): return key + " = " + htmlify_value(key, value) + "<br>" + def htmlify_value(key, value): value = escape_html(value) if key == "website" or (key == "image" and value.find("http") == 0): value = '<a href="' + value + '">' + value + "</a>" return value + def escape_url(value): return str(value).replace('"', '%22').replace("'", "%27") + def escape_html(value): return html.escape(value).replace("\r\n", "<br>").replace("\n", "<br>") + def headers(): # TODO: pass it smarter in config (list of main report creators?) 
# or at least make it static method @@ -718,6 +738,7 @@ def headers(): NominatimMismatchReportCreator('dummy', 'dummy area name').table_of_contents()[0]['header'], ] + def generate_website_index_listing_by_country(report_generators, released_codes_by_region, partial=False): with open("output/index.html", 'w') as outfile: outfile.write(html_prefix()) @@ -732,6 +753,7 @@ def generate_website_index_listing_by_country(report_generators, released_codes_ outfile.write(table_with_spider_overview(atp_codes, report_generators, partial)) outfile.write(html_suffix()) + def generate_website_index_for_named_area(report_generators, area_name, partial=False): with open("output/" + area_name + "_index.html", 'w') as outfile: outfile.write(html_prefix()) @@ -739,6 +761,7 @@ def generate_website_index_for_named_area(report_generators, area_name, partial= outfile.write(table_with_spider_overview(report_generators.keys(), report_generators, partial)) outfile.write(html_suffix()) + def table_with_spider_overview(atp_codes, report_generators, partial): returned = "" returned += '<table class="statistics-summary"><thead><tr><th>' + '</th><th>'.join(headers()) + '</th></tr></thead>\n' @@ -756,8 +779,9 @@ def table_with_spider_overview(atp_codes, report_generators, partial): returned += "no entries shown in this area\n" return returned + def table_row(atp_code, statistics): - if statistics['missing_objects_report'] == None: #TODO test is it working + if statistics['missing_objects_report'] == None: # TODO test is it working return '<tr><th></th><td colspan="5">Data missing</td></tr>' missing_section = statistics['missing_objects_report'].table_of_contents()[0]['section_link'] @@ -772,11 +796,13 @@ def table_row(atp_code, statistics): return '<tr><th>' + atp_code + '</th><td>' + missing_section + '</td><td>' + mismatching_names_section + '</td><td>' + tags_section + '</td><td>' + website_mismatch_section + '</td><td>' + mismatch_section + not_attempted + '</td></tr>' + def section_link(description, count, page): if count == 0: return '<span class=less-visible title="' + description + '">' + str(count) + '</span>' return '<a href="' + page + '" title="' + description + '">' + str(count) + '</a>' + def contact_method(): return """Please <a href="https://codeberg.org/matkoniecz/list_how_openstreetmap_can_be_improved_with_alltheplaces_data/issues">create an issue</a> or <a href="https://www.openstreetmap.org/message/new/Mateusz%20Konieczny">send me an OSM private message</a> if you see a potential for improvements. If potential improvements are in All The Places - better to create PR or issue <a href="https://github.com/alltheplaces/alltheplaces">there</a>. 
If unsure, please write to me.""" @@ -819,11 +845,13 @@ def html_prefix(): </p> """ + def html_suffix(): return """<hr><br>Published on <a href="https://matkoniecz.codeberg.page/improving_openstreetmap_using_alltheplaces_dataset/">https://matkoniecz.codeberg.page/improving_openstreetmap_using_alltheplaces_dataset/</a> - generated on """ + f'{datetime.datetime.now():%Y-%m-%d %H:%M:%S%z}' + """ (note that ATP and OSM data used here may be older) </section> </body> </html>""" + def iterate_over_output_files(atp_code): reports = [ MissingObjectsReportCreator(atp_code, 'dummy area name'), @@ -836,6 +864,7 @@ def iterate_over_output_files(atp_code): for file in entry['output_files']: yield file + def copy_data_for_publication(all_atp_codes): for atp_code in all_atp_codes: if get_free_space_in_mb('../public_website_with_output') < 400: @@ -850,8 +879,10 @@ def copy_data_for_publication(all_atp_codes): os.system("cp output/index.html ../public_website_with_output/index.html") # published on https://matkoniecz.codeberg.page/improving_openstreetmap_using_alltheplaces_dataset/ + def publish_data_on_internet(): os.system('cd ../public_website_with_output && git add . && git commit -m "automatic update" && git push') + if __name__ == "__main__": main() diff --git a/5_generate_organic_map_bookmarks.py b/5_generate_organic_map_bookmarks.py index 7c9eaaa..89abec3 100644 --- a/5_generate_organic_map_bookmarks.py +++ b/5_generate_organic_map_bookmarks.py @@ -10,20 +10,25 @@ import shared obtain_atp_data = __import__("2_obtain_atp_data") config = __import__("0_config") + def is_in_this_area(area, atp): if atp.atp_center['lat'] > area['min_lat'] and atp.atp_center['lat'] < area['max_lat']: if atp.atp_center['lon'] > area['min_lon'] and atp.atp_center['lon'] < area['max_lon']: return True return False + def areas(): return { 'kraków': {'min_lat': 50, 'min_lon': 19.5, 'max_lat': 50.3, 'max_lon': 20.5}, # http://bboxfinder.com/#52.383301,16.885986,52.436182,17.044859 'poznań': {'min_lat': 52.383301, 'min_lon': 16.885986, 'max_lat': 52.436182, 'max_lon': 17.044859}, - } + } + general_statistics = {} + + def main(): for atp_code, _item_path in obtain_atp_data.all_spider_codes_iterator(): print(atp_code) @@ -55,6 +60,7 @@ def save_files(data, name): with open(name + '_missing.kml', 'w') as f: f.write(serializing.generate_kml_text(data)) + def clear_output_files(folder): for filename in os.listdir(folder): file_path = os.path.join(folder, filename) @@ -71,4 +77,5 @@ def generate_missing_shop_listing(atp_code, apparently_missing_shops): osm_location_link = shared.link_to_point_in_osm(missing.atp_center['lat'], missing.atp_center['lon']) summary = 'here ATP shows object being present, which seems not mapped in OpenStreetMap (<a href="' + osm_location_link + '">location</a>):<br><br>' + main() diff --git a/6_experimental_graticule_splitter.py b/6_experimental_graticule_splitter.py index 88cd982..40c723e 100644 --- a/6_experimental_graticule_splitter.py +++ b/6_experimental_graticule_splitter.py @@ -23,11 +23,13 @@ def graticule_id(lat, lon, lat_span, lon_span, margin_in_kilometers): # filter data for each # filter data for each in constant time (just check is given location within graticule range) + def main(): check_is_any_graticule_having_margin_greater_than_entire_graticule() generate_test_graticule_coverage_map() test_area_run() + def generate_test_graticule_coverage_map(): graticule_anchor_coverage = {'min_lat': 49, 'min_lon': 14, 'max_lat': 54, 'max_lon': 24} with open("test_coverage_graticule_display.html", 'w') 
as outfile: @@ -37,9 +39,10 @@ def generate_test_graticule_coverage_map(): for lat_anchor in range(graticule_anchor_coverage['min_lat'], graticule_anchor_coverage['max_lat'] + 1): for lon_anchor in range(graticule_anchor_coverage['min_lon'], graticule_anchor_coverage['max_lon'] + 1): shape = [[lat_anchor + 1, lon_anchor + 1], [lat_anchor + 1, lon_anchor], [lat_anchor, lon_anchor], [lat_anchor, lon_anchor + 1], [lat_anchor + 1, lon_anchor + 1]] - outfile.write(leafleter.generator.get_polygon(shape, color = "green", fill_color = "green", link = "https://pl.wikipedia.org/wiki/Pozna%C5%84")) + outfile.write(leafleter.generator.get_polygon(shape, color="green", fill_color="green", link="https://pl.wikipedia.org/wiki/Pozna%C5%84")) outfile.write(leafleter.generator.get_html_page_suffix()) + def test_area_run(): # http://bboxfinder.com/#52.383301,16.885986,52.436182,17.044859 poznań = {'min_lat': 52.383301, 'min_lon': 16.885986, 'max_lat': 52.436182, 'max_lon': 17.044859, 'name': 'Poznań'} @@ -57,7 +60,7 @@ def test_area_run(): outfile.write(leafleter.generator.get_html_page_prefix("website title", (area['max_lat'] + area['min_lat'])/2, (area['max_lon'] + area['min_lon'])/2)) #outfile.write(leafleter.generator.get_marker("text", 50.06, 19.93)) shape = [[area['max_lat'], area['max_lon']], [area['max_lat'], area['min_lon']], [area['min_lat'], area['min_lon']], [area['min_lat'], area['max_lon']], [area['max_lat'], area['max_lon']]] - outfile.write(leafleter.generator.get_polygon(shape, color = "green", fill_color = "green", link = "https://pl.wikipedia.org/wiki/Pozna%C5%84")) + outfile.write(leafleter.generator.get_polygon(shape, color="green", fill_color="green", link="https://pl.wikipedia.org/wiki/Pozna%C5%84")) outfile.write(leafleter.generator.get_html_page_suffix()) atp_data_by_spider = {} @@ -72,13 +75,12 @@ def test_area_run(): if len(gathered) > 0: atp_data_by_spider[atp_code] = gathered - general_area = "europe/poland" osm_data = [] for entry in matcher.load_geofabrik(general_area, config.cache_folder()): - if entry['center']['lat'] > area['min_lat'] and entry['center']['lat'] < area['max_lat']: - if entry['center']['lon'] > area['min_lon'] and entry['center']['lon'] < area['max_lon']: - osm_data.append(entry) + if entry['center']['lat'] > area['min_lat'] and entry['center']['lat'] < area['max_lat']: + if entry['center']['lon'] > area['min_lon'] and entry['center']['lon'] < area['max_lon']: + osm_data.append(entry) print(len(atp_data_by_spider)) print(len(osm_data)) @@ -100,6 +102,7 @@ def test_area_run(): print(output_file) + def check_is_any_graticule_having_margin_greater_than_entire_graticule(): for lat in range(-89, 89): for lon in range(-180, 180): @@ -110,17 +113,17 @@ def check_is_any_graticule_having_margin_greater_than_entire_graticule(): distance_for_lat_degree_alt = shared.calculate_distance( {'lat': tested_location['lat'] + 1, 'lon': tested_location['lon'] + 1}, {'lat': tested_location['lat'] + 2, 'lon': tested_location['lon'] + 1} - ) + ) distance_for_lat_degree_alt_alt = shared.calculate_distance( {'lat': tested_location['lat'] + 1, 'lon': tested_location['lon']}, {'lat': tested_location['lat'] + 2, 'lon': tested_location['lon']} - ) + ) print("expected zero, maybe espilon changes", distance_for_lat_degree_alt - distance_for_lat_degree) print("expected zero, maybe espilon changes", distance_for_lat_degree_alt_alt - distance_for_lat_degree) distance_for_lon_degree_alt = shared.calculate_distance( {'lat': tested_location['lat'] + 1, 'lon': tested_location['lon'] + 1}, {'lat': 
tested_location['lat'] + 1, 'lon': tested_location['lon'] + 1} - ) + ) print("expected meaningful changes", distance_for_lon_degree_alt - distance_for_lon_degree) margin_in_kilometers = config.maximum_missing_shop_distance_in_kilometers() @@ -133,4 +136,5 @@ def check_is_any_graticule_having_margin_greater_than_entire_graticule(): raise break + main() diff --git a/7_experimental_taginfo_tag_lister.py b/7_experimental_taginfo_tag_lister.py index 723c41b..975dc32 100644 --- a/7_experimental_taginfo_tag_lister.py +++ b/7_experimental_taginfo_tag_lister.py @@ -33,7 +33,7 @@ def main(): except FileNotFoundError as e: print(e) pass - #TODO skip freeform/valid ones + # TODO skip freeform/valid ones for key, values in used_tags.items(): if tag_knowledge.is_freeform_key(key): print(key, "=", "*") diff --git a/distance_distribution.py b/distance_distribution.py index fea3964..d6735e1 100644 --- a/distance_distribution.py +++ b/distance_distribution.py @@ -1,6 +1,7 @@ import matplotlib.pyplot as plt import os + class MatchDistanceDestributionReportCreator: def __init__(self, identifier, area_name): self.identifier = identifier @@ -29,7 +30,7 @@ class MatchDistanceDestributionReportCreator: plt.rcParams["figure.figsize"] = [10, 10] # https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html # see 02 file for more investigation - plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice + plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice plt.grid(True) plt.clf() plt.xlim(0, 1200) @@ -39,7 +40,7 @@ class MatchDistanceDestributionReportCreator: plt.rcParams["figure.figsize"] = [10, 10] # https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html # see 02 file for more investigation - plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice + plt.style.use('fivethirtyeight') # affects all charts, 'seaborn-v0_8-whitegrid' is also nice plt.grid(True) plt.clf() plt.xlim(0, 300) diff --git a/link_scan_worker.py b/link_scan_worker.py index d6e856e..912ba2e 100644 --- a/link_scan_worker.py +++ b/link_scan_worker.py @@ -4,6 +4,7 @@ import datetime import time config = __import__("0_config") + def scan_eligible(grab_bag, scanner): while True: any_scanned = False @@ -18,6 +19,7 @@ def scan_eligible(grab_bag, scanner): if any_scanned == False: return + def main(): wait_between_the_same_domain_minutes = 5 grab_bag = {} @@ -38,5 +40,6 @@ def main(): scan_eligible(grab_bag, scanner) time.sleep(10) + if __name__ == "__main__": main() diff --git a/nominatim.py b/nominatim.py index 0dbb60a..c796c82 100644 --- a/nominatim.py +++ b/nominatim.py @@ -6,24 +6,27 @@ import re import shutil config = __import__("0_config") + def cache_path(): return 'nominatim_cache' + # Initialize disk cache nominatim_cache = diskcache.Cache(cache_path()) + def drop_extra_detail_blocking_nominatim(value): # patch nominatim bug where inclusion of apartment code breaks search # https://github.com/osm-search/Nominatim/issues/145#issuecomment-2143549199 # see https://pythex.org/ for testing - value = re.sub(r'/\d+([a-zA-Z])?', '', value) # turns 178/12 into 178 - value = re.sub(r'(,|, |)lok\..*', '', value, flags=re.IGNORECASE) # "lokal" is Polish for "unit" - value = re.sub(r'(,|, |)LOK .*', '', value) # "lokal" is Polish for "unit" - value = re.sub(r'(,|, |)lokal .*', '', value) # "lokal" is Polish for "unit" - value = re.sub(r'(,|, |)lok .*', '', value) # "lokal" is Polish for "unit" - value = 
re.sub(r'(,|, |)lok.*', '', value) # "lokal" is Polish for "unit" - value = re.sub(r'(,|, |)LU.*', '', value) # "lokal użytkowy" is Polish legalese for "unit" - value = re.sub(r'(,|, |)Lu.*', '', value) # "lokal użytkowy" is Polish legalese for "unit" + value = re.sub(r'/\d+([a-zA-Z])?', '', value) # turns 178/12 into 178 + value = re.sub(r'(,|, |)lok\..*', '', value, flags=re.IGNORECASE) # "lokal" is Polish for "unit" + value = re.sub(r'(,|, |)LOK .*', '', value) # "lokal" is Polish for "unit" + value = re.sub(r'(,|, |)lokal .*', '', value) # "lokal" is Polish for "unit" + value = re.sub(r'(,|, |)lok .*', '', value) # "lokal" is Polish for "unit" + value = re.sub(r'(,|, |)lok.*', '', value) # "lokal" is Polish for "unit" + value = re.sub(r'(,|, |)LU.*', '', value) # "lokal użytkowy" is Polish legalese for "unit" + value = re.sub(r'(,|, |)Lu.*', '', value) # "lokal użytkowy" is Polish legalese for "unit" value = re.sub(r'(,|, |)suite .*', '', value, flags=re.IGNORECASE) @@ -34,6 +37,7 @@ def drop_extra_detail_blocking_nominatim(value): value = re.sub(r'(,|, |)unit .*', '', value, flags=re.IGNORECASE) return value + def nominatim_queries(tags, debug=False): address_tag_groups = [ ['addr:country', 'addr:city', 'addr:street', 'addr:housenumber'], @@ -67,7 +71,7 @@ def nominatim_queries(tags, debug=False): if key in ["addr:street_address", 'addr:street', 'addr:full']: # see https://github.com/osm-search/Nominatim/issues/87 value = re.sub(r'ul\. ?', '', value, flags=re.IGNORECASE) - value = re.sub(r'( |$)ul ', ' ', value, flags=re.IGNORECASE) # "ul Żabia" + value = re.sub(r'( |$)ul ', ' ', value, flags=re.IGNORECASE) # "ul Żabia" if key in ["addr:street_address", 'addr:full']: value = drop_extra_detail_blocking_nominatim(value) query += value @@ -76,6 +80,7 @@ def nominatim_queries(tags, debug=False): print(group) yield query + def location_given_tags_cache_only(tags): """ True: matches @@ -87,21 +92,22 @@ def location_given_tags_cache_only(tags): if query not in nominatim_cache: with open(config.nominatim_requests_missing_from_cache(), 'a') as outfile: outfile.write(query+"\n") - return -1 # maybe transformed query would give better result? - # should not check further ones + return -1 # maybe transformed query would give better result? 
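# A minimal sketch of the address clean-up that drop_extra_detail_blocking_nominatim (above)
# performs before querying Nominatim; the sample street and unit number are made up, and the
# real function applies many more patterns (LOK/LU/suite/unit and friends).
import re

sample = "Kwiatowa 178/12 lok. 5"
cleaned = re.sub(r'/\d+([a-zA-Z])?', '', sample)  # drop the flat number: "Kwiatowa 178 lok. 5"
cleaned = re.sub(r'(,|, |)lok\..*', '', cleaned, flags=re.IGNORECASE)  # drop the unit suffix
print(cleaned.strip())  # prints "Kwiatowa 178", which Nominatim geocodes more reliably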
+ # should not check further ones else: response = nominatim_cache[query] if len(response) >= 1: return response return None + def location_given_tags(tags, debug_identifier): for query in nominatim_queries(tags): response = query_nominatim(query) if len(response) >= 1: return response - atp_code = debug_identifier # TODO handle this + atp_code = debug_identifier # TODO handle this if config.is_failed_geocoding_worth_mentioning(atp_code): print() print() @@ -119,6 +125,7 @@ def location_given_tags(tags, debug_identifier): print() return None + def is_location_matching_tags(tags, center, spider, cache_only=False): """ True: matches @@ -135,6 +142,7 @@ def is_location_matching_tags(tags, center, spider, cache_only=False): return response return are_locations_matching(tags, response[0], center) + def are_locations_matching(tags, location, center): distance = shared.calculate_distance(center, location) if distance > config.missing_shop_distance_in_kilometers_for_specific_case(tags): @@ -142,10 +150,12 @@ def are_locations_matching(tags, location, center): else: return True + def get_free_space_in_mb(path): total, used, free = shutil.disk_usage(path) return free / 1024 / 1024 + def query_nominatim(query): # Check if the response is in the cache if query in nominatim_cache: @@ -221,6 +231,7 @@ def query_nominatim(query): else: response.raise_for_status() + # Example usage # gptchat generated if __name__ == '__main__': diff --git a/nominatim_worker.py b/nominatim_worker.py index 72199c0..d904980 100644 --- a/nominatim_worker.py +++ b/nominatim_worker.py @@ -1,10 +1,12 @@ import nominatim config = __import__("0_config") + def main(): with open(config.nominatim_requests_missing_from_cache()) as fp: for query in fp: nominatim.query_nominatim(query.strip()) + if __name__ == "__main__": main() diff --git a/qa.py b/qa.py index 4adc630..9445917 100644 --- a/qa.py +++ b/qa.py @@ -1,8 +1,9 @@ -config = __import__("0_config") -import shops -import rich -import phonenumbers import datetime +import phonenumbers +import rich +import shops +config = __import__("0_config") + def remove_bad_data(data, atp_code): """ @@ -94,6 +95,7 @@ def remove_bad_data(data, atp_code): del data[key] return data + def is_empty_value(key, value, atp_code): if value.lower() in ["undefined", "b/n", "---", "none", "n/a"]: if config.is_null_specified_as_text_worth_mentioning(atp_code): @@ -114,11 +116,12 @@ def is_empty_value(key, value, atp_code): return True return False + def handle_ref_tag(data, atp_code): if atp_code in ['paczkomat_inpost_pl', 'allegro_one_box_pl']: - return data # actual ref + return data # actual ref elif atp_code in ['credit_agricole_pl']: - del data["ref"] # synthethic ref created by ATP + del data["ref"] # synthethic ref created by ATP elif "ref" in data: # https://github.com/alltheplaces/alltheplaces/blob/master/DATA_FORMAT.md describe `ref` and I am a bit confused # > A unique identifier for this feature inside this spider. The code that generates the output will remove duplicates based on the value of this key. 
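# A minimal sketch of the placeholder filtering that is_empty_value (above) enables: ATP
# sometimes exports literal "no data" markers, and those should never become OSM tag values.
# The helper name and sample dictionary are illustrative only; the real code additionally
# reports some spiders via config.is_null_specified_as_text_worth_mentioning.
PLACEHOLDER_VALUES = {"undefined", "b/n", "---", "none", "n/a"}

def drop_placeholder_values(tags):
    # keep only tags whose value carries actual information
    return {key: value for key, value in tags.items() if value.lower() not in PLACEHOLDER_VALUES}

print(drop_placeholder_values({'phone': 'N/A', 'addr:city': 'Poznań'}))  # {'addr:city': 'Poznań'}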
@@ -137,6 +140,7 @@ def handle_ref_tag(data, atp_code): del data["ref"] return data + def remove_bad_phone_data(data, atp_code): if 'phone' in data: if data['phone'].replace(" ", "").startswith("+443"): @@ -157,6 +161,7 @@ def remove_bad_phone_data(data, atp_code): del data['phone'] return data + def is_valid_phone_tag(phone_tag): if ";" not in phone_tag: return is_valid_phone_number(phone_tag) @@ -166,13 +171,14 @@ def is_valid_phone_tag(phone_tag): return False return True + def is_valid_phone_number(phone): - if phone in [ - '+4800000000', # https://github.com/alltheplaces/alltheplaces/issues/8633 - ]: - return False - try: - parsed = phonenumbers.parse(phone, None) - return phonenumbers.is_valid_number(parsed) - except phonenumbers.phonenumberutil.NumberParseException: - return False + if phone in [ + '+4800000000', # https://github.com/alltheplaces/alltheplaces/issues/8633 + ]: + return False + try: + parsed = phonenumbers.parse(phone, None) + return phonenumbers.is_valid_number(parsed) + except phonenumbers.phonenumberutil.NumberParseException: + return False diff --git a/run.py b/run.py index 72c2f67..ddfffce 100644 --- a/run.py +++ b/run.py @@ -5,6 +5,7 @@ obtain_atp_data = __import__("2_obtain_atp_data") matcher = __import__("3_matcher") show_data = __import__("4_show_data") + def main(): # TODO: test dependencies on fresh OS # see readme for instructions how to install dependencies @@ -27,6 +28,7 @@ def main(): # maps listing various missing data - shops, tags, and of various wrong data (shop in OSM not in ATP and so on) show_data.main() + if __name__ == "__main__": main() diff --git a/serializing.py b/serializing.py index 89ed011..c952a43 100644 --- a/serializing.py +++ b/serializing.py @@ -2,6 +2,7 @@ import base64 import json import csv + class Match: def __init__(self, atp_center, atp_tags, osm_match_center, osm_match_tags, osm_link, match_distance, all_very_good_matches): self.atp_center = atp_center @@ -11,16 +12,18 @@ class Match: self.osm_link = osm_link self.match_distance = match_distance self.all_very_good_matches = all_very_good_matches + def __str__(self): return "Match(" + str(self.atp_center) + ',' + str(self.atp_tags) + ',' + str(self.osm_match_center) + ',' + str(self.osm_match_tags) + ',' + str(self.osm_link) + ',' + str(self.match_distance) + ',' + str(self.all_very_good_matches) + ")" + def save_list_of_matches_to_csv(filepath, data): with open(filepath, 'w', newline='') as f: writer = csv.writer(f) writer.writerow(['atp_lat', 'atp_lon', 'atp_tags_dict_in_base64', 'osm_lat', 'osm_lon', 'osm_tags_dict_in_base64', 'osm_link', 'match_distance', 'all_very_good_matches']) for entry in data: if entry.match_distance == None: - writer.writerow([entry.atp_center['lat'], entry.atp_center['lon'], encode_to_base64_via_json(entry.atp_tags),"","","","","", ""]) + writer.writerow([entry.atp_center['lat'], entry.atp_center['lon'], encode_to_base64_via_json(entry.atp_tags), "", "", "", "", "", ""]) else: writer.writerow([ entry.atp_center['lat'], @@ -32,7 +35,8 @@ def save_list_of_matches_to_csv(filepath, data): entry.osm_link, entry.match_distance, encode_to_base64_via_json(entry.all_very_good_matches) - ]) + ]) + def load_list_of_matches_from_csv(filepath): try: @@ -52,7 +56,7 @@ def load_list_of_matches_from_csv(filepath): osm_match_center = {'lat': float(row[3]), 'lon': float(row[4])} osm_match_tags = decode_from_base64_via_json(row[5]) for key, value in osm_match_tags.items(): - osm_match_tags[key] = str(value) # TODO - review saving code, this should not be needed + 
osm_match_tags[key] = str(value) # TODO - review saving code, this should not be needed osm_link = row[6] match_distance = float(row[7]) all_very_good_matches = decode_from_base64_via_json(row[8]) @@ -62,7 +66,9 @@ def load_list_of_matches_from_csv(filepath): print(filepath) raise -#gptchat generated +# gptchat generated + + def encode_to_base64_via_json(input_dict): # Convert the dictionary to a JSON string json_str = json.dumps(input_dict) @@ -74,7 +80,9 @@ def encode_to_base64_via_json(input_dict): base64_str = base64_bytes.decode('utf-8') return base64_str -#gptchat generated +# gptchat generated + + def decode_from_base64_via_json(base64_str): # Decode the Base64 string to bytes base64_bytes = base64_str.encode('utf-8') @@ -86,18 +94,20 @@ def decode_from_base64_via_json(base64_str): output_dict = json.loads(json_str) return output_dict + def generate_geojson_structure(dataset): - geojson_data = {"type": "FeatureCollection","features": []} + geojson_data = {"type": "FeatureCollection", "features": []} for atp in dataset: geojson_data['features'].append({"type": "Feature", - "geometry": { - "type": "Point", - "coordinates": [atp.atp_center['lon'], atp.atp_center['lat']] - }, - "properties": atp.atp_tags - }) + "geometry": { + "type": "Point", + "coordinates": [atp.atp_center['lon'], atp.atp_center['lat']] + }, + "properties": atp.atp_tags + }) return geojson_data + def generate_kml_text(dataset): geojson_data = generate_geojson_structure(dataset) returned = """<?xml version="1.0" encoding="UTF-8"?> diff --git a/shared.py b/shared.py index d9ff33e..29e0b3b 100644 --- a/shared.py +++ b/shared.py @@ -1,8 +1,10 @@ import geopy.distance + def link_to_point_in_osm(lat, lon): return 'https://www.openstreetmap.org/?mlat=' + str(lat) + "&mlon=" + str(lon) + "#map=19/" + str(lat) + '/' + str(lon) + def calculate_distance(point_a, point_b): # https://github.com/geopy/geopy?tab=readme-ov-file#measuring-distance coords_1 = (point_a['lat'], point_a['lon']) diff --git a/spatial_index.py b/spatial_index.py index 4a637bb..918d497 100644 --- a/spatial_index.py +++ b/spatial_index.py @@ -14,4 +14,3 @@ class SpatialIndex: # sort by longitude # select quickly by longitude, leaving unlimited for latitude - diff --git a/test_display_website.py b/test_display_website.py index 1180358..70234fe 100644 --- a/test_display_website.py +++ b/test_display_website.py @@ -1,9 +1,10 @@ +import distance_distribution +import url_checker +import leafleter +import serializing import unittest show_data = __import__("4_show_data") -import serializing -import leafleter -import url_checker -import distance_distribution + class IsCodeCompletelyCrashingSmoketests(unittest.TestCase): def test_rough_code_validity(self): @@ -49,6 +50,7 @@ class IsCodeCompletelyCrashingSmoketests(unittest.TestCase): for file in show_data.iterate_over_output_files('dummy_atp_code'): pass + class TagListFormattingTests(unittest.TestCase): def test_escaping_newlines(self): self.assertEqual(show_data.escape_html("ajaj\naaaa"), "ajaj<br>aaaa") @@ -59,6 +61,7 @@ class TagListFormattingTests(unittest.TestCase): def test_tag_list_generation_newline_in_tags_escape(self): self.assertEqual("aaaa<br>bbb" in show_data.tag_list_to_html({"description": "aaaa\nbbb"}), True) + class PhoneSuggestingTests(unittest.TestCase): def test_accept_normal_phone(self): add_tags_from_atp = show_data.ATPGivesTagsReportCreator(url_checker.URLChecker(), 'dummy identifier for tests', 'dummy area name') @@ -88,6 +91,7 @@ class PhoneSuggestingTests(unittest.TestCase): creator = 
show_data.ATPGivesTagsReportCreator(url_checker.URLChecker(), 'dummy_atp_code', 'dummy area name') self.assertEqual(creator.is_phone_eligible(match), False) + class WebsiteSuggestingTests(unittest.TestCase): def test_accept_normal_website(self): add_tags_from_atp = show_data.ATPGivesTagsReportCreator(url_checker.URLChecker(), 'dummy identifier for tests', 'dummy area name') diff --git a/test_general_smoke_test.py b/test_general_smoke_test.py index ece0c89..d29338b 100644 --- a/test_general_smoke_test.py +++ b/test_general_smoke_test.py @@ -3,6 +3,7 @@ import link_scan_worker import run import unittest + class SmokeTest(unittest.TestCase): def test_math(self): self.assertEqual(2+2, 4) diff --git a/test_matching_logic.py b/test_matching_logic.py index 9caf9ba..37a3a62 100644 --- a/test_matching_logic.py +++ b/test_matching_logic.py @@ -1,7 +1,8 @@ +import serializing import unittest matcher = __import__("3_matcher") config = __import__("0_config") -import serializing + class RealityTests(unittest.TestCase): def test_match_on_exact_match(self): @@ -59,7 +60,6 @@ class RealityTests(unittest.TestCase): matches = matcher.get_matches(osm_data, atp_data) self.assertEqual(matches[0].match_distance, None) - def test_accept_matches_for_ice_cream_synonyms(self): atp_data = [self.package_tags_into_mock({'brand': "Titan", 'amenity': 'ice_cream'})] osm_data = [self.package_tags_into_mock({'brand': "Titan", 'shop': 'ice_cream'})] @@ -119,4 +119,4 @@ class RealityTests(unittest.TestCase): matches = matcher.get_matches(osm_data, atp_data) self.assertEqual(matches[0].match_distance, 0) - #TODO: how to handle shop=yes shop=vacant + # TODO: how to handle shop=yes shop=vacant diff --git a/test_processing.py b/test_processing.py index 808c2a2..56ad49a 100644 --- a/test_processing.py +++ b/test_processing.py @@ -1,6 +1,7 @@ import unittest import qa + class RealityTests(unittest.TestCase): def test_mathworks(self): self.assertEqual(2 + 1, 3) diff --git a/test_spatial_index.py b/test_spatial_index.py index eca6216..b143701 100644 --- a/test_spatial_index.py +++ b/test_spatial_index.py @@ -1,6 +1,7 @@ import unittest import spatial_index + class Tests(unittest.TestCase): def test_basic_match_for_single_entry(self): data = [ @@ -96,7 +97,7 @@ class Tests(unittest.TestCase): if entry["tags"] not in matches: matches[entry["tags"]] = 0 matches[entry["tags"]] += 1 - self.assertEqual(matches, {4: 1, 5:1, 6:1, 7:1}) + self.assertEqual(matches, {4: 1, 5: 1, 6: 1, 7: 1}) def test_basic_match_for_all_entries_except_first(self): data = [ @@ -120,8 +121,7 @@ class Tests(unittest.TestCase): if entry["tags"] not in matches: matches[entry["tags"]] = 0 matches[entry["tags"]] += 1 - self.assertEqual(matches, {4: 1, 5:1, 6:1, 7:1}) - + self.assertEqual(matches, {4: 1, 5: 1, 6: 1, 7: 1}) def test_basic_match_for_all_entries_except_last(self): data = [ @@ -144,4 +144,4 @@ class Tests(unittest.TestCase): if entry["tags"] not in matches: matches[entry["tags"]] = 0 matches[entry["tags"]] += 1 - self.assertEqual(matches, {4: 1, 5:1, 6:1}) + self.assertEqual(matches, {4: 1, 5: 1, 6: 1}) diff --git a/url_checker.py b/url_checker.py index a91ce1d..6b5b7fc 100644 --- a/url_checker.py +++ b/url_checker.py @@ -6,6 +6,7 @@ import shutil import time config = __import__("0_config") + class URLChecker(): def __init__(self): """ @@ -15,7 +16,7 @@ class URLChecker(): that later should have been disposed but were not """ self.url_check_cache = diskcache.Cache(self.cache_path()) - urllib3.disable_warnings() # silences complaints about unverified 
requests via HTTPS + urllib3.disable_warnings() # silences complaints about unverified requests via HTTPS # this is done to ignore complaints about "verify=False" in requests.get # this is not so terrible as I only check is website up # see https://stackoverflow.com/questions/78855740/starfield-ca-not-recoggnised-by-requests-package @@ -89,7 +90,7 @@ class URLChecker(): # https://salony.orange.pl/pl/orange-jastrz%C4%99bie-zdr%C3%B3j-galeria-zdr%C3%B3j-26882 pass elif self.is_difference_limited_to_slash_at_end(atp_value, atp_after_redirect): - pass # just adding trailing / is not worth raising an alarm... I think? + pass # just adding trailing / is not worth raising an alarm... I think? else: self.consider_logging_that_atp_link_redirects(tested_key, atp_value, atp) return False @@ -110,13 +111,13 @@ class URLChecker(): if link_a[-1] == "/": link_a = link_a[:-1] if link_b[-1] == "/": - link_b =link_b[:-1] + link_b = link_b[:-1] return link_a == link_b def consider_logging_that_atp_link_was_rejected(self, tested_key, atp_value, atp): if atp.atp_tags['@spider'] not in [ - 'aldi_sud_de', # https://github.com/alltheplaces/alltheplaces/issues/9415 - 'true_value_us', # see above + 'aldi_sud_de', # https://github.com/alltheplaces/alltheplaces/issues/9415 + 'true_value_us', # see above ]: pass #do not log problems as long as above issues are not fixed @@ -125,8 +126,8 @@ class URLChecker(): def consider_logging_that_atp_link_redirects(self, tested_key, atp_value, atp): if atp.atp_tags["@spider"] not in [ - 'agata_meble_pl', # https://github.com/alltheplaces/alltheplaces/issues/9409 - 'bevmo_us', # https://github.com/alltheplaces/alltheplaces/issues/9493 + 'agata_meble_pl', # https://github.com/alltheplaces/alltheplaces/issues/9409 + 'bevmo_us', # https://github.com/alltheplaces/alltheplaces/issues/9493 ]: pass #do not log problems as long as above issues are not fixed @@ -290,7 +291,7 @@ class URLChecker(): 'sobeys.ca', 'zambrero.com', 'zambrero.com.au' - ]: + ]: # handles also broken such as # website = ps://www.biedronka.pl for protocol in ["", "http://", "https://", "ps://"]: @@ -302,7 +303,7 @@ class URLChecker(): 'https://www.circlek.pl/wyszukaj-stacje', 'http://www.statoil.pl', 'Biedronka.PL', - 'https://www.aldi-sued.de/de/homepage.html', # seems to be added by some ATP? + 'https://www.aldi-sued.de/de/homepage.html', # seems to be added by some ATP? 'https://allegro.pl/kampania/one/znajdz-nas', 'https://allegro.pl/kampania/one', 'https://www.castorama.pl', @@ -386,11 +387,10 @@ class URLChecker(): if self.get_free_space_in_mb(self.cache_path()) < 400: raise Exception("running out of free space on drive") - print(link, reason) try: headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36', } # NOTE: SSL verification checks are disabled # to keep https://aviastacjapaliw.pl/stacje/avia-protasy/ working diff --git a/url_checker_test.py b/url_checker_test.py index 6d04f5f..7c790a5 100644 --- a/url_checker_test.py +++ b/url_checker_test.py @@ -1,6 +1,7 @@ import unittest import url_checker + class LinkCheckingTests(unittest.TestCase): def test_link_rejector_rejecting_known_bad(self): test = url_checker.URLChecker()
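# A minimal sketch of the kind of liveness probe URLChecker (above) performs: a browser-like
# User-Agent, certificate verification switched off (the goal is only "is this site up", per
# the comments above), and the corresponding urllib3 warning silenced. The URL and timeout
# below are illustrative, not values taken from the repository.
import requests
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def is_website_up(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, verify=False, timeout=30)
        return response.status_code < 400
    except requests.RequestException:
        return False

print(is_website_up("https://www.openstreetmap.org"))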
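# A minimal sketch of the validation behind is_valid_phone_number in qa.py (above): parse with
# the phonenumbers library and ask whether the number is valid for its country. The sample
# values are illustrative; parsing with region=None requires an international prefix, which
# matches how ATP phone data is checked here.
import phonenumbers

def looks_like_valid_phone(phone):
    try:
        parsed = phonenumbers.parse(phone, None)
        return phonenumbers.is_valid_number(parsed)
    except phonenumbers.phonenumberutil.NumberParseException:
        return False

for sample in ["+48 22 123 45 67", "+4800000000", "not a phone"]:
    print(sample, looks_like_valid_phone(sample))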
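# A minimal sketch of why serializing.py (above) pushes tag dictionaries through JSON and then
# base64: the whole dictionary survives as a single opaque CSV cell, so commas, quotes and
# newlines inside tag values cannot break the row structure. The sample tags are illustrative.
import base64
import json

def encode_tags(tags):
    return base64.b64encode(json.dumps(tags).encode('utf-8')).decode('utf-8')

def decode_tags(cell):
    return json.loads(base64.b64decode(cell.encode('utf-8')).decode('utf-8'))

tags = {'brand': 'Żabka', 'opening_hours': 'Mo-Su 06:00-23:00, PH off'}
cell = encode_tags(tags)
assert decode_tags(cell) == tags  # the round trip preserves the dictionary exactly
print(cell)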