Mirror of https://codeberg.org/matkoniecz/list_how_openstreetmap_can_be_improved_with_alltheplaces_data.git
(synced 2025-05-14 05:33:09 +02:00)
Compare commits: 10 commits, f5bf8500eb ... ce1844befe

Commits (SHA1): ce1844befe, 388d9a3333, c6727b19fb, 7362c1aff8, b447d1b501, 81948b4725, cb76e93d8e, 1545e90fd9, 91670013ae, 9e67921a71
6 changed files with 49 additions and 33 deletions
```diff
@@ -1054,8 +1054,6 @@ def name_from_tags(tags):
     name = tags.get("brand", None)
     if name == None:
         name = tags.get("short_name", None)
-    if name == None:
-        name = tags.get("operator", None)
     if name == None:
         name = str(tags)
     return name
```
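The net effect of this hunk: `operator` is dropped from the display-name fallback chain. A minimal runnable sketch of the post-change behavior (the sample tag dicts are invented; `is None` is used in place of the source's `== None`):

```python
# Paraphrase of the trimmed fallback chain; sample tag dicts are invented.
def name_from_tags(tags):
    name = tags.get("brand", None)
    if name is None:
        name = tags.get("short_name", None)
    if name is None:
        name = str(tags)
    return name

print(name_from_tags({"brand": "Żabka"}))
# Żabka
print(name_from_tags({"operator": "Poczta Polska", "amenity": "post_office"}))
# {'operator': 'Poczta Polska', 'amenity': 'post_office'}  <- operator no longer used
```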
```diff
@@ -34,7 +34,7 @@ def spider_codes_iterator_with_data():
             continue


-def all_spider_codes_iterator():
+def all_spider_codes_iterator(): # TODO rename: actually, it skips ignored ones
     for _item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
         yield atp_code

```
```diff
@@ -42,6 +42,8 @@ def all_spider_codes_iterator():
 def spider_codes_and_filepaths_iterator_including_broken_data_ones():
     """
     this one is not parsing .geojson files so will be faster
+
+    skips ignored spiders
     """
     directory_path_with_unpacked_spider_data = obtain_atp_data.latest_atp_unpacked_folder()
     # TODO: there is no full match between spider codes and their filenames
```
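How the two iterators relate, as far as this hunk shows (a usage sketch, assuming the module is importable; both helpers are project-internal):

```python
# Usage sketch; both iterators appear in the diff above.
for item_path, atp_code in spider_codes_and_filepaths_iterator_including_broken_data_ones():
    print(item_path, atp_code)  # fast: filenames only, no .geojson parsing

for atp_code in all_spider_codes_iterator():
    print(atp_code)  # same spiders, filepath dropped; ignored spiders are skipped
```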
matcher.py (14 changed lines)
```diff
@@ -19,16 +19,21 @@ def filter_osm_data_with_dict(current_osm, osm_data_tag_filter):
             returned.append(osm)
     return returned


+def name_giving_keys():
+    return ["name", "short_name", "brand", "brand:en"]
+
+
 def is_matching_any_name_part_to_osm_tags(name_part_list, osm_tags):
-    # if changing anything here, please also change this_tag_lists_match debug code
+    # if changing anything here, please also change this_tag_lists_match debug code in tests
     for part in name_part_list:
-        for key in ["name", "short_name", "brand", "brand:en",
-                    "operator", # https://www.openstreetmap.org/node/1922605509/history - TODO, remove, see https://community.openstreetmap.org/t/brand-poczta-polska-propozycja-automatycznej-edycji-kasowanie-operator-dodanego-na-slepo-przez-nsi/116091
-                    ]:
+        name_giving_key_present = False
+        for key in name_giving_keys():
             if key in osm_tags:
+                name_giving_key_present = True
                 if matching_name_part(part, osm_tags[key]):
                     return True
+        if name_giving_key_present == False:
+            return True # nameless objects are matching, as matcher is supposed to be eager
+            # (we want false positives rather false negatives at that stage)
     return False
```
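The rewritten matcher now treats objects without any name-giving key as matches. An illustrative call (inputs invented; `lebioda` is borrowed from the test further down):

```python
# Invented inputs; assumes matcher.py from this diff is importable.
import matcher

matcher.is_matching_any_name_part_to_osm_tags(["Żabka"], {"name": "lebioda", "shop": "convenience"})
# -> False: a name-giving key is present, but no value matches

matcher.is_matching_any_name_part_to_osm_tags(["Żabka"], {"shop": "convenience"})
# -> True: no name-giving key at all, so the eager matcher keeps the candidate
#    (false positives are preferred to false negatives at this stage)
```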
```diff
@@ -170,7 +175,6 @@ def get_matches(osm_data, atp_data):
     # TODO: handle nearby objects with matching feature type or vacant ones
     filtered_osm = filter_with_fuzzy_name_match(osm_data, filter_names)
     osm_index = spatial_index.SpatialIndex(filtered_osm)
-    #print("filtering reduced entry count to", len(filtered_osm), "candidates based on names, now checking", len(atp_data), "ATP candidates by distance")
     for atp in atp_data:
         distance_scan_in_kilometers = config.maximum_missing_shop_distance_in_kilometers()
         best_match = None
```
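For context, the visible shape of `get_matches` around this hunk: fuzzy-filter OSM by ATP names, build a spatial index, then scan each ATP entry within the configured distance. A condensed sketch; helper names come from the diff, and the ranking step is paraphrased rather than the repository's actual code:

```python
# Condensed sketch of the pipeline visible in this hunk; the candidate
# ranking inside the loop is paraphrased, not the repository's code.
def get_matches_sketch(osm_data, atp_data, filter_names):
    filtered_osm = filter_with_fuzzy_name_match(osm_data, filter_names)  # names first
    osm_index = spatial_index.SpatialIndex(filtered_osm)                 # then geometry
    matches = []
    for atp in atp_data:
        distance_scan_in_kilometers = config.maximum_missing_shop_distance_in_kilometers()
        best_match = None
        # ...query osm_index for candidates within the distance and keep the best...
        matches.append(best_match)
    return matches
```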
```diff
@@ -84,9 +84,8 @@ class MismatchingNameReportCreator:
         print()

     def is_there_clear_name_match(self, entry):
-        # TODO get rid of operator, honestly it is a hack for Poczta Polska
-        atp_name_claims = {entry.atp_tags.get("name"), entry.atp_tags.get("brand"), entry.atp_tags.get("short_name"), entry.atp_tags.get("operator"), entry.atp_tags.get("atp_listing_name")}
-        osm_name_claims = {entry.osm_match_tags.get("name"), entry.osm_match_tags.get("brand"), entry.osm_match_tags.get("short_name"), entry.atp_tags.get("operator")}
+        atp_name_claims = {entry.atp_tags.get("name"), entry.atp_tags.get("brand"), entry.atp_tags.get("short_name"), entry.atp_tags.get("atp_listing_name")}
+        osm_name_claims = {entry.osm_match_tags.get("name"), entry.osm_match_tags.get("brand"), entry.osm_match_tags.get("short_name")}
         atp_osm_full_name_match = atp_name_claims & osm_name_claims - {None}
         return len(atp_osm_full_name_match) > 0
```
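One subtlety in the kept line: binary `-` binds tighter than `&` in Python, so `atp_name_claims & osm_name_claims - {None}` evaluates as `atp_name_claims & (osm_name_claims - {None})`, which gives the same result here as intersecting first and discarding `None` afterwards. A runnable check with invented values:

```python
# Sample values invented for illustration.
atp_name_claims = {"Żabka", None}          # a missing tag puts None into the set
osm_name_claims = {"Żabka", "Freshmarket", None}

# Binary '-' binds tighter than '&', so this is A & (B - {None}):
atp_osm_full_name_match = atp_name_claims & osm_name_claims - {None}
print(atp_osm_full_name_match)  # {'Żabka'} -> non-empty, so a clear name match
```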
```diff
@@ -47,10 +47,6 @@ class MatchingTests(unittest.TestCase):
         name_part_list = matcher.get_filter_names_from_atp_dataset([{'tags': {'brand': 'denns b'}}])
         self.assertEqual(matcher.is_matching_any_name_part_to_osm_tags(name_part_list, {'name': "lebioda"}), False)

-    def test_match_between_operator_and_brand(self):
-        name_part_list = matcher.get_filter_names_from_atp_dataset([{'tags': {'brand': 'Poczta Polska'}}])
-        self.assertEqual(matcher.is_matching_any_name_part_to_osm_tags(name_part_list, {'name': "FUP Kraków 28", "operator": "Poczta Polska"}), True)
-
     def test_match_on_short_names(self):
         name_part_list = matcher.get_filter_names_from_atp_dataset([{'tags': {'brand': 'GAP'}}])
         self.assertEqual(matcher.is_matching_any_name_part_to_osm_tags(name_part_list, {'brand': "GAP"}), True)
```
```diff
@@ -89,6 +85,8 @@ class MatchingTests(unittest.TestCase):
         osm_data = [MatchingTests.package_tags_into_mock(osm_tags)]
         matches = matcher.get_matches(osm_data, atp_data)
         if debug:
+            # this recreates matcher.get_matches but has some extra logging that is supposed to make debugging easier
+            # when tests fail for some reason
             print()
             # based on matcher.get_matches internals
             filter_names = matcher.get_filter_names_from_atp_dataset(atp_data)
```
```diff
@@ -96,13 +94,15 @@ class MatchingTests(unittest.TestCase):
             print("claimed name match", matcher.is_matching_any_name_part_to_osm_tags(filter_names, osm_tags))
             # replicates matcher.is_matching_any_name_part_to_osm_tags
             name_part_list = filter_names
+            name_giving_key_present = False
             for part in name_part_list:
-                for key in ["name", "brand", "brand:en",
-                            "operator",
-                            ]:
+                for key in matcher.name_giving_keys():
                     if key in osm_tags:
+                        name_giving_key_present = True
                         if matcher.matching_name_part(part, osm_tags[key]):
                             print(part, "matches to osm_tags[key] with value", osm_tags[key], "for key", key)
+            if name_giving_key_present == False:
+                print("matched as nameless one")

             print(matcher.filter_with_fuzzy_name_match(osm_data, filter_names))
             print(str(matches[0]))
```
```diff
@@ -114,6 +114,12 @@ class MatchingTests(unittest.TestCase):
     def test_reject_mismatches_based_on_object_type(self):
         self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'shop': 'pastry'}, {'brand': "Titan", 'amenity': 'fuel'}), False)

+    def test_reject_mismatches_based_on_object_name(self):
+        self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'shop': 'pastry'}, {'brand': "Sauron", 'shop': 'pastry'}), False)
+
+    def test_accept_matches_for_nameless_ones(self):
+        self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'amenity': 'ice_cream'}, {'shop': 'ice_cream'}), True)
+
     def test_accept_matches_for_ice_cream_synonyms(self):
         self.assertEqual(self.this_tag_lists_match({'brand': "Titan", 'amenity': 'ice_cream'}, {'brand': "Titan", 'shop': 'ice_cream'}), True)
```
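`this_tag_lists_match` itself is outside this diff; from the calls above and the hunk at `@@ -89,6 +85,8 @@` its shape can be guessed roughly as below. This is an inferred sketch, not the repository's actual helper, and the return convention is an assumption:

```python
# Inferred sketch only; the real helper lives in the test file outside this diff.
def this_tag_lists_match(self, atp_tags, osm_tags, debug=False):
    atp_data = [{'tags': atp_tags}]
    osm_data = [MatchingTests.package_tags_into_mock(osm_tags)]
    matches = matcher.get_matches(osm_data, atp_data)
    if debug:
        ...  # replicate matcher internals with extra logging, as in the hunk above
    return matches[0] is not None  # assumption about the return convention
```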
```diff
@@ -5,6 +5,7 @@ import os
 import json
 import compute_local_time
+import datetime
 from collections import defaultdict

 config = __import__("0_config")
 obtain_atp_data = __import__("2_obtain_atp_data")
```
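The `__import__(...)` calls are needed because module names beginning with a digit (`0_config.py`, `2_obtain_atp_data.py` in this repository's layout) are not valid Python identifiers, so a plain `import` statement cannot spell them:

```python
# Module names starting with a digit are not valid Python identifiers,
# so a plain import statement cannot name them:
# import 0_config                  # SyntaxError
config = __import__("0_config")    # works: takes the module name as a string

# importlib spells the same thing more idiomatically:
import importlib
config = importlib.import_module("0_config")
```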
```diff
@@ -13,7 +14,6 @@ obtain_atp_data = __import__("2_obtain_atp_data")
 def download():
     for entry in obtain_atp_data.get_atp_history()[::-1]:
         run_id = entry['run_id']
-        print(run_id)
         obtain_atp_data.download_specific_atp_run(run_id)

```
```diff
@@ -35,6 +35,7 @@ def view():
     # for zabka see
     # https://github.com/alltheplaces/alltheplaces/issues/5888 https://github.com/alltheplaces/alltheplaces/pull/5586
     atp_code = "zabka_pl"
+    collected_for_given_weekday = defaultdict(list)
     for entry in obtain_atp_data.get_atp_history()[::-1]:
         run_id = entry['run_id']
         run_metadata = obtain_atp_data.get_atp_specific_spider_run_metadata(run_id, atp_code)
```
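`collected_for_given_weekday = defaultdict(list)` lets the per-weekday append further below work without first checking whether the key exists. A self-contained illustration with invented values:

```python
from collections import defaultdict

collected_for_given_weekday = defaultdict(list)
# Appending under a missing key silently creates the list first:
collected_for_given_weekday["Mo"].append({"opening_hours": "6:00-23:00", "date": "2024-05-06"})
collected_for_given_weekday["Mo"].append({"opening_hours": "6:00-23:00", "date": "2024-05-13"})
print(dict(collected_for_given_weekday))
```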
```diff
@@ -42,26 +43,19 @@ def view():
         print(run_id)
         folder = obtain_atp_data.atp_unpacked_folder(run_id)
         source_atp_filepath = folder + atp_code + ".geojson"
         if os.path.isfile(source_atp_filepath) == False:
             print("file", source_atp_filepath, "is missing")
             continue
         with open(source_atp_filepath) as file:
             try:
                 data = json.load(file)
                 rich.print("spider:collection_time", data['dataset_attributes'].get("spider:collection_time"))
                 for feature in data['features']:
                     if relevant_one(feature):
                         # rich.print(feature['properties']['addr:city'])
                         print()
                         opening_hours = feature['properties']['opening_hours']
                         rich.print(opening_hours)
-                        oh = parser.OpeningHours(opening_hours)
-                        oh.display()
-                        for day in parser.OpeningHours.DAYS_OF_WEEK:
-                            stringified = []
-                            for time_selector in oh.parsed[day]:
-                                stringified.append(str(time_selector)) # .from_hours, .from_minutes, .to_hours, .to_minutes
-                            reduced_to_canonical_string = ",".join(stringified)
-                            print(day, reduced_to_canonical_string)
-                        print(opening_hours)

                         # rich.print(feature)
                         rich.print(feature['geometry']['coordinates'])
                         longitude = feature['geometry']['coordinates'][0]
                         latitude = feature['geometry']['coordinates'][1]
```
```diff
@@ -71,10 +65,23 @@ def view():
                         print('in local t', start_time)
                         print('in local t', start_time.date())
+                        weekday_index = start_time.date().weekday()
+
+                        oh = parser.OpeningHours(opening_hours)
+                        oh.display()
+
+                        for offset in range(7):
+                            date = start_time.date() + datetime.timedelta(days=offset)
+                            day_code = parser.OpeningHours.day_of_week_code(weekday_index + offset)
+                            print(offset, date, day_code)
+
+                            stringified = []
+                            for time_selector in oh.parsed[day_code]:
+                                stringified.append(str(time_selector)) # .from_hours, .from_minutes, .to_hours, .to_minutes
+                            reduced_to_canonical_string = ",".join(stringified)
+
+                            print(offset, date, day_code, reduced_to_canonical_string)
+
+                            collected_for_given_weekday[day_code].append({"opening_hours": reduced_to_canonical_string, "date": date})

                 print('finish_time', run_metadata['finish_time'].replace("T", " "))
                 finish_time = compute_local_time.convert_utc_timestamp_string_to_local(run_metadata['finish_time'], latitude, longitude)
                 print('in local ti', finish_time)
```
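The idea behind the canonicalization moved around here: parse `opening_hours` once, then reduce each weekday's time selectors to a stable comma-joined string so hours can be compared across runs. A sketch against the `parser.OpeningHours` API as it appears in the diff (the exact canonical strings depend on what `str(time_selector)` yields):

```python
# Sketch using the parser API visible in the diff: parser.OpeningHours,
# .parsed, .display() and DAYS_OF_WEEK.
oh = parser.OpeningHours("Mo-Sa 06:00-23:00")
oh.display()
for day in parser.OpeningHours.DAYS_OF_WEEK:
    stringified = [str(time_selector) for time_selector in oh.parsed[day]]
    print(day, ",".join(stringified))
# Two runs advertise the same Monday hours exactly when their canonical
# Monday strings compare equal.
```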
```diff
@@ -86,7 +93,7 @@ def view():

             except json.decoder.JSONDecodeError as e:
                 print(e)

+    rich.print(collected_for_given_weekday)
     # {"type": "Feature", "id": "y0P1sd7RSJB-6RBe-vTNFRUDZEg=", "properties": {"ref": "Z2220", "@source_uri": "https://www.zabka.pl/app/uploads/locator-store-data.json", "@spider": "zabka_pl", "name:pl": "\u017babka", "name:uk": "\u0416\u0430\u0431\u043a\u0430", "shop": "convenience", "addr:street_address": "ul. Miko\u0142ajczyka/os. Kalinowe 12c", "addr:city": "Krak\u00f3w", "addr:country": "PL", "name": "\u017babka", "website": "https://www.zabka.pl/znajdz-sklep/ID02220-krakow-ul-mikolajczyka-os-kalinowe-12c/", "opening_hours": "Mo-Sa 06:00-23:00", "brand": "\u017babka", "brand:wikidata": "Q2589061", "nsi_id": "zabka-f55ead"}, "geometry": {"type": "Point", "coordinates": [20.014811, 50.092896]}},
```