1
0
Fork 0

miserable but working opening hours parser

This commit is contained in:
Mateusz Konieczny 2025-01-27 13:21:47 +01:00
parent 7706d3ec0f
commit bff7c10792
2 changed files with 392 additions and 0 deletions

315
opening_hours_parser.py Normal file
View file

@ -0,0 +1,315 @@
"""
This is a low-quality parser of a tiny part of opening_hours syntax
It definitely can be done better. I would appreciate pointers to better
ways to achive it in Python.
It is definitely miserably slow.
It may be enough here as ATP does not emit elaborate opening_hours syntax
So any opening hours that need them will mismatch with ATP opening hours by default
opening_hours key is described at https://wiki.openstreetmap.org/wiki/Key:opening_hours
formal specification is at https://wiki.openstreetmap.org/wiki/Key:opening_hours/specification
verification tool is at https://openingh.openstreetmap.de/evaluation_tool/?setLng=en
For supported examples see test_opening_hours_parser.py file
"""
import logging
import re
import time

import rich
class OpeningHours():
    """Parse and compare a small subset of the OSM ``opening_hours`` syntax.

    Supported input is a ``;``-separated list of rule sequences, each made of
    ``,``-separated parts such as ``Mo-Fr 06:00-23:00``, ``Sa,Su 10:00-14:00``,
    ``Su off`` or an additional time range (``,16:00-20:00``) for the
    preceding weekday selector. ``24/7`` is accepted as a shortcut.

    After construction ``parsed`` is either ``None`` (input not understood)
    or a dict mapping each weekday abbreviation to a sorted list of
    non-overlapping ``TimeSelector`` objects that all end at or before 24:00.
    Two instances compare equal when those per-day lists match; when either
    side failed to parse, the raw strings are compared instead.
    """

    DAYS = ('Mo', 'Tu', 'We', 'Th', 'Fr', 'Sa', 'Su')

    _logger = logging.getLogger(__name__)
    _DAY_PROBE = re.compile(r"^\s*(Mo|Tu|We|Th|Fr|Sa|Su)")
    # Mo / Mo,Tu / Mo-Su ; group 2 is the separator, group 3 the second day
    _WEEKDAY_SELECTOR = re.compile(r"^\s*(Mo|Tu|We|Th|Fr|Sa|Su)(?:([,-])(Mo|Tu|We|Th|Fr|Sa|Su))?")
    # "off" / "closed" / 10:00-18:00
    _TIME_SELECTOR = re.compile(r"^\s*((off|closed)|(\d+):(\d+)\s*-\s*(\d+):(\d+))")

    class WeekdaySelector():
        """Set of weekdays built from an inclusive range and/or a list of days.

        ``selected_days`` maps every weekday abbreviation to ``True``/``False``.
        """

        _NEXT_DAY = {
            'Mo': 'Tu',
            'Tu': 'We',
            'We': 'Th',
            'Th': 'Fr',
            'Fr': 'Sa',
            'Sa': 'Su',
            'Su': 'Mo',
        }

        def is_valid_day(self, day):
            """Return True when ``day`` is a two-letter weekday abbreviation."""
            return day in self._NEXT_DAY

        @staticmethod
        def next_day(day):
            """Return the weekday following ``day`` (``Su`` wraps to ``Mo``)."""
            return OpeningHours.WeekdaySelector._NEXT_DAY[day]

        def __init__(self, range_from=None, range_to=None, list_of_days=None):
            """Select the days of an inclusive range and/or of an explicit list.

            Example parameters:
                range_from=None, range_to=None, list_of_days=['Mo', 'Su']
                range_from='Mo', range_to='Tu', list_of_days=['Fr', 'Su']

            A range may wrap around the weekend (``Sa``-``Mo``). A range whose
            ends are the same day selects only that day.

            Raises:
                ValueError: when any provided day is not a valid abbreviation.
            """
            self.selected_days = {day: False for day in self._NEXT_DAY}
            if range_from is not None:
                for boundary in (range_from, range_to):
                    if not self.is_valid_day(boundary):
                        raise ValueError("`" + str(boundary) + "` is not valid")
                day = range_from
                self.selected_days[day] = True
                # stop as soon as the end is reached, so Mo-Mo is a single day
                while day != range_to:
                    day = OpeningHours.WeekdaySelector.next_day(day)
                    self.selected_days[day] = True
            if list_of_days is not None:
                for day in list_of_days:
                    if not self.is_valid_day(day):
                        raise ValueError("`" + str(day) + "` is not valid")
                    self.selected_days[day] = True

    class TimeSelector():
        """A single ``HH:MM-HH:MM`` time range; values are stored as ints."""

        def __init__(self, from_hours, from_minutes, to_hours, to_minutes):
            self.from_hours = int(from_hours)
            self.from_minutes = int(from_minutes)
            self.to_hours = int(to_hours)
            self.to_minutes = int(to_minutes)

        def from_time(self):
            """Return the start of the range in minutes since midnight."""
            return self.from_hours * 60 + self.from_minutes

        def to_time(self):
            """Return the end of the range in minutes since midnight."""
            return self.to_hours * 60 + self.to_minutes

        def __str__(self):
            return f"{self.from_hours:02d}:{self.from_minutes:02d}-{self.to_hours:02d}:{self.to_minutes:02d}"

        def __repr__(self):
            return f"TimeSelector({self})"

        def __rich__(self):
            return self.__str__()

        def __lt__(self, other):
            # ordering by start time only; this is what sorting before merging needs
            return self.from_time() < other.from_time()

    def __init__(self, opening_hours_string, respect_semicolon_override=True):
        """Parse ``opening_hours_string`` and keep the result in ``self.parsed``.

        Args:
            opening_hours_string: raw ``opening_hours`` value.
            respect_semicolon_override: when True, a rule sequence after ``;``
                replaces what earlier rule sequences set for the days it
                names; when False everything is merged together.
        """
        self.respect_semicolon_override = respect_semicolon_override # TODO remove that workaround for ATP
        self.raw = opening_hours_string
        self.parsed = self.parse(self.raw)

    def parse(self, opening_hours_string):
        """Return a dict of weekday -> merged ``TimeSelector`` list, or None.

        None is returned for anything outside of the supported subset
        (comments in double quotes, selectors other than weekdays, lists of
        more than two days, empty input, ...).
        """
        if '"' in opening_hours_string:
            # comment? giving up immediately
            return None
        if opening_hours_string.strip() == "24/7":
            return self.parse("Mo-Su 00:00-24:00")
        parsed_rule_sequences = []
        for rule_sequence in opening_hours_string.split(";"):
            if rule_sequence.strip() == "":
                # stray or trailing semicolon carries no rule
                continue
            entries = self._parse_rule_sequence(rule_sequence)
            if entries is None:
                self._logger.debug("failed to parse `%s` (part of `%s`)", rule_sequence, opening_hours_string)
                return None
            parsed_rule_sequences.append(entries)
        if not parsed_rule_sequences:
            return None
        per_day = self._expand_to_days(parsed_rule_sequences)
        # merge multiple redundant time selectors for each day
        return {day: self._merge_time_selectors(selectors) for day, selectors in per_day.items()}

    def _parse_rule_sequence(self, rule_sequence):
        """Parse one ``;``-delimited rule sequence into a list of entries.

        Each entry is ``{'weekdays_selector': ..., 'time_selectors': [...]}``;
        an ``off``/``closed`` part yields an entry with no time selectors.
        Returns None when the text is not understood.
        """
        entries = []
        remaining_part = rule_sequence
        while True:
            parsed_part = self.parse_small_range_selectors(remaining_part)
            if parsed_part is None:
                return None
            if 'time_selector_off_mode' in parsed_part:
                # kept as an entry without opening times, so that it can
                # still override days set by earlier rule sequences
                time_selectors = []
            else:
                time_selectors = [parsed_part['time_selector']]
            entries.append({'weekdays_selector': parsed_part['weekdays_selector'], 'time_selectors': time_selectors})
            remaining_part = parsed_part['remaining_part'].strip()
            # either remaining_part is empty and we finished parsing of this part
            #
            # or what remains may look like
            # ,Sa 06:00-23:00
            # then it is simply another day range
            # but it may also look like
            # ,06:00-23:00
            # then it is another time range to be applied here
            #
            # anything else? we give up
            while remaining_part != "":
                if remaining_part[0] != ",":
                    return None # unexpected
                remaining_part = remaining_part[1:]
                if self._DAY_PROBE.search(remaining_part) is not None:
                    # ready for next round of parsing
                    break
                # next hours of the same day
                parsed_part = self.parse_time_selector(remaining_part)
                if parsed_part is None or 'time_selector' not in parsed_part:
                    return None
                if not entries[-1]['time_selectors']:
                    # extra hours attached to an "off" part make no sense
                    return None
                # applies to the same days as the previous time selector
                entries[-1]['time_selectors'].append(parsed_part['time_selector'])
                remaining_part = parsed_part['remaining_part'].strip()
            if remaining_part == "":
                # finished parsing of this part
                return entries

    def _expand_to_days(self, parsed_rule_sequences):
        """Decompose parsed entries into per-day lists of time ranges.

        Ranges running over midnight are split into a part ending at 24:00 and
        a part starting at 00:00 on the following day. Every stored selector is
        a fresh object, so later merging cannot leak changes between days.
        """
        returned = {day: [] for day in self.DAYS}
        for from_one_rule_sequence in parsed_rule_sequences:
            if self.respect_semicolon_override:
                # apply overrides caused by semicolons
                # note that over-midnight overhang does not reset previous days
                # see say We 09:00-10:00,Tu 06:00-07:00;Tu 10:00-02:00
                # ";Tu 10:00-02:00" resets Tuesday part from previous rule
                # but not Wednesday part despite modifying both days
                for entry in from_one_rule_sequence:
                    for day, enabled in entry['weekdays_selector'].selected_days.items():
                        if enabled:
                            returned[day] = []
            for entry in from_one_rule_sequence:
                for day, enabled in entry['weekdays_selector'].selected_days.items():
                    if not enabled:
                        continue
                    for time_selector in entry['time_selectors']:
                        from_hours = time_selector.from_hours
                        from_minutes = time_selector.from_minutes
                        to_hours = time_selector.to_hours
                        to_minutes = time_selector.to_minutes
                        if time_selector.from_time() < time_selector.to_time() and to_hours <= 23:
                            # regular range
                            returned[day].append(OpeningHours.TimeSelector(from_hours, from_minutes, to_hours, to_minutes))
                            continue
                        # range reaching or crossing midnight
                        if to_hours >= 24:
                            # 24:00 leaves nothing for the next day, 26:00 leaves 02:00
                            to_hours -= 24
                        returned[day].append(OpeningHours.TimeSelector(from_hours, from_minutes, 24, 0))
                        following_day = OpeningHours.WeekdaySelector.next_day(day)
                        returned[following_day].append(OpeningHours.TimeSelector(0, 0, to_hours, to_minutes))
        return returned

    @staticmethod
    def _merge_time_selectors(time_selectors):
        """Return sorted, non-overlapping ranges covering the same time.

        Empty ranges (start equal to end) are dropped; touching or overlapping
        ranges are joined. Selectors in the input may be modified in place.
        """
        merged_time_selectors = []
        for time_selector in sorted(time_selectors):
            if time_selector.from_time() == time_selector.to_time():
                # drop degenerated empty range
                continue
            if not merged_time_selectors or merged_time_selectors[-1].to_time() < time_selector.from_time():
                # first one or gap between ranges
                merged_time_selectors.append(time_selector)
            elif time_selector.to_time() > merged_time_selectors[-1].to_time():
                # sorted by start time, so only the end may need extending
                merged_time_selectors[-1].to_hours = time_selector.to_hours
                merged_time_selectors[-1].to_minutes = time_selector.to_minutes
        return merged_time_selectors

    def parse_time_selector(self, remaining_part):
        """Consume a leading time range or ``off``/``closed`` marker.

        Returns None when ``remaining_part`` starts with neither. Otherwise
        returns a dict with ``remaining_part`` (unconsumed text) and either
        ``time_selector`` (a ``TimeSelector``) or ``time_selector_off_mode``.
        """
        time_selector = self._TIME_SELECTOR.search(remaining_part)
        if not time_selector:
            self._logger.debug("time_selector not found in `%s`", remaining_part)
            return None
        remaining_part = remaining_part[len(time_selector.group(0)):]
        if time_selector.group(2) is not None:
            # TODO off marker may still have time range!
            return {'time_selector_off_mode': None, 'remaining_part': remaining_part}
        parsed_time_selector = OpeningHours.TimeSelector(
            from_hours=time_selector.group(3),
            from_minutes=time_selector.group(4),
            to_hours=time_selector.group(5),
            to_minutes=time_selector.group(6),
        )
        return {'time_selector': parsed_time_selector, 'remaining_part': remaining_part}

    def parse_small_range_selectors(self, remaining_part):
        """Consume a weekday selector followed by a time selector.

        The weekday selector is a single day, two days joined by ``,`` or a
        range joined by ``-``. Returns the dict from ``parse_time_selector``
        with a ``weekdays_selector`` key added, or None when either part is
        missing.
        """
        weekday_selector = self._WEEKDAY_SELECTOR.search(remaining_part)
        if not weekday_selector:
            return None
        remaining_part = remaining_part[len(weekday_selector.group(0)):].strip()
        from_day = weekday_selector.group(1)
        separator = weekday_selector.group(2)
        to_day = weekday_selector.group(3)
        if separator == "-":
            applicable_days = OpeningHours.WeekdaySelector(range_from=from_day, range_to=to_day, list_of_days=None)
        elif separator is None:
            applicable_days = OpeningHours.WeekdaySelector(range_from=None, range_to=None, list_of_days=[from_day])
        else:
            applicable_days = OpeningHours.WeekdaySelector(range_from=None, range_to=None, list_of_days=[from_day, to_day])
        result = self.parse_time_selector(remaining_part)
        if result is None:
            return None
        result['weekdays_selector'] = applicable_days
        return result

    def display(self):
        """Print the raw value followed by the parsed ranges of each day."""
        print()
        print(self.raw)
        if self.parsed is None:
            print("Failed to parse", self.raw)
            return
        for day in self.DAYS:
            print(day)
            for entry in self.parsed[day]:
                print(" " , entry)

    def __eq__(self, other):
        """Compare per-day ranges; fall back to raw strings if parsing failed."""
        if not isinstance(other, OpeningHours):
            return NotImplemented
        if self.parsed is None or other.parsed is None:
            return self.raw == other.raw
        for day in self.DAYS:
            own = self.parsed[day]
            theirs = other.parsed[day]
            if len(own) != len(theirs):
                return False
            # both have their time selectors ordered, so we can compare matching ones
            for left, right in zip(own, theirs):
                if (left.from_hours, left.from_minutes, left.to_hours, left.to_minutes) != (right.from_hours, right.from_minutes, right.to_hours, right.to_minutes):
                    return False
        return True
if __name__ == "__main__":
    # Small manual check: the order of rules for different days must not matter.
    monday_first = OpeningHours("Mo 06:00-23:00;Sa 06:00-23:00")
    saturday_first = OpeningHours("Sa 06:00-23:00;Mo 06:00-23:00")
    print(monday_first == saturday_first)

View file

@ -0,0 +1,77 @@
import unittest
import opening_hours_parser as parser
class TestOpeningHoursParser(unittest.TestCase):
    """Checks which differently written opening_hours values compare as equal."""

    def assert_same_hours(self, first, second):
        """Assert that both strings give equal ``OpeningHours`` objects."""
        self.assertEqual(parser.OpeningHours(first), parser.OpeningHours(second))

    def assert_different_hours(self, first, second):
        """Assert that both strings give unequal ``OpeningHours`` objects."""
        self.assertNotEqual(parser.OpeningHours(first), parser.OpeningHours(second))

    def test_matches_itself(self):
        self.assert_same_hours("Mo-Sa 06:00-23:00", "Mo-Sa 06:00-23:00")

    def test_trivial_reordering_match(self):
        self.assert_same_hours(
            "Mo 06:00-23:00;Sa 06:00-23:00",
            "Sa 06:00-23:00;Mo 06:00-23:00",
        )

    def test_simple_over_midnight(self):
        self.assert_same_hours("Mo 06:00-01:00", "Mo 06:00-24:00;Tu 00:00-01:00")

    def test_semicolon_and_comma_separator_may_mean_the_same(self):
        self.assert_same_hours(
            "Mo-Fr 06:00-23:00;Sa 06:00-23:00",
            "Mo-Fr 06:00-23:00,Sa 06:00-23:00",
        )
        self.assert_same_hours(
            "Mo-Sa 06:00-23:00; Su off",
            "Mo-Sa 06:00-23:00, Su off",
        )

    def test_semicolon_and_comma_separator_may_change_meaning_for_overnight_range_being_overriden(self):
        """
        Note that
        Mo-Fr 06:00-02:00;Sa 06:00-23:00
        and
        Mo-Fr 06:00-02:00,Sa 06:00-23:00
        have a different meaning: with `;` the object is not open early on
        Saturday morning, as the "Sa 06:00-23:00" rule overwrote the previous one.
        With `,` a union is made, so the object is also open early on Saturday.
        """
        #self.assertNotEqual(parser.OpeningHours("Mo-Fr 06:00-02:00;Sa 06:00-23:00"), parser.OpeningHours("Mo-Fr 06:00-02:00,Sa 06:00-23:00")) # TODO ENABLE

    def test_day_ranges_may_be_split_in_pointless_parts(self):
        self.assert_same_hours("Mo-Fr 06:00-23:00,Sa 06:00-23:00", "Mo-Sa 06:00-23:00")

    def test_split_overnight_time_ranges(self):
        self.assert_same_hours(
            "Mo 20:00-02:00, Tu 08:00-16:00",
            "Mo 20:00-24:00; Tu 00:00-02:00, 08:00-16:00",
        )

    def test_day_range_vs_comma_separated_days(self):
        self.assert_same_hours(
            "Mo-Sa 06:00-23:00",
            "Mo 06:00-23:00,Tu 06:00-23:00, We 06:00-23:00, Th 06:00-23:00, Fr 06:00-23:00, Sa 06:00-23:00",
        )

    def test_semicolon_separated_day_ranges(self):
        self.assert_same_hours(
            "Mo 06:00-23:00;Tu 06:00-23:00;We 06:00-23:00;Th 06:00-23:00;Fr 06:00-23:00;Sa 06:00-23:00",
            "Mo-Sa 06:00-23:00",
        )

    def test_day_range_and_comma_separated_day_range(self):
        self.assert_same_hours("Mo-Fr 06:00-23:00;Sa 06:00-23:00", "Mo-Sa 06:00-23:00")

    def test_comma_separated_time_ranges_on_the_same_day(self):
        self.assert_same_hours(
            "Mo-Sa 06:00-10:00,Mo-Sa 16:00-20:00",
            "Mo-Sa 06:00-10:00,16:00-20:00",
        )

    @unittest.expectedFailure # TODO: implement
    def test_implicit_day_range(self):
        self.assert_same_hours("Mo-Su 06:00-10:00", "06:00-10:00")

    def test_twenty_four_seven(self):
        self.assert_same_hours("Mo-Su 00:00-24:00", "24/7")

    def test_different_day_groupings(self):
        self.assert_same_hours("Th-Fr 09:00-21:00", "Th,Fr 09:00-21:00")
        self.assert_same_hours(
            "Mo-We 09:00-17:30; Th-Fr 09:00-21:00; Sa 09:00-17:00; Su 10:00-17:00",
            "Mo-We 09:00-17:30; Th,Fr 09:00-21:00; Sa 09:00-17:00; Su 10:00-17:00",
        )

    @unittest.expectedFailure # TODO: implement
    def test_list_of_days_may_be_longer_than_two(self):
        self.assert_same_hours("Th-Sa 09:00-21:00", "Th,Fr,Sa 09:00-21:00")

    def test_noting_days_off_is_optional(self):
        self.assert_same_hours("Mo-Sa 06:00-23:00; Su off", "Mo-Sa 06:00-23:00")

    def test_two_ways_of_writting_off(self):
        self.assert_same_hours("Su off", "Su closed")

    def test_different_meaning_different_day_range(self):
        self.assert_different_hours("Mo-Sa 06:00-23:00", "Mo-Su 06:00-23:00")
# We 09:00-10:00,Tu 10:00-02:00
# https://openingh.openstreetmap.de/evaluation_tool/?EXP=We%2009%3A00-10%3A00%2CTu%2010%3A00-02%3A00&lat=48.7769&lon=9.1844&mode=0&DATE=1737977880000
# TODO
# We 09:00-09:00
# https://openingh.openstreetmap.de/evaluation_tool/?EXP=We%2009%3A00-09%3A00&lat=48.7769&lon=9.1844&mode=0&DATE=1737977880000
# TODO