
WIP from dying laptop, OSM-ATP comparison for recently edited OHs

This commit is contained in:
Mateusz Konieczny 2025-04-07 11:24:42 +02:00
parent 854a8c3002
commit 481fc44b9c
8 changed files with 602 additions and 12 deletions

0_config.py
View file

@@ -529,6 +529,10 @@ def planet_download_folder():
return cache_folder() + "planet_data/"
def changeset_list_download_folder():
return cache_folder() + "openstreetmap_all_changeset_data/"
def output_folder():
return os.getenv("OSM_ATM_MATCHER_OUTPUT_FOLDER") + "/"
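The new output_folder() concatenates the result of os.getenv() directly, so it fails with a TypeError when OSM_ATM_MATCHER_OUTPUT_FOLDER is unset. A minimal, more defensive sketch (same name and environment variable as in the diff; the error message is illustrative):

import os

def output_folder():
    root = os.getenv("OSM_ATM_MATCHER_OUTPUT_FOLDER")
    if root is None:
        raise Exception("set the OSM_ATM_MATCHER_OUTPUT_FOLDER environment variable")
    return root + "/"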

1_obtain_osm_data.py
View file

@@ -6,21 +6,16 @@ import os
import osm_bot_abstraction_layer.util_download_file
config = __import__("0_config")
def main():
print(datetime.now().isoformat(timespec='minutes'))
def download_file_via_torrent(torrent_file_url, torrent_file_directory, torrent_file_name, file_download_folder):
ses = libtorrent.session({'listen_interfaces': '0.0.0.0:6881'})
torrent_file_directory = config.cache_folder()
filename = 'planet-latest.osm.pbf.torrent'
if os.path.isfile(torrent_file_directory + filename):
os.remove(torrent_file_directory + filename)
print(torrent_file_directory + filename)
url = 'https://planet.openstreetmap.org/pbf/planet-latest.osm.pbf.torrent'
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(url, torrent_file_directory, filename)
if os.path.isfile(torrent_file_directory + torrent_file_name):
os.remove(torrent_file_directory + torrent_file_name)
print(torrent_file_directory + torrent_file_name)
osm_bot_abstraction_layer.util_download_file.download_file_if_not_present_already(torrent_file_url, torrent_file_directory, torrent_file_name)
info = libtorrent.torrent_info(torrent_file_directory + filename)
h = ses.add_torrent({'ti': info, 'save_path': config.planet_download_folder()})
info = libtorrent.torrent_info(torrent_file_directory + torrent_file_name)
h = ses.add_torrent({'ti': info, 'save_path': file_download_folder})
s = h.status()
print('starting', s.name)
@@ -41,6 +36,15 @@ def main():
time.sleep(1)
print(h.status().name, 'complete')
def main():
print(datetime.now().isoformat(timespec='minutes'))
download_file_via_torrent(
torrent_file_url='https://planet.openstreetmap.org/pbf/planet-latest.osm.pbf.torrent',
torrent_file_directory=config.cache_folder(),
torrent_file_name='planet-latest.osm.pbf.torrent',
file_download_folder=config.planet_download_folder()
)
print(datetime.now().isoformat(timespec='minutes'))
if __name__ == "__main__":
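With the download logic parameterized, the same helper can fetch other planet dumps; the changeset-comparison script added later in this commit calls it like this (usage copied from that file, with the modules imported the same way):

obtain_osm_data = __import__("1_obtain_osm_data")
config = __import__("0_config")

obtain_osm_data.download_file_via_torrent(
    torrent_file_url='https://planet.osm.org/planet/changesets-latest.osm.bz2.torrent',
    torrent_file_directory=config.changeset_list_download_folder(),
    torrent_file_name='changesets-latest.osm.bz2.torrent',
    file_download_folder=config.changeset_list_download_folder()
)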

changeset_parser_extracting_data_from_xml_line.php Normal file
View file

@@ -0,0 +1,85 @@
<?php
// obtained from https://github.com/matkoniecz/StreetComplete_usage_changeset_analysis
// assumptions:
// changesets in the dump are formatted as follows:
// either (1)
// the line begins with "<changeset" and ends with "/>"
// and it is a changeset without tags
// or (2)
// the line begins with '<changeset' and ends with '">',
// tags follow, one per line,
// and the changeset ends with a line containing '</changeset>' as its sole non-whitespace text
// these assumptions allow processing the file line by line, without a full XML parser
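// for illustration, a tagged changeset entry is assumed to look roughly like this
// (sample values; the real dump carries more attributes, which are ignored here):
// <changeset id="123" created_at="2025-01-01T00:00:00Z" uid="456" num_changes="2">
//   <tag k="created_by" v="StreetComplete 60.0"/>
//   <tag k="StreetComplete:quest_type" v="AddOpeningHours"/>
// </changeset>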
function value_of_key($line, $tag) {
$left_stripped = str_replace("<tag k=\"" . $tag . "\" v=\"", "", $line);
return str_replace('"/>', '', $left_stripped);
}
function quest_tag_to_identifier($line) {
return value_of_key($line, "StreetComplete:quest_type");
}
function created_by_tag_to_identifier($line) {
return value_of_key($line, "created_by");
}
// from https://www.php.net/manual/en/function.substr-compare.php
function str_begins($haystack, $needle) {
return 0 === substr_compare($haystack, $needle, 0, strlen($needle));
}
function str_ends($haystack, $needle) {
return 0 === substr_compare($haystack, $needle, -strlen($needle));
}
function contains_substr($mainStr, $str, $loc = false) {
if ($loc === false) return (strpos($mainStr, $str) !== false);
if (strlen($mainStr) < strlen($str)) return false;
if (($loc + strlen($str)) > strlen($mainStr)) return false;
return (strcmp(substr($mainStr, $loc, strlen($str)), $str) == 0);
}
function get_changes_number($changeset_header) {
if (preg_match("/num_changes=\"([0-9]+)\"/", $changeset_header, $matches)) {
return (int)$matches[1];
} else {
return 0;
}
}
function get_quest_type($tag_line) {
if (preg_match("/v=\"([^\"]+)\"/", $tag_line, $matches)) {
return $matches[1];
} else {
return NULL;
}
}
function get_changeset_id($changeset_header) {
if (preg_match("/ id=\"([0-9]+)\"/", $changeset_header, $matches)) {
return (int)$matches[1];
} else {
return -1;
}
}
function get_uid($changeset_header) {
if (preg_match("/ uid=\"([0-9]+)\"/", $changeset_header, $matches)) {
return (int)$matches[1];
} else {
return -1;
}
}
function get_changeset_creation_date($changeset_header) {
if (preg_match("/ created_at=\"([^\"]+)\"/", $changeset_header, $matches)) {
return $matches[1];
} else {
return -1;
}
}
?>

changeset_parser_streetcomplete_edits_generate_csv_and_make_quest_summary.php Normal file
View file

@@ -0,0 +1,112 @@
<?php
// obtained from https://github.com/matkoniecz/StreetComplete_usage_changeset_analysis
// special thanks to @Zverik for answering https://github.com/Zverik/editor-stats/issues/4
// without that answer I would not have expected processing this data to be feasible
// (the changeset planet file can be read line by line)!
// for the assumptions made about the file format that allow processing it
// without parsing it as XML, see the changeset_parser_extracting_data_from_xml_line.php file
require_once('changeset_parser_extracting_data_from_xml_line.php');
function main($input_filepath, $output_filepath) {
$file = new SplFileObject($input_filepath);
$outputFile = fopen($output_filepath, "w") or die("Unable to open file!");
fwrite($outputFile, "changeset_id" . "," . "editor" . "," . "changed_objects" . "," . "quest_type" . "," . "user_id" . "\n");
$popularity = array();
// based on https://stackoverflow.com/questions/13246597/how-to-read-a-large-file-line-by-line
// Loop until we reach the end of the file.
while (!$file->eof()) {
$line = trim($file->fgets());
if ($line == "</changeset>") {
#echo $line;
#echo "end of a changeset with tags\n\n";
$changeset_header = NULL;
} elseif (str_begins($line, "<changeset")) {
if(str_ends($line, '">')) {
#echo $line;
$changeset_header = $line;
#echo "new changeset, with tags\n\n";
} else {
#echo $line;
#echo "new changeset, without tags\n\n";
}
} else {
if(str_begins($line, '<tag k="created_by"')) {
if(contains_substr($line, "StreetComplete") || contains_substr($line, "zażółć")) {
#echo $changeset_header;
#echo "\n";
#echo $line;
#echo "\n";
#echo "created by tag\n";
}
} elseif (str_begins($line, '<tag k="StreetComplete:quest_type"') || str_begins($line, '<tag k="zażółć:quest_type"')) {
#echo $line;
#echo "\n";
#echo "quest type tag";
#echo get_changes_number($changeset_header);
#echo "\n";
if(str_begins($line, '<tag k="StreetComplete:quest_type"')){
$editor = "StreetComplete";
} elseif(str_begins($line, '<tag k="zażółć:quest_type"')){
$editor = "StreetComplete";
} else {
$editor = "?";
}
$id = get_changeset_id($changeset_header);
$count = get_changes_number($changeset_header);
$type = get_quest_type($line);
$uid = get_uid($changeset_header);
fwrite($outputFile, $id . "," . $editor . "," . $count . "," . $type . "," . $uid . "\n");
$popularity = register_popularity($popularity, $type, get_changes_number($changeset_header));
#var_dump($popularity);
#echo "\n\n";
}
}
}
arsort($popularity);
foreach ($popularity as $quest_identifier => $total_edits) {
echo "$quest_identifier : $total_edits\n";
}
echo("\n");
echo("\n");
echo("\n");
echo "| QuestCode | Total modified elements |\n";
echo "| ------------- |-------------|\n";
foreach ($popularity as $quest_identifier => $total_edits) {
echo "| $quest_identifier | $total_edits |\n";
}
echo("\n");
echo("\n");
echo("\n");
echo "| QuestCode | Total modified elements |\n";
echo "| ------------- |-------------|\n";
foreach ($popularity as $quest_identifier => $total_edits) {
if ($total_edits >= 4000) {
echo "| $quest_identifier | ". (int)($total_edits/1000) . "k |\n";
} else {
echo "| $quest_identifier | $total_edits |\n";
}
}
// Unset the file to call __destruct(), closing the file handle.
$file = null;
fclose($outputFile);
}
function register_popularity($dict, $index, $number) {
if (isset($dict[$index])) {
$dict[$index] += $number;
} else {
$dict[$index] = $number;
}
return $dict;
}
main($argv[1], $argv[2]);
?>
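The script takes the decompressed changeset dump and the output CSV as positional arguments, matching the command assembled in the Python code later in this commit (the paths below are examples):

php changeset_parser_streetcomplete_edits_generate_csv_and_make_quest_summary.php changesets-250324.osm streetcomplete_changesets.csv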

View file

@@ -0,0 +1,268 @@
import csv
from collections import deque
import os
import requests
import json
import diskcache
import rich
from osm_easy_api.api import Api
from osm_easy_api.data_classes import Node, Way, Relation, Changeset, OsmChange, Action, Tags
from osm_easy_api.api.endpoints import Elements_Container
import sqlite3
import sqlite_test
import osm_bot_abstraction_layer.util_download_file
import serializing
import bz2
obtain_osm_data = __import__("1_obtain_osm_data") # move torrent download code elsewhere?
config = __import__("0_config")
CHANGESET_CACHE = diskcache.Cache(config.cache_folder() + "osm_changeset_cache", eviction_policy="none")
HISTORY_CACHE = diskcache.Cache(config.cache_folder() + "osm_history_cache", eviction_policy="none")
def create_filtered_csv(input_filename, output_filename):
total_lines = 0
saved_lines = 0
with open(input_filename, mode='r', newline='', encoding='utf-8') as infile:
with open(output_filename, mode='w', newline='', encoding='utf-8') as outfile:
reader = csv.DictReader(infile)
writer = csv.writer(outfile)
# Write header to output file
writer.writerow(['changeset_id', 'editor', 'user_id'])
for row in reader:
total_lines += 1
if row['quest_type'] == 'AddOpeningHours':
# Write the selected columns to the output file
writer.writerow([
row['changeset_id'],
row['editor'],
row['user_id']
])
saved_lines += 1
print(f"Processed {total_lines} lines in total")
print(f"Saved {saved_lines} lines to {output_filename}")
def serialize_element_list(elements):
returned = []
for entry in elements:
returned.append(entry.to_dict())
return json.dumps(returned, default=str, indent=3)
def object_class_from_object_name(object_name):
if object_name == "Node":
return Node
elif object_name == "Way":
return Way
elif object_name == "Relation":
return Relation
else:
raise Exception("unexpected type " + object_name)
def deserialize_element_list(serialized):
returned = []
for entry in json.loads(serialized):
object_class = object_class_from_object_name(entry['type'])
returned.append(object_class.from_dict(entry))
return returned
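# note: serialize_element_list()/deserialize_element_list() round-trip through
# to_dict()/from_dict() of the osm_easy_api data classes; json.dumps(default=str)
# stringifies any values that are not JSON-serializable (such as timestamps)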
def elements_edited_by_changeset(api, changeset_id):
if changeset_id in CHANGESET_CACHE:
return deserialize_element_list(CHANGESET_CACHE[changeset_id])
element_list = download_elements_edited_by_changeset(api, changeset_id)
returned = serialize_element_list(element_list)
CHANGESET_CACHE[changeset_id] = returned
return element_list
def download_elements_edited_by_changeset(api, changeset_id):
print("downloading changeset", changeset_id)
element_list = []
for action in api.changeset.download(changeset_id):
if action[0] != Action.MODIFY and action[0] != Action.DELETE and action[0] != Action.CREATE:
print("unexpected action type", action)
raise Exception("unexpected action type " + str(action))
element = action[1]
element_list.append(element)
#rich.print(action)
return element_list
def history_info(api, object_type, object_id):
identifier = object_type + "_" + str(object_id)
if identifier in HISTORY_CACHE:
return deserialize_element_list(HISTORY_CACHE[identifier])
osm_type = object_class_from_object_name(object_type)
container = Elements_Container(api)
print("downloading history of", object_type, object_id)
history = container.history(osm_type, object_id)
serialized = serialize_element_list(history)
HISTORY_CACHE[identifier] = serialized
return deserialize_element_list(HISTORY_CACHE[identifier])
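# note: returning a freshly deserialized copy (rather than the in-memory history list)
# keeps the return type identical whether or not the cache was hit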
def obtain_changeset_listing():
"""
# obtain https://github.com/matkoniecz/StreetComplete_usage_changeset_analysis/blob/master/streetcomplete_edits_generate_csv_and_make_quest_summary.php
# obtain https://github.com/matkoniecz/StreetComplete_usage_changeset_analysis/blob/master/extracting_data_from_xml_line.php
# check hashes
# or just check in it?
# with verification for actual source?
# find code to download changesets via torrent in python
# is in 1_obtain_osm_data.py
if os.path.isdir(config.changeset_list_download_folder()) == False:
os.makedirs(config.changeset_list_download_folder())
torrent_file_url = 'https://planet.osm.org/planet/changesets-latest.osm.bz2.torrent'
torrent_file_name = 'changesets-latest.osm.bz2.torrent'
torrent_file_directory = config.changeset_list_download_folder()
file_download_folder = config.changeset_list_download_folder()
obtain_osm_data.download_file_via_torrent(torrent_file_url, torrent_file_directory, torrent_file_name, file_download_folder)
# based on https://stackoverflow.com/a/16964073/4130619
# https://docs.python.org/3/library/bz2.html has no such example
# TODO done?
# it unpacks to something like changesets-250324.osm.bz2
# so ideally uncompressed would be also like this
"""
# TODO: once the commented-out download code above is re-enabled,
# this fallback assignment can be removed
file_download_folder = config.changeset_list_download_folder()
file_with_bz2_changesets = None
for file_name in os.listdir(file_download_folder):
target = os.path.join(file_download_folder, file_name)
if os.path.isfile(target):
if target.endswith(".bz2"):
if file_with_bz2_changesets == None or file_with_bz2_changesets < file_name:
# relevant if multiple files become unpacked over time
# in such case we want latest one
file_with_bz2_changesets = file_name
print("will unpack", file_with_bz2_changesets)
filepath = os.path.join(file_download_folder, file_with_bz2_changesets)
decompressed_filepath = os.path.join(file_download_folder, file_with_bz2_changesets.replace('.bz2', ''))
print("unpacking to", decompressed_filepath)
with open(decompressed_filepath, 'wb') as new_file, bz2.BZ2File(filepath, 'rb') as file:
for data in iter(lambda : file.read(100 * 1024), b''):
new_file.write(data)
"""
import bz2
filepath = '/media/mateusz/OSM_cache/ATP_matcher_cache/openstreetmap_all_changeset_data/changesets-250324.osm.bz2'
newfilepath = '/media/mateusz/OSM_cache/ATP_matcher_cache/openstreetmap_all_changeset_data/changesets-250324.osm'
with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as file:
decompressor = BZ2Decompressor()
for data in iter(lambda : file.read(100 * 1024), b''):
new_file.write(decompressor.decompress(data))
with open(newfilepath, 'wb') as new_file, bz2.BZ2File(filepath, 'rb') as file:
for data in iter(lambda : file.read(100 * 1024), b''):
new_file.write(data)"""
# yay, now we also got PHP as dependency
# TODO https://github.com/matkoniecz/StreetComplete_usage_changeset_analysis/blob/master/streetcomplete_edits_generate_csv_and_make_quest_summary.php - modify it so output.csv is also specified via parameter
# /home/mateusz/Documents/install_moje/OSM_software/StreetComplete_usage_changeset_analysis
# decide on where output goes
# then pull this code into my repo
command = 'php changeset_parser_streetcomplete_edits_generate_csv_and_make_quest_summary.php "' + decompressed_filepath + '" "' + csv_with_streetcomplete_changesets() + '"'
print(command)
os.system(command)
def csv_with_streetcomplete_changesets():
return os.path.join(config.changeset_list_download_folder(), 'streetcomplete_changesets.csv')
def main():
input_file = csv_with_streetcomplete_changesets()
if os.path.isfile(input_file) == False:
obtain_changeset_listing()
filtered_filename = os.path.join(config.changeset_list_download_folder(), 'streetcomplete_changesets_only_opening_hours.csv')
filtering_success_filepath = '/media/mateusz/OSM_cache/changesets/filtering_marker.success'
if os.path.isfile(filtering_success_filepath) == False:
create_filtered_csv(input_file, filtered_filename)
with open(filtering_success_filepath, "w") as myfile:
myfile.write("data prepared")
# Use a deque with maxlen so that at most that many of the most recent lines are kept
recent_changesets = deque(maxlen=100_000)
with open(filtered_filename, mode='r', newline='', encoding='utf-8') as infile:
reader = csv.reader(infile)
next(reader) # Skip header
for line in reader:
recent_changesets.append(line)
my_changesets = []
other_changesets = []
my_user_id = "1722488"
for line in recent_changesets:
changeset_id = line[0]
editor_id = line[1]
user_id = line[2]
if user_id == my_user_id:
my_changesets.append(changeset_id)
else:
other_changesets.append(changeset_id)
api = Api(url='https://openstreetmap.org')
#rich.print(history_info(api, "Node", 1))
# TODO: the sqlite_test database code should probably live in serializing
connection = sqlite3.connect(sqlite_test.database_filepath())
cursor = connection.cursor()
#sqlite_test.show_content_sample(cursor)
print(sqlite_test.database_filepath())
sqlite_test.load_data_if_database_is_empty(cursor)
for changeset_id in my_changesets + other_changesets:
for element in elements_edited_by_changeset(api, changeset_id):
if "opening_hours" in element.tags:
if element.tags.get("opening_hours:signed") != "no":
osm_url = "https://www.openstreetmap.org/" + element.__class__.__name__.lower() + "/" + str(element.id)
cursor.execute("SELECT * FROM match_data WHERE osm_link = :osm_link ORDER BY match_distance ASC LIMIT 1000", {'osm_link': osm_url})
returned = cursor.fetchall()
if len(returned) == 0:
pass
elif len(returned) != 1:
print("found", len(returned), "matches in database")
for entry in returned:
parsed = serializing.Match.data_from_database_constructor(entry)
rich.print(parsed)
rich.print(parsed.atp_tags)
rich.print(parsed.osm_link)
else:
entry = returned[0]
parsed = serializing.Match.data_from_database_constructor(entry)
osm_opening_hours = element.tags["opening_hours"]
atp_opening_hours = parsed.atp_tags.get(config.opening_hours_key())
if osm_opening_hours == atp_opening_hours or atp_opening_hours == None:
print()
print()
print(osm_url)
rich.print("OSM", osm_opening_hours)
rich.print("ATP", atp_opening_hours)
else:
print()
print()
print(changeset_id)
rich.print(element)
print(osm_url)
rich.print("OSM", osm_opening_hours)
rich.print("ATP", atp_opening_hours)
if __name__ == "__main__":
main()

View file

@@ -11,3 +11,4 @@ python-dotenv
libtorrent
regex
simple_cache
osm_easy_api

serializing.py
View file

@@ -2,6 +2,7 @@ import base64
import json
import csv
import shared
import rich
class Match:
def __init__(self, atp_center, atp_tags, osm_match_center, osm_match_tags, osm_link, match_distance, all_very_good_matches):
@@ -22,6 +23,73 @@ class Match:
def link_to_point_in_osm(self):
return shared.link_to_point_in_osm(self.osm_match_center['lat'], self.osm_match_center['lon'])
def insert_into_sqlite_database(self, cursor):
osm_match_center_lat = None
osm_match_center_lon = None
if self.osm_match_center != None:
osm_match_center_lat = self.osm_match_center['lat']
osm_match_center_lon = self.osm_match_center['lon']
cursor.execute("INSERT INTO match_data VALUES (:atp_center_lat, :atp_center_lon, :atp_tags, :osm_match_center_lat, :osm_match_center_lon, :osm_match_tags, :osm_link, :match_distance, :all_very_good_matches)",
{
"atp_center_lat": self.atp_center['lat'],
"atp_center_lon": self.atp_center['lon'],
"atp_tags": json.dumps(self.atp_tags),
"osm_match_center_lat": osm_match_center_lat,
"osm_match_center_lon": osm_match_center_lon,
"osm_match_tags": json.dumps(self.osm_match_tags),
"osm_link": self.osm_link,
"match_distance": self.match_distance,
"all_very_good_matches": json.dumps(self.all_very_good_matches),
}
)
@staticmethod
def data_from_database_constructor(data):
atp_center_lat=data[0]
atp_center_lon=data[1]
atp_center = {'lat': atp_center_lat, 'lon': atp_center_lon}
atp_tags=json.loads(data[2])
osm_match_center_lat=data[3]
osm_match_center_lon=data[4]
osm_match_center = {'lat': osm_match_center_lat, 'lon': osm_match_center_lon}
osm_match_tags=json.loads(data[5])
osm_link=data[6]
match_distance=data[7]
all_very_good_matches=json.loads(data[8])
return Match(atp_center, atp_tags, osm_match_center, osm_match_tags, osm_link, match_distance, all_very_good_matches)
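# round-trip sketch (hypothetical usage): a row written by insert_into_sqlite_database()
# can be rebuilt with Match.data_from_database_constructor(cursor.fetchone())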
@staticmethod
def create_table_if_needed(cursor):
if "match_data" in Match.existing_tables(cursor):
print("osm_data table exists already, delete file with database to recreate")
else:
"""
self.atp_center = atp_center
self.atp_tags = atp_tags
self.osm_match_center = osm_match_center
self.osm_match_tags = osm_match_tags
self.osm_link = osm_link
self.match_distance = match_distance
self.all_very_good_matches = all_very_good_matches
"""
cursor.execute('''CREATE TABLE match_data
(atp_center_lat float, atp_center_lon float, atp_tags text, osm_match_center_lat float, osm_match_center_lon float, osm_match_tags text, osm_link text, match_distance float, all_very_good_matches text)''')
# magnificent speedup
#cursor.execute("""CREATE INDEX idx_osm_data_area_identifier ON osm_data (area_identifier);""")
#cursor.execute("""CREATE INDEX idx_osm_data_id_type ON osm_data (id, type);""")
#cursor.execute("""CREATE INDEX idx_error_id ON osm_data (error_id);""")
#cursor.execute("""CREATE INDEX idx_download_timestamp ON osm_data (download_timestamp);""")
@staticmethod
def existing_tables(cursor):
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_listing = cursor.fetchall()
returned = []
for entry in table_listing:
returned.append(entry[0])
return returned
def save_list_of_matches_to_csv(filepath, data):
with open(filepath, 'w', newline='') as f:
writer = csv.writer(f)

48
sqlite_test.py Normal file
View file

@@ -0,0 +1,48 @@
import sqlite3
import serializing
import data_iterator
graticule_report = __import__("5_generate_graticule_reports")
def load_data_if_database_is_empty(cursor):
serializing.Match.create_table_if_needed(cursor)
# TODO: load from entire area, not only minuscule data from Kraków (change also database_filepath()) - maybe change both now to take from one function?
# TODO: add indexes (search 'magnificent speedup')
cursor.execute("SELECT * FROM match_data ORDER BY match_distance ASC LIMIT 1000")
returned = cursor.fetchall()
print(len(returned), "in database")
if len(returned) == 0:
print("database is empty, inserting entries")
#dummy = serializing.Match(atp_center={'lat': 0, 'lon': 0}, atp_tags={'a': 'b'}, osm_match_center={'lat': 0, 'lon': 0}, osm_match_tags={'v': 'd'}, osm_link='https://osm/org', match_distance=10, all_very_good_matches="all_very_good_matches")
#dummy.insert_into_sqlite_database(cursor)
area = {'min_lat': 50, 'min_lon': 20, 'max_lat': 51, 'max_lon': 21} # Kraków
for entry in data_iterator.iterate_over_all_matches(area):
entry.insert_into_sqlite_database(cursor)
def show_content_sample(cursor):
cursor.execute(
"SELECT * FROM match_data WHERE match_distance <= :max_allowed_distance ORDER BY match_distance DESC LIMIT 1000",
{"max_allowed_distance": 100})
returned = cursor.fetchall()
print(len(returned), "in database")
for entry in returned:
print(entry)
if len(returned) == 0:
print("empty :(")
def main():
connection = sqlite3.connect(database_filepath())
cursor = connection.cursor()
print(database_filepath())
load_data_if_database_is_empty(cursor)
show_content_sample(cursor)
connection.commit()
connection.close()
def database_filepath():
area = graticule_report.global_graticule_coverage()
area = {'min_lat': 50, 'min_lon': 20, 'max_lat': 51, 'max_lon': 21} # Kraków
return graticule_report.graticule_cache(area) + "test_database.db"
if __name__ == "__main__":
main()